mirror of
https://github.com/Cinnamon/kotaemon.git
synced 2026-02-23 19:49:37 +01:00
fix: update some default settings
This commit is contained in:
2
.env
2
.env
@@ -1,7 +1,7 @@
|
||||
# settings for OpenAI
|
||||
OPENAI_API_BASE=https://api.openai.com/v1
|
||||
OPENAI_API_KEY=
|
||||
OPENAI_CHAT_MODEL=gpt-3.5-turbo
|
||||
OPENAI_CHAT_MODEL=gpt-4o
|
||||
OPENAI_EMBEDDINGS_MODEL=text-embedding-ada-002
|
||||
|
||||
# settings for Azure OpenAI
|
||||
|
||||
@@ -33,8 +33,21 @@ KH_APP_DATA_DIR.mkdir(parents=True, exist_ok=True)
|
||||
KH_USER_DATA_DIR = KH_APP_DATA_DIR / "user_data"
|
||||
KH_USER_DATA_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# doc directory
|
||||
KH_DOC_DIR = this_dir / "docs"
|
||||
# markdown output directory
|
||||
KH_MARKDOWN_OUTPUT_DIR = KH_APP_DATA_DIR / "markdown_cache_dir"
|
||||
KH_MARKDOWN_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# chunks output directory
|
||||
KH_CHUNKS_OUTPUT_DIR = KH_APP_DATA_DIR / "chunks_cache_dir"
|
||||
KH_CHUNKS_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# zip output directory
|
||||
KH_ZIP_OUTPUT_DIR = KH_APP_DATA_DIR / "zip_cache_dir"
|
||||
KH_ZIP_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# zip input directory
|
||||
KH_ZIP_INPUT_DIR = KH_APP_DATA_DIR / "zip_cache_dir_in"
|
||||
KH_ZIP_INPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# HF models can be big, let's store them in the app data directory so that it's easier
|
||||
# for users to manage their storage.
|
||||
@@ -42,14 +55,17 @@ KH_DOC_DIR = this_dir / "docs"
|
||||
os.environ["HF_HOME"] = str(KH_APP_DATA_DIR / "huggingface")
|
||||
os.environ["HF_HUB_CACHE"] = str(KH_APP_DATA_DIR / "huggingface")
|
||||
|
||||
# doc directory
|
||||
KH_DOC_DIR = this_dir / "docs"
|
||||
|
||||
COHERE_API_KEY = config("COHERE_API_KEY", default="")
|
||||
KH_MODE = "dev"
|
||||
KH_FEATURE_USER_MANAGEMENT = False
|
||||
KH_FEATURE_USER_MANAGEMENT = True
|
||||
KH_FEATURE_USER_MANAGEMENT_ADMIN = str(
|
||||
config("KH_FEATURE_USER_MANAGEMENT_ADMIN", default="admin")
|
||||
)
|
||||
KH_FEATURE_USER_MANAGEMENT_PASSWORD = str(
|
||||
config("KH_FEATURE_USER_MANAGEMENT_PASSWORD", default="XsdMbe8zKP8KdeE@")
|
||||
config("KH_FEATURE_USER_MANAGEMENT_PASSWORD", default="admin")
|
||||
)
|
||||
KH_ENABLE_ALEMBIC = False
|
||||
KH_DATABASE = f"sqlite:///{KH_USER_DATA_DIR / 'sql.db'}"
|
||||
@@ -83,8 +99,6 @@ if config("AZURE_OPENAI_API_KEY", default="") and config(
|
||||
"timeout": 20,
|
||||
},
|
||||
"default": False,
|
||||
"accuracy": 5,
|
||||
"cost": 5,
|
||||
}
|
||||
if config("AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT", default=""):
|
||||
KH_EMBEDDINGS["azure"] = {
|
||||
@@ -111,25 +125,22 @@ if config("OPENAI_API_KEY", default=""):
|
||||
or "https://api.openai.com/v1",
|
||||
"api_key": config("OPENAI_API_KEY", default=""),
|
||||
"model": config("OPENAI_CHAT_MODEL", default="") or "gpt-3.5-turbo",
|
||||
"timeout": 20,
|
||||
},
|
||||
"default": True,
|
||||
}
|
||||
KH_EMBEDDINGS["openai"] = {
|
||||
"spec": {
|
||||
"__type__": "kotaemon.embeddings.OpenAIEmbeddings",
|
||||
"base_url": config("OPENAI_API_BASE", default="")
|
||||
or "https://api.openai.com/v1",
|
||||
"api_key": config("OPENAI_API_KEY", default=""),
|
||||
"model": config("OPENAI_EMBEDDINGS_MODEL", default="text-embedding-ada-002")
|
||||
or "text-embedding-ada-002",
|
||||
"timeout": 10,
|
||||
},
|
||||
"default": False,
|
||||
"default": True,
|
||||
}
|
||||
if len(KH_EMBEDDINGS) < 1:
|
||||
KH_EMBEDDINGS["openai"] = {
|
||||
"spec": {
|
||||
"__type__": "kotaemon.embeddings.OpenAIEmbeddings",
|
||||
"base_url": config("OPENAI_API_BASE", default="")
|
||||
or "https://api.openai.com/v1",
|
||||
"api_key": config("OPENAI_API_KEY", default=""),
|
||||
"model": config(
|
||||
"OPENAI_EMBEDDINGS_MODEL", default="text-embedding-ada-002"
|
||||
)
|
||||
or "text-embedding-ada-002",
|
||||
"timeout": 10,
|
||||
},
|
||||
"default": False,
|
||||
}
|
||||
|
||||
if config("LOCAL_MODEL", default=""):
|
||||
KH_LLMS["local"] = {
|
||||
@@ -159,7 +170,12 @@ if len(KH_EMBEDDINGS) < 1:
|
||||
"default": True,
|
||||
}
|
||||
|
||||
KH_REASONINGS = ["ktem.reasoning.simple.FullQAPipeline"]
|
||||
KH_REASONINGS = [
|
||||
"ktem.reasoning.simple.FullQAPipeline",
|
||||
"ktem.reasoning.simple.FullDecomposeQAPipeline",
|
||||
"ktem.reasoning.react.ReactAgentPipeline",
|
||||
"ktem.reasoning.rewoo.RewooAgentPipeline",
|
||||
]
|
||||
KH_VLM_ENDPOINT = "{0}/openai/deployments/{1}/chat/completions?api-version={2}".format(
|
||||
config("AZURE_OPENAI_ENDPOINT", default=""),
|
||||
config("OPENAI_VISION_DEPLOYMENT_NAME", default="gpt-4-vision"),
|
||||
@@ -169,7 +185,7 @@ KH_VLM_ENDPOINT = "{0}/openai/deployments/{1}/chat/completions?api-version={2}".
|
||||
|
||||
SETTINGS_APP = {
|
||||
"lang": {
|
||||
"name": "Language",
|
||||
"name": "UI Language",
|
||||
"value": "en",
|
||||
"choices": [("English", "en"), ("Japanese", "ja")],
|
||||
"component": "dropdown",
|
||||
@@ -197,7 +213,12 @@ KH_INDEX_TYPES = ["ktem.index.file.FileIndex"]
|
||||
KH_INDICES = [
|
||||
{
|
||||
"name": "File",
|
||||
"config": {},
|
||||
"config": {
|
||||
"supported_file_types": (
|
||||
".pdf, .xls, .xlsx, .doc, .docx, " ".pptx, .csv, .html, .txt"
|
||||
),
|
||||
"private": False,
|
||||
},
|
||||
"index_type": "ktem.index.file.FileIndex",
|
||||
},
|
||||
]
|
||||
|
||||
@@ -21,6 +21,10 @@ class CohereReranking(BaseReranking):
|
||||
"Please install Cohere " "`pip install cohere` to use Cohere Reranking"
|
||||
)
|
||||
|
||||
if not self.cohere_api_key:
|
||||
print("Cohere API key not found. Skipping reranking.")
|
||||
return documents
|
||||
|
||||
cohere_client = cohere.Client(self.cohere_api_key)
|
||||
compressed_docs: list[Document] = []
|
||||
|
||||
|
||||
@@ -4,7 +4,6 @@ from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import fitz
|
||||
from PIL import Image
|
||||
|
||||
from kotaemon.base import Document, Param
|
||||
@@ -29,6 +28,11 @@ def crop_image(file_path: Path, bbox: list[float], page_number: int = 0) -> Imag
|
||||
img: Image.Image
|
||||
suffix = file_path.suffix.lower()
|
||||
if suffix == ".pdf":
|
||||
try:
|
||||
import fitz
|
||||
except ImportError:
|
||||
raise ImportError("Please install PyMuPDF: 'pip install PyMuPDF'")
|
||||
|
||||
doc = fitz.open(file_path)
|
||||
page = doc.load_page(page_number)
|
||||
pm = page.get_pixmap(dpi=150)
|
||||
|
||||
@@ -218,7 +218,7 @@ class DocumentRetrievalPipeline(BaseFileIndexRetriever):
|
||||
},
|
||||
"prioritize_table": {
|
||||
"name": "Prioritize table",
|
||||
"value": True,
|
||||
"value": False,
|
||||
"choices": [True, False],
|
||||
"component": "checkbox",
|
||||
},
|
||||
|
||||
@@ -64,7 +64,7 @@ def find_text(search_span, context):
|
||||
match = SequenceMatcher(
|
||||
None, sentence, context, autojunk=False
|
||||
).find_longest_match()
|
||||
if match.size > len(search_span) * 0.6:
|
||||
if match.size > len(sentence) * 0.6:
|
||||
matches.append((match.b, match.b + match.size))
|
||||
|
||||
return matches
|
||||
@@ -879,7 +879,7 @@ class FullQAPipeline(BaseReasoning):
|
||||
},
|
||||
"highlight_citation": {
|
||||
"name": "Highlight Citation",
|
||||
"value": False,
|
||||
"value": True,
|
||||
"component": "checkbox",
|
||||
},
|
||||
"system_prompt": {
|
||||
|
||||
Reference in New Issue
Block a user