fix: update some default settings

This commit is contained in:
taprosoft
2024-07-17 15:47:37 +00:00
parent df1f738ab7
commit 4c277d8a85
6 changed files with 59 additions and 30 deletions

2
.env
View File

@@ -1,7 +1,7 @@
# settings for OpenAI
OPENAI_API_BASE=https://api.openai.com/v1
OPENAI_API_KEY=
OPENAI_CHAT_MODEL=gpt-3.5-turbo
OPENAI_CHAT_MODEL=gpt-4o
OPENAI_EMBEDDINGS_MODEL=text-embedding-ada-002
# settings for Azure OpenAI

View File

@@ -33,8 +33,21 @@ KH_APP_DATA_DIR.mkdir(parents=True, exist_ok=True)
KH_USER_DATA_DIR = KH_APP_DATA_DIR / "user_data"
KH_USER_DATA_DIR.mkdir(parents=True, exist_ok=True)
# doc directory
KH_DOC_DIR = this_dir / "docs"
# markdown output directory
KH_MARKDOWN_OUTPUT_DIR = KH_APP_DATA_DIR / "markdown_cache_dir"
KH_MARKDOWN_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# chunks output directory
KH_CHUNKS_OUTPUT_DIR = KH_APP_DATA_DIR / "chunks_cache_dir"
KH_CHUNKS_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# zip output directory
KH_ZIP_OUTPUT_DIR = KH_APP_DATA_DIR / "zip_cache_dir"
KH_ZIP_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# zip input directory
KH_ZIP_INPUT_DIR = KH_APP_DATA_DIR / "zip_cache_dir_in"
KH_ZIP_INPUT_DIR.mkdir(parents=True, exist_ok=True)
# HF models can be big, let's store them in the app data directory so that it's easier
# for users to manage their storage.
@@ -42,14 +55,17 @@ KH_DOC_DIR = this_dir / "docs"
os.environ["HF_HOME"] = str(KH_APP_DATA_DIR / "huggingface")
os.environ["HF_HUB_CACHE"] = str(KH_APP_DATA_DIR / "huggingface")
# doc directory
KH_DOC_DIR = this_dir / "docs"
COHERE_API_KEY = config("COHERE_API_KEY", default="")
KH_MODE = "dev"
KH_FEATURE_USER_MANAGEMENT = False
KH_FEATURE_USER_MANAGEMENT = True
KH_FEATURE_USER_MANAGEMENT_ADMIN = str(
config("KH_FEATURE_USER_MANAGEMENT_ADMIN", default="admin")
)
KH_FEATURE_USER_MANAGEMENT_PASSWORD = str(
config("KH_FEATURE_USER_MANAGEMENT_PASSWORD", default="XsdMbe8zKP8KdeE@")
config("KH_FEATURE_USER_MANAGEMENT_PASSWORD", default="admin")
)
KH_ENABLE_ALEMBIC = False
KH_DATABASE = f"sqlite:///{KH_USER_DATA_DIR / 'sql.db'}"
@@ -83,8 +99,6 @@ if config("AZURE_OPENAI_API_KEY", default="") and config(
"timeout": 20,
},
"default": False,
"accuracy": 5,
"cost": 5,
}
if config("AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT", default=""):
KH_EMBEDDINGS["azure"] = {
@@ -111,25 +125,22 @@ if config("OPENAI_API_KEY", default=""):
or "https://api.openai.com/v1",
"api_key": config("OPENAI_API_KEY", default=""),
"model": config("OPENAI_CHAT_MODEL", default="") or "gpt-3.5-turbo",
"timeout": 20,
},
"default": True,
}
KH_EMBEDDINGS["openai"] = {
"spec": {
"__type__": "kotaemon.embeddings.OpenAIEmbeddings",
"base_url": config("OPENAI_API_BASE", default="")
or "https://api.openai.com/v1",
"api_key": config("OPENAI_API_KEY", default=""),
"model": config("OPENAI_EMBEDDINGS_MODEL", default="text-embedding-ada-002")
or "text-embedding-ada-002",
"timeout": 10,
},
"default": False,
"default": True,
}
if len(KH_EMBEDDINGS) < 1:
KH_EMBEDDINGS["openai"] = {
"spec": {
"__type__": "kotaemon.embeddings.OpenAIEmbeddings",
"base_url": config("OPENAI_API_BASE", default="")
or "https://api.openai.com/v1",
"api_key": config("OPENAI_API_KEY", default=""),
"model": config(
"OPENAI_EMBEDDINGS_MODEL", default="text-embedding-ada-002"
)
or "text-embedding-ada-002",
"timeout": 10,
},
"default": False,
}
if config("LOCAL_MODEL", default=""):
KH_LLMS["local"] = {
@@ -159,7 +170,12 @@ if len(KH_EMBEDDINGS) < 1:
"default": True,
}
KH_REASONINGS = ["ktem.reasoning.simple.FullQAPipeline"]
KH_REASONINGS = [
"ktem.reasoning.simple.FullQAPipeline",
"ktem.reasoning.simple.FullDecomposeQAPipeline",
"ktem.reasoning.react.ReactAgentPipeline",
"ktem.reasoning.rewoo.RewooAgentPipeline",
]
KH_VLM_ENDPOINT = "{0}/openai/deployments/{1}/chat/completions?api-version={2}".format(
config("AZURE_OPENAI_ENDPOINT", default=""),
config("OPENAI_VISION_DEPLOYMENT_NAME", default="gpt-4-vision"),
@@ -169,7 +185,7 @@ KH_VLM_ENDPOINT = "{0}/openai/deployments/{1}/chat/completions?api-version={2}".
SETTINGS_APP = {
"lang": {
"name": "Language",
"name": "UI Language",
"value": "en",
"choices": [("English", "en"), ("Japanese", "ja")],
"component": "dropdown",
@@ -197,7 +213,12 @@ KH_INDEX_TYPES = ["ktem.index.file.FileIndex"]
KH_INDICES = [
{
"name": "File",
"config": {},
"config": {
"supported_file_types": (
".pdf, .xls, .xlsx, .doc, .docx, " ".pptx, .csv, .html, .txt"
),
"private": False,
},
"index_type": "ktem.index.file.FileIndex",
},
]

View File

@@ -21,6 +21,10 @@ class CohereReranking(BaseReranking):
"Please install Cohere " "`pip install cohere` to use Cohere Reranking"
)
if not self.cohere_api_key:
print("Cohere API key not found. Skipping reranking.")
return documents
cohere_client = cohere.Client(self.cohere_api_key)
compressed_docs: list[Document] = []

View File

@@ -4,7 +4,6 @@ from io import BytesIO
from pathlib import Path
from typing import Optional
import fitz
from PIL import Image
from kotaemon.base import Document, Param
@@ -29,6 +28,11 @@ def crop_image(file_path: Path, bbox: list[float], page_number: int = 0) -> Imag
img: Image.Image
suffix = file_path.suffix.lower()
if suffix == ".pdf":
try:
import fitz
except ImportError:
raise ImportError("Please install PyMuPDF: 'pip install PyMuPDF'")
doc = fitz.open(file_path)
page = doc.load_page(page_number)
pm = page.get_pixmap(dpi=150)

View File

@@ -218,7 +218,7 @@ class DocumentRetrievalPipeline(BaseFileIndexRetriever):
},
"prioritize_table": {
"name": "Prioritize table",
"value": True,
"value": False,
"choices": [True, False],
"component": "checkbox",
},

View File

@@ -64,7 +64,7 @@ def find_text(search_span, context):
match = SequenceMatcher(
None, sentence, context, autojunk=False
).find_longest_match()
if match.size > len(search_span) * 0.6:
if match.size > len(sentence) * 0.6:
matches.append((match.b, match.b + match.size))
return matches
@@ -879,7 +879,7 @@ class FullQAPipeline(BaseReasoning):
},
"highlight_citation": {
"name": "Highlight Citation",
"value": False,
"value": True,
"component": "checkbox",
},
"system_prompt": {