diff --git a/.env b/.env index e033553a..afe170ce 100644 --- a/.env +++ b/.env @@ -1,7 +1,7 @@ # settings for OpenAI OPENAI_API_BASE=https://api.openai.com/v1 OPENAI_API_KEY= -OPENAI_CHAT_MODEL=gpt-3.5-turbo +OPENAI_CHAT_MODEL=gpt-4o OPENAI_EMBEDDINGS_MODEL=text-embedding-ada-002 # settings for Azure OpenAI diff --git a/flowsettings.py b/flowsettings.py index 8d7f3cc7..ad213927 100644 --- a/flowsettings.py +++ b/flowsettings.py @@ -33,8 +33,21 @@ KH_APP_DATA_DIR.mkdir(parents=True, exist_ok=True) KH_USER_DATA_DIR = KH_APP_DATA_DIR / "user_data" KH_USER_DATA_DIR.mkdir(parents=True, exist_ok=True) -# doc directory -KH_DOC_DIR = this_dir / "docs" +# markdown output directory +KH_MARKDOWN_OUTPUT_DIR = KH_APP_DATA_DIR / "markdown_cache_dir" +KH_MARKDOWN_OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + +# chunks output directory +KH_CHUNKS_OUTPUT_DIR = KH_APP_DATA_DIR / "chunks_cache_dir" +KH_CHUNKS_OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + +# zip output directory +KH_ZIP_OUTPUT_DIR = KH_APP_DATA_DIR / "zip_cache_dir" +KH_ZIP_OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + +# zip input directory +KH_ZIP_INPUT_DIR = KH_APP_DATA_DIR / "zip_cache_dir_in" +KH_ZIP_INPUT_DIR.mkdir(parents=True, exist_ok=True) # HF models can be big, let's store them in the app data directory so that it's easier # for users to manage their storage. 
@@ -42,14 +55,17 @@ KH_DOC_DIR = this_dir / "docs" os.environ["HF_HOME"] = str(KH_APP_DATA_DIR / "huggingface") os.environ["HF_HUB_CACHE"] = str(KH_APP_DATA_DIR / "huggingface") +# doc directory +KH_DOC_DIR = this_dir / "docs" + COHERE_API_KEY = config("COHERE_API_KEY", default="") KH_MODE = "dev" -KH_FEATURE_USER_MANAGEMENT = False +KH_FEATURE_USER_MANAGEMENT = True KH_FEATURE_USER_MANAGEMENT_ADMIN = str( config("KH_FEATURE_USER_MANAGEMENT_ADMIN", default="admin") ) KH_FEATURE_USER_MANAGEMENT_PASSWORD = str( - config("KH_FEATURE_USER_MANAGEMENT_PASSWORD", default="XsdMbe8zKP8KdeE@") + config("KH_FEATURE_USER_MANAGEMENT_PASSWORD", default="admin") ) KH_ENABLE_ALEMBIC = False KH_DATABASE = f"sqlite:///{KH_USER_DATA_DIR / 'sql.db'}" @@ -83,8 +99,6 @@ if config("AZURE_OPENAI_API_KEY", default="") and config( "timeout": 20, }, "default": False, - "accuracy": 5, - "cost": 5, } if config("AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT", default=""): KH_EMBEDDINGS["azure"] = { @@ -111,25 +125,22 @@ if config("OPENAI_API_KEY", default=""): or "https://api.openai.com/v1", "api_key": config("OPENAI_API_KEY", default=""), "model": config("OPENAI_CHAT_MODEL", default="") or "gpt-3.5-turbo", + "timeout": 20, + }, + "default": True, + } + KH_EMBEDDINGS["openai"] = { + "spec": { + "__type__": "kotaemon.embeddings.OpenAIEmbeddings", + "base_url": config("OPENAI_API_BASE", default="") + or "https://api.openai.com/v1", + "api_key": config("OPENAI_API_KEY", default=""), + "model": config("OPENAI_EMBEDDINGS_MODEL", default="text-embedding-ada-002") + or "text-embedding-ada-002", "timeout": 10, }, - "default": False, + "default": True, } - if len(KH_EMBEDDINGS) < 1: - KH_EMBEDDINGS["openai"] = { - "spec": { - "__type__": "kotaemon.embeddings.OpenAIEmbeddings", - "base_url": config("OPENAI_API_BASE", default="") - or "https://api.openai.com/v1", - "api_key": config("OPENAI_API_KEY", default=""), - "model": config( - "OPENAI_EMBEDDINGS_MODEL", default="text-embedding-ada-002" - ) - or 
"text-embedding-ada-002", - "timeout": 10, - }, - "default": False, - } if config("LOCAL_MODEL", default=""): KH_LLMS["local"] = { @@ -159,7 +170,12 @@ if len(KH_EMBEDDINGS) < 1: "default": True, } -KH_REASONINGS = ["ktem.reasoning.simple.FullQAPipeline"] +KH_REASONINGS = [ + "ktem.reasoning.simple.FullQAPipeline", + "ktem.reasoning.simple.FullDecomposeQAPipeline", + "ktem.reasoning.react.ReactAgentPipeline", + "ktem.reasoning.rewoo.RewooAgentPipeline", +] KH_VLM_ENDPOINT = "{0}/openai/deployments/{1}/chat/completions?api-version={2}".format( config("AZURE_OPENAI_ENDPOINT", default=""), config("OPENAI_VISION_DEPLOYMENT_NAME", default="gpt-4-vision"), @@ -169,7 +185,7 @@ KH_VLM_ENDPOINT = "{0}/openai/deployments/{1}/chat/completions?api-version={2}". SETTINGS_APP = { "lang": { - "name": "Language", + "name": "UI Language", "value": "en", "choices": [("English", "en"), ("Japanese", "ja")], "component": "dropdown", @@ -197,7 +213,12 @@ KH_INDEX_TYPES = ["ktem.index.file.FileIndex"] KH_INDICES = [ { "name": "File", - "config": {}, + "config": { + "supported_file_types": ( + ".pdf, .xls, .xlsx, .doc, .docx, " ".pptx, .csv, .html, .txt" + ), + "private": False, + }, "index_type": "ktem.index.file.FileIndex", }, ] diff --git a/libs/kotaemon/kotaemon/indices/rankings/cohere.py b/libs/kotaemon/kotaemon/indices/rankings/cohere.py index 4244b58e..e759d6cc 100644 --- a/libs/kotaemon/kotaemon/indices/rankings/cohere.py +++ b/libs/kotaemon/kotaemon/indices/rankings/cohere.py @@ -21,6 +21,10 @@ class CohereReranking(BaseReranking): "Please install Cohere " "`pip install cohere` to use Cohere Reranking" ) + if not self.cohere_api_key: + print("Cohere API key not found. 
Skipping reranking.") + return documents + cohere_client = cohere.Client(self.cohere_api_key) compressed_docs: list[Document] = [] diff --git a/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py b/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py index f4123d3f..1e1d9020 100644 --- a/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py +++ b/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py @@ -4,7 +4,6 @@ from io import BytesIO from pathlib import Path from typing import Optional -import fitz from PIL import Image from kotaemon.base import Document, Param @@ -29,6 +28,11 @@ def crop_image(file_path: Path, bbox: list[float], page_number: int = 0) -> Imag img: Image.Image suffix = file_path.suffix.lower() if suffix == ".pdf": + try: + import fitz + except ImportError: + raise ImportError("Please install PyMuPDF: 'pip install PyMuPDF'") + doc = fitz.open(file_path) page = doc.load_page(page_number) pm = page.get_pixmap(dpi=150) diff --git a/libs/ktem/ktem/index/file/pipelines.py b/libs/ktem/ktem/index/file/pipelines.py index 567f4ff9..c4e86430 100644 --- a/libs/ktem/ktem/index/file/pipelines.py +++ b/libs/ktem/ktem/index/file/pipelines.py @@ -218,7 +218,7 @@ class DocumentRetrievalPipeline(BaseFileIndexRetriever): }, "prioritize_table": { "name": "Prioritize table", - "value": True, + "value": False, "choices": [True, False], "component": "checkbox", }, diff --git a/libs/ktem/ktem/reasoning/simple.py b/libs/ktem/ktem/reasoning/simple.py index 5984df9e..f3b47138 100644 --- a/libs/ktem/ktem/reasoning/simple.py +++ b/libs/ktem/ktem/reasoning/simple.py @@ -64,7 +64,7 @@ def find_text(search_span, context): match = SequenceMatcher( None, sentence, context, autojunk=False ).find_longest_match() - if match.size > len(search_span) * 0.6: + if match.size > len(sentence) * 0.6: matches.append((match.b, match.b + match.size)) return matches @@ -879,7 +879,7 @@ class FullQAPipeline(BaseReasoning): }, 
"highlight_citation": { "name": "Highlight Citation", - "value": False, + "value": True, "component": "checkbox", }, "system_prompt": {