enh: RAG full context mode

2025-12-15 19:37:47 +01:00 · 2025-02-18 21:14:58 -08:00
parent a6a7c548d5
commit 81715f6553
6 changed files with 127 additions and 32 deletions
--- a/backend/open_webui/config.py
+++ b/backend/open_webui/config.py
@@ -1578,6 +1578,12 @@ ENABLE_RAG_HYBRID_SEARCH = PersistentConfig(
    os.environ.get("ENABLE_RAG_HYBRID_SEARCH", "").lower() == "true",
 )

+RAG_FULL_CONTEXT = PersistentConfig(
+    "RAG_FULL_CONTEXT",
+    "rag.full_context",
+    os.getenv("RAG_FULL_CONTEXT", "False").lower() == "true",
+)
+
 RAG_FILE_MAX_COUNT = PersistentConfig(
    "RAG_FILE_MAX_COUNT",
    "rag.file.max_count",
@@ -1929,7 +1935,7 @@ RAG_WEB_SEARCH_CONCURRENT_REQUESTS = PersistentConfig(
 RAG_WEB_LOADER_ENGINE = PersistentConfig(
    "RAG_WEB_LOADER_ENGINE",
    "rag.web.loader.engine",
-    os.environ.get("RAG_WEB_LOADER_ENGINE", "safe_web")
+    os.environ.get("RAG_WEB_LOADER_ENGINE", "safe_web"),
 )

 RAG_WEB_SEARCH_TRUST_ENV = PersistentConfig(
@@ -1941,7 +1947,7 @@ RAG_WEB_SEARCH_TRUST_ENV = PersistentConfig(
 PLAYWRIGHT_WS_URI = PersistentConfig(
    "PLAYWRIGHT_WS_URI",
    "rag.web.loader.engine.playwright.ws.uri",
-    os.environ.get("PLAYWRIGHT_WS_URI", None)
+    os.environ.get("PLAYWRIGHT_WS_URI", None),
 )

 ####################################
--- a/backend/open_webui/main.py
+++ b/backend/open_webui/main.py
@@ -156,6 +156,7 @@ from open_webui.config import (
    # Retrieval
    RAG_TEMPLATE,
    DEFAULT_RAG_TEMPLATE,
+    RAG_FULL_CONTEXT,
    RAG_EMBEDDING_MODEL,
    RAG_EMBEDDING_MODEL_AUTO_UPDATE,
    RAG_EMBEDDING_MODEL_TRUST_REMOTE_CODE,
@@ -519,6 +520,8 @@ app.state.config.RELEVANCE_THRESHOLD = RAG_RELEVANCE_THRESHOLD
 app.state.config.FILE_MAX_SIZE = RAG_FILE_MAX_SIZE
 app.state.config.FILE_MAX_COUNT = RAG_FILE_MAX_COUNT

+
+app.state.config.RAG_FULL_CONTEXT = RAG_FULL_CONTEXT
 app.state.config.ENABLE_RAG_HYBRID_SEARCH = ENABLE_RAG_HYBRID_SEARCH
 app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = (
    ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION
--- a/backend/open_webui/retrieval/utils.py
+++ b/backend/open_webui/retrieval/utils.py
@@ -84,6 +84,19 @@ def query_doc(
        raise e


+def get_doc(collection_name: str, user: UserModel = None):
+    try:
+        result = VECTOR_DB_CLIENT.get(collection_name=collection_name)
+
+        if result:
+            log.info(f"query_doc:result {result.ids} {result.metadatas}")
+
+        return result
+    except Exception as e:
+        print(e)
+        raise e
+
+
 def query_doc_with_hybrid_search(
    collection_name: str,
    query: str,
@@ -137,6 +150,24 @@ def query_doc_with_hybrid_search(
        raise e


+def merge_get_results(get_results: list[dict]) -> dict:
+    # Initialize lists to store combined data
+    combined_documents = []
+    combined_metadatas = []
+
+    for data in get_results:
+        combined_documents.extend(data["documents"][0])
+        combined_metadatas.extend(data["metadatas"][0])
+
+    # Create the output dictionary
+    result = {
+        "documents": [combined_documents],
+        "metadatas": [combined_metadatas],
+    }
+
+    return result
+
+
 def merge_and_sort_query_results(
    query_results: list[dict], k: int, reverse: bool = False
 ) -> list[dict]:
@@ -194,6 +225,23 @@ def merge_and_sort_query_results(
    return result


+def get_all_items_from_collections(collection_names: list[str]) -> dict:
+    results = []
+
+    for collection_name in collection_names:
+        if collection_name:
+            try:
+                result = get_doc(collection_name=collection_name)
+                if result is not None:
+                    results.append(result.model_dump())
+            except Exception as e:
+                log.exception(f"Error when querying the collection: {e}")
+        else:
+            pass
+
+    return merge_get_results(results)
+
+
 def query_collection(
    collection_names: list[str],
    queries: list[str],
@@ -311,8 +359,11 @@ def get_sources_from_files(
    reranking_function,
    r,
    hybrid_search,
+    full_context=False,
 ):
-    log.debug(f"files: {files} {queries} {embedding_function} {reranking_function}")
+    log.debug(
+        f"files: {files} {queries} {embedding_function} {reranking_function} {full_context}"
+    )

    extracted_collections = []
    relevant_contexts = []
@@ -350,36 +401,45 @@ def get_sources_from_files(
                log.debug(f"skipping {file} as it has already been extracted")
                continue

-            try:
-                context = None
-                if file.get("type") == "text":
-                    context = file["content"]
-                else:
-                    if hybrid_search:
-                        try:
-                            context = query_collection_with_hybrid_search(
+            if full_context:
+                try:
+                    context = get_all_items_from_collections(collection_names)
+
+                    print("context", context)
+                except Exception as e:
+                    log.exception(e)
+
+            else:
+                try:
+                    context = None
+                    if file.get("type") == "text":
+                        context = file["content"]
+                    else:
+                        if hybrid_search:
+                            try:
+                                context = query_collection_with_hybrid_search(
+                                    collection_names=collection_names,
+                                    queries=queries,
+                                    embedding_function=embedding_function,
+                                    k=k,
+                                    reranking_function=reranking_function,
+                                    r=r,
+                                )
+                            except Exception as e:
+                                log.debug(
+                                    "Error when using hybrid search, using"
+                                    " non hybrid search as fallback."
+                                )
+
+                        if (not hybrid_search) or (context is None):
+                            context = query_collection(
                                collection_names=collection_names,
                                queries=queries,
                                embedding_function=embedding_function,
                                k=k,
-                                reranking_function=reranking_function,
-                                r=r,
                            )
-                        except Exception as e:
-                            log.debug(
-                                "Error when using hybrid search, using"
-                                " non hybrid search as fallback."
-                            )
-
-                    if (not hybrid_search) or (context is None):
-                        context = query_collection(
-                            collection_names=collection_names,
-                            queries=queries,
-                            embedding_function=embedding_function,
-                            k=k,
-                        )
-            except Exception as e:
-                log.exception(e)
+                except Exception as e:
+                    log.exception(e)

            extracted_collections.extend(collection_names)

--- a/backend/open_webui/routers/retrieval.py
+++ b/backend/open_webui/routers/retrieval.py
@@ -351,6 +351,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
    return {
        "status": True,
        "pdf_extract_images": request.app.state.config.PDF_EXTRACT_IMAGES,
+        "RAG_FULL_CONTEXT": request.app.state.config.RAG_FULL_CONTEXT,
        "enable_google_drive_integration": request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION,
        "content_extraction": {
            "engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
@@ -463,6 +464,7 @@ class WebConfig(BaseModel):


 class ConfigUpdateForm(BaseModel):
+    RAG_FULL_CONTEXT: Optional[bool] = None
    pdf_extract_images: Optional[bool] = None
    enable_google_drive_integration: Optional[bool] = None
    file: Optional[FileConfig] = None
@@ -482,6 +484,12 @@ async def update_rag_config(
        else request.app.state.config.PDF_EXTRACT_IMAGES
    )

+    request.app.state.config.RAG_FULL_CONTEXT = (
+        form_data.RAG_FULL_CONTEXT
+        if form_data.RAG_FULL_CONTEXT is not None
+        else request.app.state.config.RAG_FULL_CONTEXT
+    )
+
    request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION = (
        form_data.enable_google_drive_integration
        if form_data.enable_google_drive_integration is not None
@@ -588,6 +596,7 @@ async def update_rag_config(
    return {
        "status": True,
        "pdf_extract_images": request.app.state.config.PDF_EXTRACT_IMAGES,
+        "RAG_FULL_CONTEXT": request.app.state.config.RAG_FULL_CONTEXT,
        "file": {
            "max_size": request.app.state.config.FILE_MAX_SIZE,
            "max_count": request.app.state.config.FILE_MAX_COUNT,
@@ -1379,7 +1388,7 @@ async def process_web_search(
                docs,
                collection_name,
                overwrite=True,
-                user=user
+                user=user,
            )

            return {
--- a/backend/open_webui/utils/middleware.py
+++ b/backend/open_webui/utils/middleware.py
@@ -344,7 +344,7 @@ async def chat_web_search_handler(
                    "query": searchQuery,
                }
            ),
-            user=user
+            user=user,
        )

        if results:
@@ -560,9 +560,9 @@ async def chat_completion_files_handler(
                        reranking_function=request.app.state.rf,
                        r=request.app.state.config.RELEVANCE_THRESHOLD,
                        hybrid_search=request.app.state.config.ENABLE_RAG_HYBRID_SEARCH,
+                        full_context=request.app.state.config.RAG_FULL_CONTEXT,
                    ),
                )
-
        except Exception as e:
            log.exception(e)