refac

2025-12-16 11:57:51 +01:00 · 2024-09-29 18:55:26 +02:00
parent f2ec020b64
commit 92dd173b27
4 changed files with 98 additions and 74 deletions
--- a/backend/open_webui/apps/retrieval/main.py
+++ b/backend/open_webui/apps/retrieval/main.py
@@ -726,7 +726,6 @@ def process_file(
        )
        docs = loader.load(file.filename, file.meta.get("content_type"), file_path)
        text_content = " ".join([doc.page_content for doc in docs])
-
        log.debug(f"text_content: {text_content}")

        Files.update_files_metadata_by_id(
@@ -795,10 +794,17 @@ def process_text(
            metadata={"name": form_data.name, "created_by": user.id},
        )
    ]
+    text_content = form_data.content
+    log.debug(f"text_content: {text_content}")
+
    result = save_docs_to_vector_db(docs, collection_name)

    if result:
-        return {"status": True, "collection_name": collection_name}
+        return {
+            "status": True,
+            "collection_name": collection_name,
+            "content": text_content,
+        }
    else:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
@@ -806,68 +812,6 @@ def process_text(
        )


-@app.get("/process/dir")
-def process_docs_dir(user=Depends(get_admin_user)):
-    for path in Path(DOCS_DIR).rglob("./**/*"):
-        try:
-            if path.is_file() and not path.name.startswith("."):
-                tags = extract_folders_after_data_docs(path)
-                filename = path.name
-                file_content_type = mimetypes.guess_type(path)
-
-                with open(path, "rb") as f:
-                    collection_name = calculate_sha256(f)[:63]
-
-                loader = Loader(
-                    engine=app.state.config.CONTENT_EXTRACTION_ENGINE,
-                    TIKA_SERVER_URL=app.state.config.TIKA_SERVER_URL,
-                    PDF_EXTRACT_IMAGES=app.state.config.PDF_EXTRACT_IMAGES,
-                )
-                docs = loader.load(filename, file_content_type[0], str(path))
-
-                try:
-                    result = save_docs_to_vector_db(docs, collection_name)
-
-                    if result:
-                        sanitized_filename = sanitize_filename(filename)
-                        doc = Documents.get_doc_by_name(sanitized_filename)
-
-                        if doc is None:
-                            doc = Documents.insert_new_doc(
-                                user.id,
-                                DocumentForm(
-                                    **{
-                                        "name": sanitized_filename,
-                                        "title": filename,
-                                        "collection_name": collection_name,
-                                        "filename": filename,
-                                        "content": (
-                                            json.dumps(
-                                                {
-                                                    "tags": list(
-                                                        map(
-                                                            lambda name: {"name": name},
-                                                            tags,
-                                                        )
-                                                    )
-                                                }
-                                            )
-                                            if len(tags)
-                                            else "{}"
-                                        ),
-                                    }
-                                ),
-                            )
-                except Exception as e:
-                    log.exception(e)
-                    pass
-
-        except Exception as e:
-            log.exception(e)
-
-    return True
-
-
@app.post("/process/youtube")
 def process_youtube_video(form_data: ProcessUrlForm, user=Depends(get_verified_user)):
    try:
@@ -882,12 +826,15 @@ def process_youtube_video(form_data: ProcessUrlForm, user=Depends(get_verified_u
            translation=app.state.YOUTUBE_LOADER_TRANSLATION,
        )
        docs = loader.load()
+        text_content = " ".join([doc.page_content for doc in docs])
+        log.debug(f"text_content: {text_content}")
        save_docs_to_vector_db(docs, collection_name, overwrite=True)

        return {
            "status": True,
            "collection_name": collection_name,
            "filename": form_data.url,
+            "content": text_content,
        }
    except Exception as e:
        log.exception(e)
@@ -910,12 +857,15 @@ def process_web(form_data: ProcessUrlForm, user=Depends(get_verified_user)):
            requests_per_second=app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS,
        )
        docs = loader.load()
+        text_content = " ".join([doc.page_content for doc in docs])
+        log.debug(f"text_content: {text_content}")
        save_docs_to_vector_db(docs, collection_name, overwrite=True)

        return {
            "status": True,
            "collection_name": collection_name,
            "filename": form_data.url,
+            "content": text_content,
        }
    except Exception as e:
        log.exception(e)
@@ -1067,6 +1017,7 @@ def process_web_search(form_data: SearchForm, user=Depends(get_verified_user)):

        loader = get_web_loader(urls)
        docs = loader.load()
+
        save_docs_to_vector_db(docs, collection_name, overwrite=True)

        return {
@@ -1082,6 +1033,68 @@ def process_web_search(form_data: SearchForm, user=Depends(get_verified_user)):
        )


+@app.get("/process/dir")
+def process_docs_dir(user=Depends(get_admin_user)):
+    for path in Path(DOCS_DIR).rglob("./**/*"):
+        try:
+            if path.is_file() and not path.name.startswith("."):
+                tags = extract_folders_after_data_docs(path)
+                filename = path.name
+                file_content_type = mimetypes.guess_type(path)
+
+                with open(path, "rb") as f:
+                    collection_name = calculate_sha256(f)[:63]
+
+                loader = Loader(
+                    engine=app.state.config.CONTENT_EXTRACTION_ENGINE,
+                    TIKA_SERVER_URL=app.state.config.TIKA_SERVER_URL,
+                    PDF_EXTRACT_IMAGES=app.state.config.PDF_EXTRACT_IMAGES,
+                )
+                docs = loader.load(filename, file_content_type[0], str(path))
+
+                try:
+                    result = save_docs_to_vector_db(docs, collection_name)
+
+                    if result:
+                        sanitized_filename = sanitize_filename(filename)
+                        doc = Documents.get_doc_by_name(sanitized_filename)
+
+                        if doc is None:
+                            doc = Documents.insert_new_doc(
+                                user.id,
+                                DocumentForm(
+                                    **{
+                                        "name": sanitized_filename,
+                                        "title": filename,
+                                        "collection_name": collection_name,
+                                        "filename": filename,
+                                        "content": (
+                                            json.dumps(
+                                                {
+                                                    "tags": list(
+                                                        map(
+                                                            lambda name: {"name": name},
+                                                            tags,
+                                                        )
+                                                    )
+                                                }
+                                            )
+                                            if len(tags)
+                                            else "{}"
+                                        ),
+                                    }
+                                ),
+                            )
+                except Exception as e:
+                    log.exception(e)
+                    pass
+
+        except Exception as e:
+            log.exception(e)
+
+    return True
+
+
 class QueryDocForm(BaseModel):
    collection_name: str
    query: str