refac: process docs dir

This commit is contained in:
Timothy J. Baek
2024-10-04 17:22:00 -07:00
parent 9ad5ffb8c1
commit a6c797d4c2
5 changed files with 79 additions and 95 deletions

View File

@@ -44,7 +44,6 @@ from open_webui.apps.retrieval.utils import (
query_doc_with_hybrid_search,
)
from open_webui.apps.webui.models.documents import DocumentForm, Documents
from open_webui.apps.webui.models.files import Files
from open_webui.config import (
BRAVE_SEARCH_API_KEY,
@@ -1100,68 +1099,6 @@ def process_web_search(form_data: SearchForm, user=Depends(get_verified_user)):
)
@app.get("/process/dir")
def process_docs_dir(user=Depends(get_admin_user)):
    """Walk DOCS_DIR and index every non-hidden file it contains.

    For each regular file whose name does not start with ".", the file is
    hashed to derive a collection name, loaded through ``Loader``, and the
    resulting docs are passed to ``save_docs_to_vector_db``. When that call
    returns a truthy result and no document with the sanitized filename
    exists yet, a new document record is inserted for ``user.id``.

    Any exception raised while handling a file is logged with
    ``log.exception`` and the walk moves on to the next path.

    Returns:
        Always ``True``.
    """
    for entry in Path(DOCS_DIR).rglob("./**/*"):
        try:
            # Only regular files that are not dot-files are processed.
            if not entry.is_file() or entry.name.startswith("."):
                continue

            tags = extract_folders_after_data_docs(entry)
            filename = entry.name
            guessed_type = mimetypes.guess_type(entry)

            # The collection name is the calculate_sha256 result cut to [:63].
            with open(entry, "rb") as handle:
                collection_name = calculate_sha256(handle)[:63]

            config = app.state.config
            loader = Loader(
                engine=config.CONTENT_EXTRACTION_ENGINE,
                TIKA_SERVER_URL=config.TIKA_SERVER_URL,
                PDF_EXTRACT_IMAGES=config.PDF_EXTRACT_IMAGES,
            )
            docs = loader.load(filename, guessed_type[0], str(entry))

            try:
                if not save_docs_to_vector_db(docs, collection_name):
                    continue

                sanitized_filename = sanitize_filename(filename)
                if Documents.get_doc_by_name(sanitized_filename) is not None:
                    # A record with this name already exists; nothing to add.
                    continue

                # Tags are stored as a JSON object; no tags yields "{}".
                if len(tags):
                    content = json.dumps(
                        {"tags": [{"name": name} for name in tags]}
                    )
                else:
                    content = "{}"

                Documents.insert_new_doc(
                    user.id,
                    DocumentForm(
                        name=sanitized_filename,
                        title=filename,
                        collection_name=collection_name,
                        filename=filename,
                        content=content,
                    ),
                )
            except Exception as e:
                # Storage/registration failures are logged, not propagated.
                log.exception(e)
        except Exception as e:
            # A failure on one file must not abort the rest of the walk.
            log.exception(e)

    return True
class QueryDocForm(BaseModel):
collection_name: str
query: str