This commit is contained in:
Timothy J. Baek
2024-09-29 18:55:26 +02:00
parent f2ec020b64
commit 92dd173b27
4 changed files with 98 additions and 74 deletions

View File

@@ -726,7 +726,6 @@ def process_file(
)
docs = loader.load(file.filename, file.meta.get("content_type"), file_path)
text_content = " ".join([doc.page_content for doc in docs])
log.debug(f"text_content: {text_content}")
Files.update_files_metadata_by_id(
@@ -795,10 +794,17 @@ def process_text(
metadata={"name": form_data.name, "created_by": user.id},
)
]
text_content = form_data.content
log.debug(f"text_content: {text_content}")
result = save_docs_to_vector_db(docs, collection_name)
if result:
return {"status": True, "collection_name": collection_name}
return {
"status": True,
"collection_name": collection_name,
"content": text_content,
}
else:
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
@@ -806,68 +812,6 @@ def process_text(
)
@app.get("/process/dir")
def process_docs_dir(user=Depends(get_admin_user)):
    # Walk DOCS_DIR recursively and push every regular, non-hidden file into
    # the vector store. When the save reports success and no document exists
    # under the sanitized file name, a document record is inserted for
    # user.id. A failure on any single file is logged and the walk moves on
    # to the next file; the handler returns True in every case.
    for entry in Path(DOCS_DIR).rglob("./**/*"):
        try:
            # Skip directories and dot-files.
            if not entry.is_file() or entry.name.startswith("."):
                continue

            tags = extract_folders_after_data_docs(entry)
            filename = entry.name
            content_type, _ = mimetypes.guess_type(entry)

            # The collection name is the file's SHA-256 digest cut to 63 chars.
            with open(entry, "rb") as handle:
                collection_name = calculate_sha256(handle)[:63]

            loader = Loader(
                engine=app.state.config.CONTENT_EXTRACTION_ENGINE,
                TIKA_SERVER_URL=app.state.config.TIKA_SERVER_URL,
                PDF_EXTRACT_IMAGES=app.state.config.PDF_EXTRACT_IMAGES,
            )
            docs = loader.load(filename, content_type, str(entry))

            # Nothing to register unless the vector store accepted the docs.
            if not save_docs_to_vector_db(docs, collection_name):
                continue

            sanitized_filename = sanitize_filename(filename)
            if Documents.get_doc_by_name(sanitized_filename) is not None:
                continue

            # Tags are stored as a JSON string in the document's content field.
            if len(tags) > 0:
                content = json.dumps({"tags": [{"name": tag} for tag in tags]})
            else:
                content = "{}"

            Documents.insert_new_doc(
                user.id,
                DocumentForm(
                    name=sanitized_filename,
                    title=filename,
                    collection_name=collection_name,
                    filename=filename,
                    content=content,
                ),
            )
        except Exception as e:
            # Best-effort import: log the failure and carry on with the walk.
            log.exception(e)

    return True
@app.post("/process/youtube")
def process_youtube_video(form_data: ProcessUrlForm, user=Depends(get_verified_user)):
try:
@@ -882,12 +826,15 @@ def process_youtube_video(form_data: ProcessUrlForm, user=Depends(get_verified_u
translation=app.state.YOUTUBE_LOADER_TRANSLATION,
)
docs = loader.load()
text_content = " ".join([doc.page_content for doc in docs])
log.debug(f"text_content: {text_content}")
save_docs_to_vector_db(docs, collection_name, overwrite=True)
return {
"status": True,
"collection_name": collection_name,
"filename": form_data.url,
"content": text_content,
}
except Exception as e:
log.exception(e)
@@ -910,12 +857,15 @@ def process_web(form_data: ProcessUrlForm, user=Depends(get_verified_user)):
requests_per_second=app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS,
)
docs = loader.load()
text_content = " ".join([doc.page_content for doc in docs])
log.debug(f"text_content: {text_content}")
save_docs_to_vector_db(docs, collection_name, overwrite=True)
return {
"status": True,
"collection_name": collection_name,
"filename": form_data.url,
"content": text_content,
}
except Exception as e:
log.exception(e)
@@ -1067,6 +1017,7 @@ def process_web_search(form_data: SearchForm, user=Depends(get_verified_user)):
loader = get_web_loader(urls)
docs = loader.load()
save_docs_to_vector_db(docs, collection_name, overwrite=True)
return {
@@ -1082,6 +1033,68 @@ def process_web_search(form_data: SearchForm, user=Depends(get_verified_user)):
)
@app.get("/process/dir")
def process_docs_dir(user=Depends(get_admin_user)):
    # Walk DOCS_DIR recursively and push every regular, non-hidden file into
    # the vector store. When the save reports success and no document exists
    # under the sanitized file name, a document record is inserted for
    # user.id. A failure on any single file is logged and the walk moves on
    # to the next file; the handler returns True in every case.
    for entry in Path(DOCS_DIR).rglob("./**/*"):
        try:
            # Skip directories and dot-files.
            if not entry.is_file() or entry.name.startswith("."):
                continue

            tags = extract_folders_after_data_docs(entry)
            filename = entry.name
            content_type, _ = mimetypes.guess_type(entry)

            # The collection name is the file's SHA-256 digest cut to 63 chars.
            with open(entry, "rb") as handle:
                collection_name = calculate_sha256(handle)[:63]

            loader = Loader(
                engine=app.state.config.CONTENT_EXTRACTION_ENGINE,
                TIKA_SERVER_URL=app.state.config.TIKA_SERVER_URL,
                PDF_EXTRACT_IMAGES=app.state.config.PDF_EXTRACT_IMAGES,
            )
            docs = loader.load(filename, content_type, str(entry))

            # Nothing to register unless the vector store accepted the docs.
            if not save_docs_to_vector_db(docs, collection_name):
                continue

            sanitized_filename = sanitize_filename(filename)
            if Documents.get_doc_by_name(sanitized_filename) is not None:
                continue

            # Tags are stored as a JSON string in the document's content field.
            if len(tags) > 0:
                content = json.dumps({"tags": [{"name": tag} for tag in tags]})
            else:
                content = "{}"

            Documents.insert_new_doc(
                user.id,
                DocumentForm(
                    name=sanitized_filename,
                    title=filename,
                    collection_name=collection_name,
                    filename=filename,
                    content=content,
                ),
            )
        except Exception as e:
            # Best-effort import: log the failure and carry on with the walk.
            log.exception(e)

    return True
class QueryDocForm(BaseModel):
collection_name: str
query: str