feat: merge with main

This commit is contained in:
Fabio Polito
2025-03-05 22:04:34 +00:00
372 changed files with 26027 additions and 10944 deletions

View File

@@ -21,6 +21,7 @@ from fastapi import (
APIRouter,
)
from fastapi.middleware.cors import CORSMiddleware
from fastapi.concurrency import run_in_threadpool
from pydantic import BaseModel
import tiktoken
@@ -45,17 +46,20 @@ from open_webui.retrieval.web.utils import get_web_loader
from open_webui.retrieval.web.brave import search_brave
from open_webui.retrieval.web.kagi import search_kagi
from open_webui.retrieval.web.mojeek import search_mojeek
from open_webui.retrieval.web.bocha import search_bocha
from open_webui.retrieval.web.duckduckgo import search_duckduckgo
from open_webui.retrieval.web.google_pse import search_google_pse
from open_webui.retrieval.web.jina_search import search_jina
from open_webui.retrieval.web.searchapi import search_searchapi
from open_webui.retrieval.web.serpapi import search_serpapi
from open_webui.retrieval.web.searxng import search_searxng
from open_webui.retrieval.web.serper import search_serper
from open_webui.retrieval.web.serply import search_serply
from open_webui.retrieval.web.serpstack import search_serpstack
from open_webui.retrieval.web.tavily import search_tavily
from open_webui.retrieval.web.bing import search_bing
from open_webui.retrieval.web.exa import search_exa
from open_webui.retrieval.web.perplexity import search_perplexity
from open_webui.retrieval.utils import (
get_embedding_function,
@@ -347,11 +351,18 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
return {
"status": True,
"pdf_extract_images": request.app.state.config.PDF_EXTRACT_IMAGES,
"RAG_FULL_CONTEXT": request.app.state.config.RAG_FULL_CONTEXT,
"BYPASS_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL,
"enable_google_drive_integration": request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION,
"enable_onedrive_integration": request.app.state.config.ENABLE_ONEDRIVE_INTEGRATION,
"content_extraction": {
"engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
"tika_server_url": request.app.state.config.TIKA_SERVER_URL,
"docling_server_url": request.app.state.config.DOCLING_SERVER_URL,
"document_intelligence_config": {
"endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
"key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
},
},
"chunk": {
"text_splitter": request.app.state.config.TEXT_SPLITTER,
@@ -368,10 +379,12 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
"proxy_url": request.app.state.config.YOUTUBE_LOADER_PROXY_URL,
},
"web": {
"web_loader_ssl_verification": request.app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION,
"ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION": request.app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION,
"BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL,
"search": {
"enabled": request.app.state.config.ENABLE_RAG_WEB_SEARCH,
"drive": request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION,
"onedrive": request.app.state.config.ENABLE_ONEDRIVE_INTEGRATION,
"engine": request.app.state.config.RAG_WEB_SEARCH_ENGINE,
"searxng_query_url": request.app.state.config.SEARXNG_QUERY_URL,
"google_pse_api_key": request.app.state.config.GOOGLE_PSE_API_KEY,
@@ -379,6 +392,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
"brave_search_api_key": request.app.state.config.BRAVE_SEARCH_API_KEY,
"kagi_search_api_key": request.app.state.config.KAGI_SEARCH_API_KEY,
"mojeek_search_api_key": request.app.state.config.MOJEEK_SEARCH_API_KEY,
"bocha_search_api_key": request.app.state.config.BOCHA_SEARCH_API_KEY,
"serpstack_api_key": request.app.state.config.SERPSTACK_API_KEY,
"serpstack_https": request.app.state.config.SERPSTACK_HTTPS,
"serper_api_key": request.app.state.config.SERPER_API_KEY,
@@ -386,11 +400,17 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
"tavily_api_key": request.app.state.config.TAVILY_API_KEY,
"searchapi_api_key": request.app.state.config.SEARCHAPI_API_KEY,
"searchapi_engine": request.app.state.config.SEARCHAPI_ENGINE,
"serpapi_api_key": request.app.state.config.SERPAPI_API_KEY,
"serpapi_engine": request.app.state.config.SERPAPI_ENGINE,
"jina_api_key": request.app.state.config.JINA_API_KEY,
"bing_search_v7_endpoint": request.app.state.config.BING_SEARCH_V7_ENDPOINT,
"bing_search_v7_subscription_key": request.app.state.config.BING_SEARCH_V7_SUBSCRIPTION_KEY,
"exa_api_key": request.app.state.config.EXA_API_KEY,
"perplexity_api_key": request.app.state.config.PERPLEXITY_API_KEY,
"result_count": request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT,
"trust_env": request.app.state.config.RAG_WEB_SEARCH_TRUST_ENV,
"concurrent_requests": request.app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS,
"domain_filter_list": request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST,
},
},
}
@@ -401,10 +421,16 @@ class FileConfig(BaseModel):
max_count: Optional[int] = None
class DocumentIntelligenceConfigForm(BaseModel):
    """Request payload for the document-intelligence content-extraction engine
    (presumably Azure Document Intelligence — confirm against the loader).

    Both fields are required strings: the service endpoint URL and its API key.
    """

    endpoint: str
    key: str
class ContentExtractionConfig(BaseModel):
    """Form section choosing the content-extraction engine plus per-engine
    connection settings (Tika server, Docling server, document intelligence).
    """

    # Empty string is the default engine selector.
    engine: str = ""
    tika_server_url: Optional[str] = None
    docling_server_url: Optional[str] = None
    # Nested credentials; the update handler only applies these when the
    # sub-form is present (checked with `is not None`).
    document_intelligence_config: Optional[DocumentIntelligenceConfigForm] = None
class ChunkParamUpdateForm(BaseModel):
@@ -428,6 +454,7 @@ class WebSearchConfig(BaseModel):
brave_search_api_key: Optional[str] = None
kagi_search_api_key: Optional[str] = None
mojeek_search_api_key: Optional[str] = None
bocha_search_api_key: Optional[str] = None
serpstack_api_key: Optional[str] = None
serpstack_https: Optional[bool] = None
serper_api_key: Optional[str] = None
@@ -435,21 +462,31 @@ class WebSearchConfig(BaseModel):
tavily_api_key: Optional[str] = None
searchapi_api_key: Optional[str] = None
searchapi_engine: Optional[str] = None
serpapi_api_key: Optional[str] = None
serpapi_engine: Optional[str] = None
jina_api_key: Optional[str] = None
bing_search_v7_endpoint: Optional[str] = None
bing_search_v7_subscription_key: Optional[str] = None
exa_api_key: Optional[str] = None
perplexity_api_key: Optional[str] = None
result_count: Optional[int] = None
concurrent_requests: Optional[int] = None
trust_env: Optional[bool] = None
domain_filter_list: Optional[List[str]] = []
class WebConfig(BaseModel):
    """Web loader / web search section of the RAG config update form."""

    search: WebSearchConfig
    # NOTE(review): this diff view interleaves the old snake_case field and its
    # UPPER_CASE replacement; only ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION is
    # read by update_rag_config in the new code — confirm the legacy
    # web_loader_ssl_verification field was actually removed.
    web_loader_ssl_verification: Optional[bool] = None
    ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION: Optional[bool] = None
    BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL: Optional[bool] = None
class ConfigUpdateForm(BaseModel):
RAG_FULL_CONTEXT: Optional[bool] = None
BYPASS_EMBEDDING_AND_RETRIEVAL: Optional[bool] = None
pdf_extract_images: Optional[bool] = None
enable_google_drive_integration: Optional[bool] = None
enable_onedrive_integration: Optional[bool] = None
file: Optional[FileConfig] = None
content_extraction: Optional[ContentExtractionConfig] = None
chunk: Optional[ChunkParamUpdateForm] = None
@@ -467,18 +504,38 @@ async def update_rag_config(
else request.app.state.config.PDF_EXTRACT_IMAGES
)
request.app.state.config.RAG_FULL_CONTEXT = (
form_data.RAG_FULL_CONTEXT
if form_data.RAG_FULL_CONTEXT is not None
else request.app.state.config.RAG_FULL_CONTEXT
)
request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL = (
form_data.BYPASS_EMBEDDING_AND_RETRIEVAL
if form_data.BYPASS_EMBEDDING_AND_RETRIEVAL is not None
else request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL
)
request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION = (
form_data.enable_google_drive_integration
if form_data.enable_google_drive_integration is not None
else request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION
)
request.app.state.config.ENABLE_ONEDRIVE_INTEGRATION = (
form_data.enable_onedrive_integration
if form_data.enable_onedrive_integration is not None
else request.app.state.config.ENABLE_ONEDRIVE_INTEGRATION
)
if form_data.file is not None:
request.app.state.config.FILE_MAX_SIZE = form_data.file.max_size
request.app.state.config.FILE_MAX_COUNT = form_data.file.max_count
if form_data.content_extraction is not None:
log.info(f"Updating text settings: {form_data.content_extraction}")
log.info(
f"Updating content extraction: {request.app.state.config.CONTENT_EXTRACTION_ENGINE} to {form_data.content_extraction.engine}"
)
request.app.state.config.CONTENT_EXTRACTION_ENGINE = (
form_data.content_extraction.engine
)
@@ -488,6 +545,13 @@ async def update_rag_config(
request.app.state.config.DOCLING_SERVER_URL = (
form_data.content_extraction.docling_server_url
)
if form_data.content_extraction.document_intelligence_config is not None:
request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = (
form_data.content_extraction.document_intelligence_config.endpoint
)
request.app.state.config.DOCUMENT_INTELLIGENCE_KEY = (
form_data.content_extraction.document_intelligence_config.key
)
if form_data.chunk is not None:
request.app.state.config.TEXT_SPLITTER = form_data.chunk.text_splitter
@@ -502,11 +566,16 @@ async def update_rag_config(
if form_data.web is not None:
request.app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = (
# Note: When UI "Bypass SSL verification for Websites"=True then ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION=False
form_data.web.web_loader_ssl_verification
form_data.web.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION
)
request.app.state.config.ENABLE_RAG_WEB_SEARCH = form_data.web.search.enabled
request.app.state.config.RAG_WEB_SEARCH_ENGINE = form_data.web.search.engine
request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL = (
form_data.web.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL
)
request.app.state.config.SEARXNG_QUERY_URL = (
form_data.web.search.searxng_query_url
)
@@ -525,6 +594,9 @@ async def update_rag_config(
request.app.state.config.MOJEEK_SEARCH_API_KEY = (
form_data.web.search.mojeek_search_api_key
)
request.app.state.config.BOCHA_SEARCH_API_KEY = (
form_data.web.search.bocha_search_api_key
)
request.app.state.config.SERPSTACK_API_KEY = (
form_data.web.search.serpstack_api_key
)
@@ -539,6 +611,9 @@ async def update_rag_config(
form_data.web.search.searchapi_engine
)
request.app.state.config.SERPAPI_API_KEY = form_data.web.search.serpapi_api_key
request.app.state.config.SERPAPI_ENGINE = form_data.web.search.serpapi_engine
request.app.state.config.JINA_API_KEY = form_data.web.search.jina_api_key
request.app.state.config.BING_SEARCH_V7_ENDPOINT = (
form_data.web.search.bing_search_v7_endpoint
@@ -547,16 +622,30 @@ async def update_rag_config(
form_data.web.search.bing_search_v7_subscription_key
)
request.app.state.config.EXA_API_KEY = form_data.web.search.exa_api_key
request.app.state.config.PERPLEXITY_API_KEY = (
form_data.web.search.perplexity_api_key
)
request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT = (
form_data.web.search.result_count
)
request.app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS = (
form_data.web.search.concurrent_requests
)
request.app.state.config.RAG_WEB_SEARCH_TRUST_ENV = (
form_data.web.search.trust_env
)
request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST = (
form_data.web.search.domain_filter_list
)
return {
"status": True,
"pdf_extract_images": request.app.state.config.PDF_EXTRACT_IMAGES,
"RAG_FULL_CONTEXT": request.app.state.config.RAG_FULL_CONTEXT,
"BYPASS_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL,
"file": {
"max_size": request.app.state.config.FILE_MAX_SIZE,
"max_count": request.app.state.config.FILE_MAX_COUNT,
@@ -565,6 +654,10 @@ async def update_rag_config(
"engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
"tika_server_url": request.app.state.config.TIKA_SERVER_URL,
"docling_server_url": request.app.state.config.DOCLING_SERVER_URL,
"document_intelligence_config": {
"endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
"key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
},
},
"chunk": {
"text_splitter": request.app.state.config.TEXT_SPLITTER,
@@ -577,7 +670,8 @@ async def update_rag_config(
"translation": request.app.state.YOUTUBE_LOADER_TRANSLATION,
},
"web": {
"web_loader_ssl_verification": request.app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION,
"ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION": request.app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION,
"BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL,
"search": {
"enabled": request.app.state.config.ENABLE_RAG_WEB_SEARCH,
"engine": request.app.state.config.RAG_WEB_SEARCH_ENGINE,
@@ -587,18 +681,25 @@ async def update_rag_config(
"brave_search_api_key": request.app.state.config.BRAVE_SEARCH_API_KEY,
"kagi_search_api_key": request.app.state.config.KAGI_SEARCH_API_KEY,
"mojeek_search_api_key": request.app.state.config.MOJEEK_SEARCH_API_KEY,
"bocha_search_api_key": request.app.state.config.BOCHA_SEARCH_API_KEY,
"serpstack_api_key": request.app.state.config.SERPSTACK_API_KEY,
"serpstack_https": request.app.state.config.SERPSTACK_HTTPS,
"serper_api_key": request.app.state.config.SERPER_API_KEY,
"serply_api_key": request.app.state.config.SERPLY_API_KEY,
"serachapi_api_key": request.app.state.config.SEARCHAPI_API_KEY,
"searchapi_engine": request.app.state.config.SEARCHAPI_ENGINE,
"serpapi_api_key": request.app.state.config.SERPAPI_API_KEY,
"serpapi_engine": request.app.state.config.SERPAPI_ENGINE,
"tavily_api_key": request.app.state.config.TAVILY_API_KEY,
"jina_api_key": request.app.state.config.JINA_API_KEY,
"bing_search_v7_endpoint": request.app.state.config.BING_SEARCH_V7_ENDPOINT,
"bing_search_v7_subscription_key": request.app.state.config.BING_SEARCH_V7_SUBSCRIPTION_KEY,
"exa_api_key": request.app.state.config.EXA_API_KEY,
"perplexity_api_key": request.app.state.config.PERPLEXITY_API_KEY,
"result_count": request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT,
"concurrent_requests": request.app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS,
"trust_env": request.app.state.config.RAG_WEB_SEARCH_TRUST_ENV,
"domain_filter_list": request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST,
},
},
}
@@ -666,6 +767,7 @@ def save_docs_to_vector_db(
overwrite: bool = False,
split: bool = True,
add: bool = False,
user=None,
) -> bool:
def _get_docs_info(docs: list[Document]) -> str:
docs_info = set()
@@ -746,7 +848,11 @@ def save_docs_to_vector_db(
# for meta-data so convert them to string.
for metadata in metadatas:
for key, value in metadata.items():
if isinstance(value, datetime):
if (
isinstance(value, datetime)
or isinstance(value, list)
or isinstance(value, dict)
):
metadata[key] = str(value)
try:
@@ -781,7 +887,7 @@ def save_docs_to_vector_db(
)
embeddings = embedding_function(
list(map(lambda x: x.replace("\n", " "), texts))
list(map(lambda x: x.replace("\n", " "), texts)), user=user
)
items = [
@@ -829,7 +935,12 @@ def process_file(
# Update the content in the file
# Usage: /files/{file_id}/data/content/update
VECTOR_DB_CLIENT.delete_collection(collection_name=f"file-{file.id}")
try:
# /files/{file_id}/data/content/update
VECTOR_DB_CLIENT.delete_collection(collection_name=f"file-{file.id}")
except:
# Audio file upload pipeline
pass
docs = [
Document(
@@ -887,6 +998,8 @@ def process_file(
TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL,
DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL,
PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES,
DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
)
docs = loader.load(
file.filename, file.meta.get("content_type"), file_path
@@ -929,35 +1042,45 @@ def process_file(
hash = calculate_sha256_string(text_content)
Files.update_file_hash_by_id(file.id, hash)
try:
result = save_docs_to_vector_db(
request,
docs=docs,
collection_name=collection_name,
metadata={
"file_id": file.id,
"name": file.filename,
"hash": hash,
},
add=(True if form_data.collection_name else False),
)
if result:
Files.update_file_metadata_by_id(
file.id,
{
"collection_name": collection_name,
if not request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL:
try:
result = save_docs_to_vector_db(
request,
docs=docs,
collection_name=collection_name,
metadata={
"file_id": file.id,
"name": file.filename,
"hash": hash,
},
add=(True if form_data.collection_name else False),
user=user,
)
return {
"status": True,
"collection_name": collection_name,
"filename": file.filename,
"content": text_content,
}
except Exception as e:
raise e
if result:
Files.update_file_metadata_by_id(
file.id,
{
"collection_name": collection_name,
},
)
return {
"status": True,
"collection_name": collection_name,
"filename": file.filename,
"content": text_content,
}
except Exception as e:
raise e
else:
return {
"status": True,
"collection_name": None,
"filename": file.filename,
"content": text_content,
}
except Exception as e:
log.exception(e)
if "No pandoc was found" in str(e):
@@ -997,7 +1120,7 @@ def process_text(
text_content = form_data.content
log.debug(f"text_content: {text_content}")
result = save_docs_to_vector_db(request, docs, collection_name)
result = save_docs_to_vector_db(request, docs, collection_name, user=user)
if result:
return {
"status": True,
@@ -1030,7 +1153,9 @@ def process_youtube_video(
content = " ".join([doc.page_content for doc in docs])
log.debug(f"text_content: {content}")
save_docs_to_vector_db(request, docs, collection_name, overwrite=True)
save_docs_to_vector_db(
request, docs, collection_name, overwrite=True, user=user
)
return {
"status": True,
@@ -1071,7 +1196,13 @@ def process_web(
content = " ".join([doc.page_content for doc in docs])
log.debug(f"text_content: {content}")
save_docs_to_vector_db(request, docs, collection_name, overwrite=True)
if not request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL:
save_docs_to_vector_db(
request, docs, collection_name, overwrite=True, user=user
)
else:
collection_name = None
return {
"status": True,
@@ -1083,6 +1214,7 @@ def process_web(
},
"meta": {
"name": form_data.url,
"source": form_data.url,
},
},
}
@@ -1102,11 +1234,15 @@ def search_web(request: Request, engine: str, query: str) -> list[SearchResult]:
- BRAVE_SEARCH_API_KEY
- KAGI_SEARCH_API_KEY
- MOJEEK_SEARCH_API_KEY
- BOCHA_SEARCH_API_KEY
- SERPSTACK_API_KEY
- SERPER_API_KEY
- SERPLY_API_KEY
- TAVILY_API_KEY
- EXA_API_KEY
- PERPLEXITY_API_KEY
- SEARCHAPI_API_KEY + SEARCHAPI_ENGINE (by default `google`)
- SERPAPI_API_KEY + SERPAPI_ENGINE (by default `google`)
Args:
query (str): The query to search for
"""
@@ -1168,6 +1304,16 @@ def search_web(request: Request, engine: str, query: str) -> list[SearchResult]:
)
else:
raise Exception("No MOJEEK_SEARCH_API_KEY found in environment variables")
elif engine == "bocha":
if request.app.state.config.BOCHA_SEARCH_API_KEY:
return search_bocha(
request.app.state.config.BOCHA_SEARCH_API_KEY,
query,
request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT,
request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST,
)
else:
raise Exception("No BOCHA_SEARCH_API_KEY found in environment variables")
elif engine == "serpstack":
if request.app.state.config.SERPSTACK_API_KEY:
return search_serpstack(
@@ -1211,6 +1357,7 @@ def search_web(request: Request, engine: str, query: str) -> list[SearchResult]:
request.app.state.config.TAVILY_API_KEY,
query,
request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT,
request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST,
)
else:
raise Exception("No TAVILY_API_KEY found in environment variables")
@@ -1225,6 +1372,17 @@ def search_web(request: Request, engine: str, query: str) -> list[SearchResult]:
)
else:
raise Exception("No SEARCHAPI_API_KEY found in environment variables")
elif engine == "serpapi":
if request.app.state.config.SERPAPI_API_KEY:
return search_serpapi(
request.app.state.config.SERPAPI_API_KEY,
request.app.state.config.SERPAPI_ENGINE,
query,
request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT,
request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST,
)
else:
raise Exception("No SERPAPI_API_KEY found in environment variables")
elif engine == "jina":
return search_jina(
request.app.state.config.JINA_API_KEY,
@@ -1240,12 +1398,26 @@ def search_web(request: Request, engine: str, query: str) -> list[SearchResult]:
request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT,
request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST,
)
elif engine == "exa":
return search_exa(
request.app.state.config.EXA_API_KEY,
query,
request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT,
request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST,
)
elif engine == "perplexity":
return search_perplexity(
request.app.state.config.PERPLEXITY_API_KEY,
query,
request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT,
request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST,
)
else:
raise Exception("No search engine API key found in environment variables")
@router.post("/process/web/search")
def process_web_search(
async def process_web_search(
request: Request, form_data: SearchForm, user=Depends(get_verified_user)
):
try:
@@ -1277,15 +1449,40 @@ def process_web_search(
urls,
verify_ssl=request.app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION,
requests_per_second=request.app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS,
trust_env=request.app.state.config.RAG_WEB_SEARCH_TRUST_ENV,
)
docs = loader.load()
save_docs_to_vector_db(request, docs, collection_name, overwrite=True)
docs = await loader.aload()
return {
"status": True,
"collection_name": collection_name,
"filenames": urls,
}
if request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL:
return {
"status": True,
"collection_name": None,
"filenames": urls,
"docs": [
{
"content": doc.page_content,
"metadata": doc.metadata,
}
for doc in docs
],
"loaded_count": len(docs),
}
else:
await run_in_threadpool(
save_docs_to_vector_db,
request,
docs,
collection_name,
overwrite=True,
user=user,
)
return {
"status": True,
"collection_name": collection_name,
"filenames": urls,
"loaded_count": len(docs),
}
except Exception as e:
log.exception(e)
raise HTTPException(
@@ -1313,7 +1510,9 @@ def query_doc_handler(
return query_doc_with_hybrid_search(
collection_name=form_data.collection_name,
query=form_data.query,
embedding_function=request.app.state.EMBEDDING_FUNCTION,
embedding_function=lambda query: request.app.state.EMBEDDING_FUNCTION(
query, user=user
),
k=form_data.k if form_data.k else request.app.state.config.TOP_K,
reranking_function=request.app.state.rf,
r=(
@@ -1321,12 +1520,16 @@ def query_doc_handler(
if form_data.r
else request.app.state.config.RELEVANCE_THRESHOLD
),
user=user,
)
else:
return query_doc(
collection_name=form_data.collection_name,
query_embedding=request.app.state.EMBEDDING_FUNCTION(form_data.query),
query_embedding=request.app.state.EMBEDDING_FUNCTION(
form_data.query, user=user
),
k=form_data.k if form_data.k else request.app.state.config.TOP_K,
user=user,
)
except Exception as e:
log.exception(e)
@@ -1355,7 +1558,9 @@ def query_collection_handler(
return query_collection_with_hybrid_search(
collection_names=form_data.collection_names,
queries=[form_data.query],
embedding_function=request.app.state.EMBEDDING_FUNCTION,
embedding_function=lambda query: request.app.state.EMBEDDING_FUNCTION(
query, user=user
),
k=form_data.k if form_data.k else request.app.state.config.TOP_K,
reranking_function=request.app.state.rf,
r=(
@@ -1368,7 +1573,9 @@ def query_collection_handler(
return query_collection(
collection_names=form_data.collection_names,
queries=[form_data.query],
embedding_function=request.app.state.EMBEDDING_FUNCTION,
embedding_function=lambda query: request.app.state.EMBEDDING_FUNCTION(
query, user=user
),
k=form_data.k if form_data.k else request.app.state.config.TOP_K,
)
@@ -1432,11 +1639,11 @@ def reset_upload_dir(user=Depends(get_admin_user)) -> bool:
elif os.path.isdir(file_path):
shutil.rmtree(file_path) # Remove the directory
except Exception as e:
print(f"Failed to delete {file_path}. Reason: {e}")
log.exception(f"Failed to delete {file_path}. Reason: {e}")
else:
print(f"The directory {folder} does not exist")
log.warning(f"The directory {folder} does not exist")
except Exception as e:
print(f"Failed to process the directory {folder}. Reason: {e}")
log.exception(f"Failed to process the directory {folder}. Reason: {e}")
return True
@@ -1516,6 +1723,7 @@ def process_files_batch(
docs=all_docs,
collection_name=collection_name,
add=True,
user=user,
)
# Update all files with collection name