diff --git a/backend/open_webui/retrieval/loaders/main.py b/backend/open_webui/retrieval/loaders/main.py
index 57867d78f5..7dc9df37ce 100644
--- a/backend/open_webui/retrieval/loaders/main.py
+++ b/backend/open_webui/retrieval/loaders/main.py
@@ -1,3 +1,4 @@
+import asyncio
 import requests
 import logging
 import ftfy
@@ -238,6 +239,18 @@ class Loader:
 
         return [Document(page_content=ftfy.fix_text(doc.page_content), metadata=doc.metadata) for doc in docs]
 
+    async def aload(self, filename: str, file_content_type: str, file_path: str) -> list[Document]:
+        """
+        Async wrapper around `load`: same arguments, same returned documents.
+
+        The loaders dispatched by `_get_loader` (PyMuPDF, Unstructured,
+        python-docx, Tika, etc.) are synchronous, so calling `load` directly
+        from an async handler blocks the event loop for the whole parse
+        (reportedly minutes for large PDFs). `asyncio.to_thread` runs `load`
+        in a worker thread and this coroutine awaits its result.
+        """
+        return await asyncio.to_thread(self.load, filename, file_content_type, file_path)
+
     def _is_text_file(self, file_ext: str, file_content_type: str) -> bool:
         return file_ext in known_source_ext or (
             file_content_type
diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py
index c0295ff316..c4f6614adc 100644
--- a/backend/open_webui/routers/retrieval.py
+++ b/backend/open_webui/routers/retrieval.py
@@ -1646,7 +1646,7 @@ async def process_file(
                         MINERU_API_TIMEOUT=request.app.state.config.MINERU_API_TIMEOUT,
                         MINERU_PARAMS=request.app.state.config.MINERU_PARAMS,
                     )
-                    docs = loader.load(file.filename, file.meta.get('content_type'), file_path)
+                    docs = await loader.aload(file.filename, file.meta.get('content_type'), file_path)
 
                     docs = [
                         Document(