mirror of
https://github.com/open-webui/open-webui.git
synced 2026-05-18 05:05:09 +02:00
fix(retrieval): offload Loader.load to a worker thread so file uploads stop blocking the event loop (#23705)
Loader.load() dispatches to the underlying langchain document loaders (PyMuPDF, Unstructured, python-docx, Tika, …) which are all synchronous and CPU/IO-bound. process_file() awaited it directly on the event loop, so parsing a non-trivial PDF/DOCX would freeze the entire FastAPI app for the duration of the parse — which is what users experience as "the server hangs whenever I upload a file." Add an `aload()` async wrapper on Loader that runs the sync load on a worker thread via asyncio.to_thread, and update process_file() to await it. The sync API is preserved so existing callers that already run inside run_in_threadpool (e.g. save_docs_to_vector_db) are unaffected. https://claude.ai/code/session_01JSr4NZSskEUQvoJnavVXh8 Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
import asyncio
|
||||
import requests
|
||||
import logging
|
||||
import ftfy
|
||||
@@ -238,6 +239,18 @@ class Loader:
|
||||
|
||||
return [Document(page_content=ftfy.fix_text(doc.page_content), metadata=doc.metadata) for doc in docs]
|
||||
|
||||
async def aload(self, filename: str, file_content_type: str, file_path: str) -> list[Document]:
    """
    Awaitable counterpart of `load` that keeps the event loop free.

    `load` is a plain synchronous call, so invoking it directly inside a
    coroutine stalls every other task until parsing finishes. Here the
    same call is handed to `asyncio.to_thread`, which executes it on a
    worker thread while the caller awaits the outcome.

    The three arguments are forwarded to `load` positionally and
    unchanged, and whatever `load` returns (or raises) is passed straight
    back to the awaiting caller.
    """
    # Bundle the arguments once so the hand-off to the worker thread
    # mirrors the synchronous `load(filename, file_content_type, file_path)`.
    load_args = (filename, file_content_type, file_path)
    documents = await asyncio.to_thread(self.load, *load_args)
    return documents
|
||||
|
||||
def _is_text_file(self, file_ext: str, file_content_type: str) -> bool:
|
||||
return file_ext in known_source_ext or (
|
||||
file_content_type
|
||||
|
||||
@@ -1646,7 +1646,7 @@ async def process_file(
|
||||
MINERU_API_TIMEOUT=request.app.state.config.MINERU_API_TIMEOUT,
|
||||
MINERU_PARAMS=request.app.state.config.MINERU_PARAMS,
|
||||
)
|
||||
docs = loader.load(file.filename, file.meta.get('content_type'), file_path)
|
||||
docs = await loader.aload(file.filename, file.meta.get('content_type'), file_path)
|
||||
|
||||
docs = [
|
||||
Document(
|
||||
|
||||
Reference in New Issue
Block a user