feat: docling support for document preprocessing

This commit is contained in:
Fabio Polito
2025-02-14 12:08:03 +00:00
parent 22c100bb6b
commit 2419ef06a0
6 changed files with 163 additions and 515 deletions

View File

@@ -351,6 +351,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
"content_extraction": {
"engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
"tika_server_url": request.app.state.config.TIKA_SERVER_URL,
"docling_server_url": request.app.state.config.DOCLING_SERVER_URL,
},
"chunk": {
"text_splitter": request.app.state.config.TEXT_SPLITTER,
@@ -403,6 +404,7 @@ class FileConfig(BaseModel):
# Content-extraction settings carried by the RAG config update form
# (read as `form_data.content_extraction` in update_rag_config).
class ContentExtractionConfig(BaseModel):
# Name of the extraction engine; defaults to an empty string.
# NOTE(review): presumably "" means "use the built-in default loader" — confirm against Loader.
engine: str = ""
# URL of the Tika server; None when the client does not supply one.
tika_server_url: Optional[str] = None
# URL of the Docling server; None when the client does not supply one.
docling_server_url: Optional[str] = None
class ChunkParamUpdateForm(BaseModel):
@@ -483,6 +485,9 @@ async def update_rag_config(
request.app.state.config.TIKA_SERVER_URL = (
form_data.content_extraction.tika_server_url
)
request.app.state.config.DOCLING_SERVER_URL = (
form_data.content_extraction.docling_server_url
)
if form_data.chunk is not None:
request.app.state.config.TEXT_SPLITTER = form_data.chunk.text_splitter
@@ -559,6 +564,7 @@ async def update_rag_config(
"content_extraction": {
"engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
"tika_server_url": request.app.state.config.TIKA_SERVER_URL,
"docling_server_url": request.app.state.config.DOCLING_SERVER_URL,
},
"chunk": {
"text_splitter": request.app.state.config.TEXT_SPLITTER,
@@ -879,6 +885,7 @@ def process_file(
loader = Loader(
engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE,
TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL,
DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL,
PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES,
)
docs = loader.load(