From c32435958073cf002d87e78544baa88bc4e15d7f Mon Sep 17 00:00:00 2001 From: Timothy Jaeryang Baek Date: Sat, 3 Jan 2026 19:47:29 +0400 Subject: [PATCH] feat: chunk min size target for md header splitter Co-Authored-By: Classic298 <27028174+Classic298@users.noreply.github.com> --- backend/open_webui/config.py | 7 ++ backend/open_webui/main.py | 3 + backend/open_webui/routers/retrieval.py | 90 ++++++++++++++++++- .../admin/Settings/Documents.svelte | 41 +++++++-- 4 files changed, 134 insertions(+), 7 deletions(-) diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index dac65f0ccb..a18b0612f3 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -2886,6 +2886,13 @@ TIKTOKEN_ENCODING_NAME = PersistentConfig( CHUNK_SIZE = PersistentConfig( "CHUNK_SIZE", "rag.chunk_size", int(os.environ.get("CHUNK_SIZE", "1000")) ) + +CHUNK_MIN_SIZE_TARGET = PersistentConfig( + "CHUNK_MIN_SIZE_TARGET", + "rag.chunk_min_size_target", + int(os.environ.get("CHUNK_MIN_SIZE_TARGET", "0")), +) + CHUNK_OVERLAP = PersistentConfig( "CHUNK_OVERLAP", "rag.chunk_overlap", diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index 320e5c5cf3..37a6589fe6 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -253,6 +253,7 @@ from open_webui.config import ( RAG_OLLAMA_BASE_URL, RAG_OLLAMA_API_KEY, CHUNK_OVERLAP, + CHUNK_MIN_SIZE_TARGET, CHUNK_SIZE, CONTENT_EXTRACTION_ENGINE, DATALAB_MARKER_API_KEY, @@ -900,8 +901,10 @@ app.state.config.ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER = ( app.state.config.TIKTOKEN_ENCODING_NAME = TIKTOKEN_ENCODING_NAME app.state.config.CHUNK_SIZE = CHUNK_SIZE +app.state.config.CHUNK_MIN_SIZE_TARGET = CHUNK_MIN_SIZE_TARGET app.state.config.CHUNK_OVERLAP = CHUNK_OVERLAP + app.state.config.RAG_EMBEDDING_ENGINE = RAG_EMBEDDING_ENGINE app.state.config.RAG_EMBEDDING_MODEL = RAG_EMBEDDING_MODEL app.state.config.RAG_EMBEDDING_BATCH_SIZE = RAG_EMBEDDING_BATCH_SIZE diff --git 
a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index 6f6baedc91..b6743992fa 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -505,6 +505,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)): "TEXT_SPLITTER": request.app.state.config.TEXT_SPLITTER, "ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER": request.app.state.config.ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER, "CHUNK_SIZE": request.app.state.config.CHUNK_SIZE, + "CHUNK_MIN_SIZE_TARGET": request.app.state.config.CHUNK_MIN_SIZE_TARGET, "CHUNK_OVERLAP": request.app.state.config.CHUNK_OVERLAP, # File upload settings "FILE_MAX_SIZE": request.app.state.config.FILE_MAX_SIZE, @@ -699,6 +700,7 @@ class ConfigForm(BaseModel): TEXT_SPLITTER: Optional[str] = None ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER: Optional[bool] = None CHUNK_SIZE: Optional[int] = None + CHUNK_MIN_SIZE_TARGET: Optional[int] = None CHUNK_OVERLAP: Optional[int] = None # File upload settings @@ -1006,6 +1008,11 @@ async def update_rag_config( if form_data.CHUNK_SIZE is not None else request.app.state.config.CHUNK_SIZE ) + request.app.state.config.CHUNK_MIN_SIZE_TARGET = ( + form_data.CHUNK_MIN_SIZE_TARGET + if form_data.CHUNK_MIN_SIZE_TARGET is not None + else request.app.state.config.CHUNK_MIN_SIZE_TARGET + ) request.app.state.config.CHUNK_OVERLAP = ( form_data.CHUNK_OVERLAP if form_data.CHUNK_OVERLAP is not None @@ -1205,6 +1212,7 @@ async def update_rag_config( # Chunking settings "TEXT_SPLITTER": request.app.state.config.TEXT_SPLITTER, "CHUNK_SIZE": request.app.state.config.CHUNK_SIZE, + "CHUNK_MIN_SIZE_TARGET": request.app.state.config.CHUNK_MIN_SIZE_TARGET, "ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER": request.app.state.config.ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER, "CHUNK_OVERLAP": request.app.state.config.CHUNK_OVERLAP, # File upload settings @@ -1286,6 +1294,85 @@ async def update_rag_config( #################################### +def can_merge_chunks(a: 
def can_merge_chunks(a: Document, b: Document) -> bool:
    """
    Decide whether two chunks may be combined into a single chunk.

    Chunks are mergeable only when their ``source`` metadata values are
    equal (two chunks that both lack ``source`` compare equal). When both
    chunks also carry a ``file_id``, those ids must match as well; if either
    chunk has no ``file_id``, the ``source`` comparison alone decides.

    Args:
        a: The chunk currently being accumulated.
        b: The candidate chunk to append to ``a``.

    Returns:
        True when the two chunks may be merged, False otherwise.
    """
    if a.metadata.get("source") != b.metadata.get("source"):
        return False

    a_file_id = a.metadata.get("file_id")
    b_file_id = b.metadata.get("file_id")

    # Only compare file ids when both sides actually have one.
    if a_file_id is not None and b_file_id is not None:
        return a_file_id == b_file_id

    return True


def merge_docs_to_target_size(
    request: Request,
    chunks: list[Document],
) -> list[Document]:
    """
    Best-effort normalization of chunk sizes.

    Walks ``chunks`` in order and greedily appends each chunk to the one
    currently being accumulated (joined by a blank line) as long as all of
    the following hold:

    * ``can_merge_chunks`` allows the pair (same source / file),
    * the accumulated chunk is still smaller than ``CHUNK_MIN_SIZE_TARGET``,
    * the merged text would not exceed ``CHUNK_SIZE``.

    Sizes are measured in characters, or in tiktoken tokens when
    ``TEXT_SPLITTER`` is ``"token"``. Merging only ever looks forward, so the
    result may still contain chunks below the target (for example a small
    trailing chunk, or one whose neighbour would push it past ``CHUNK_SIZE``).

    Args:
        request: Request whose ``app.state.config`` supplies
            ``CHUNK_MIN_SIZE_TARGET``, ``CHUNK_SIZE``, ``TEXT_SPLITTER`` and
            ``TIKTOKEN_ENCODING_NAME``.
        chunks: Chunks in document order.

    Returns:
        ``chunks`` itself, untouched, when the minimum size target is not
        positive. Otherwise a new list of new ``Document`` objects; each
        merged chunk carries a shallow copy of the metadata of the first
        chunk that went into it.
    """
    config = request.app.state.config
    min_chunk_size_target = config.CHUNK_MIN_SIZE_TARGET
    max_chunk_size = config.CHUNK_SIZE

    if min_chunk_size_target <= 0:
        return chunks

    if config.TEXT_SPLITTER == "token":
        encoding = tiktoken.get_encoding(str(config.TIKTOKEN_ENCODING_NAME))

        def measure_chunk_size(text: str) -> int:
            return len(encoding.encode(text))

    else:
        measure_chunk_size = len

    processed_chunks: list[Document] = []

    current_chunk: Document | None = None
    current_content: str = ""
    # Size of ``current_content``; None means "not measured yet". Caching it
    # avoids re-measuring (re-tokenizing, in token mode) the ever-growing
    # accumulated text on every iteration.
    current_size: int | None = None

    for next_chunk in chunks:
        if current_chunk is None:
            # First chunk initialization
            current_chunk = next_chunk
            current_content = next_chunk.page_content
            continue

        absorbed = False
        if can_merge_chunks(current_chunk, next_chunk):
            if current_size is None:
                current_size = measure_chunk_size(current_content)

            if current_size < min_chunk_size_target:
                # Build the merged text only once it is actually a candidate.
                proposed_content = (
                    f"{current_content}\n\n{next_chunk.page_content}"
                )
                proposed_size = measure_chunk_size(proposed_content)

                if proposed_size <= max_chunk_size:
                    current_content = proposed_content
                    # The merged size is the accumulated size from now on.
                    current_size = proposed_size
                    absorbed = True

        if not absorbed:
            # Emit what has been accumulated and start over from next_chunk.
            processed_chunks.append(
                Document(
                    page_content=current_content,
                    metadata={**current_chunk.metadata},
                )
            )
            current_chunk = next_chunk
            current_content = next_chunk.page_content
            current_size = None

    if current_chunk is not None:
        # Flush the final accumulated chunk.
        processed_chunks.append(
            Document(
                page_content=current_content,
                metadata={**current_chunk.metadata},
            )
        )

    return processed_chunks
+ + def save_docs_to_vector_db( request: Request, docs, @@ -1332,7 +1419,6 @@ def save_docs_to_vector_db( if split: if request.app.state.config.ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER: log.info("Using markdown header text splitter") - # Define headers to split on - covering most common markdown header levels markdown_splitter = MarkdownHeaderTextSplitter( headers_to_split_on=[ @@ -1361,6 +1447,8 @@ def save_docs_to_vector_db( ) docs = split_docs + if request.app.state.config.CHUNK_MIN_SIZE_TARGET > 0: + docs = merge_docs_to_target_size(request, docs) if request.app.state.config.TEXT_SPLITTER in ["", "character"]: text_splitter = RecursiveCharacterTextSplitter( diff --git a/src/lib/components/admin/Settings/Documents.svelte b/src/lib/components/admin/Settings/Documents.svelte index 99916b2036..732d824692 100644 --- a/src/lib/components/admin/Settings/Documents.svelte +++ b/src/lib/components/admin/Settings/Documents.svelte @@ -666,12 +666,12 @@ /> -
- -
+
+ +
@@ -798,6 +798,35 @@
+ + {#if RAGConfig.ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER} +
+
+
+
+ + {$i18n.t('Chunk Min Size Target')} + +
+
+ +
+
+
+
+ {/if} {/if}