feat: chunk min size target for md header splitter

Co-Authored-By: Classic298 <27028174+Classic298@users.noreply.github.com>
2026-02-24 12:11:56 +01:00 · 2026-01-03 19:47:29 +04:00
parent e55bf2c2ac
commit c324359580
4 changed files with 134 additions and 7 deletions
--- a/backend/open_webui/config.py
+++ b/backend/open_webui/config.py
@@ -2886,6 +2886,13 @@ TIKTOKEN_ENCODING_NAME = PersistentConfig(
 CHUNK_SIZE = PersistentConfig(
    "CHUNK_SIZE", "rag.chunk_size", int(os.environ.get("CHUNK_SIZE", "1000"))
 )
+
+CHUNK_MIN_SIZE_TARGET = PersistentConfig(  # size below which merge_docs_to_target_size tries to grow a chunk
+    "CHUNK_MIN_SIZE_TARGET",
+    "rag.chunk_min_size_target",
+    int(os.environ.get("CHUNK_MIN_SIZE_TARGET", "0")),  # defaults to 0; values <= 0 make the merge step return chunks unchanged
+)
+
 CHUNK_OVERLAP = PersistentConfig(
    "CHUNK_OVERLAP",
    "rag.chunk_overlap",
--- a/backend/open_webui/main.py
+++ b/backend/open_webui/main.py
@@ -253,6 +253,7 @@ from open_webui.config import (
    RAG_OLLAMA_BASE_URL,
    RAG_OLLAMA_API_KEY,
    CHUNK_OVERLAP,
+    CHUNK_MIN_SIZE_TARGET,
    CHUNK_SIZE,
    CONTENT_EXTRACTION_ENGINE,
    DATALAB_MARKER_API_KEY,
@@ -900,8 +901,10 @@ app.state.config.ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER = (
 app.state.config.TIKTOKEN_ENCODING_NAME = TIKTOKEN_ENCODING_NAME

 app.state.config.CHUNK_SIZE = CHUNK_SIZE
+app.state.config.CHUNK_MIN_SIZE_TARGET = CHUNK_MIN_SIZE_TARGET
 app.state.config.CHUNK_OVERLAP = CHUNK_OVERLAP

+
 app.state.config.RAG_EMBEDDING_ENGINE = RAG_EMBEDDING_ENGINE
 app.state.config.RAG_EMBEDDING_MODEL = RAG_EMBEDDING_MODEL
 app.state.config.RAG_EMBEDDING_BATCH_SIZE = RAG_EMBEDDING_BATCH_SIZE
--- a/backend/open_webui/routers/retrieval.py
+++ b/backend/open_webui/routers/retrieval.py
@@ -505,6 +505,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
        "TEXT_SPLITTER": request.app.state.config.TEXT_SPLITTER,
        "ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER": request.app.state.config.ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER,
        "CHUNK_SIZE": request.app.state.config.CHUNK_SIZE,
+        "CHUNK_MIN_SIZE_TARGET": request.app.state.config.CHUNK_MIN_SIZE_TARGET,
        "CHUNK_OVERLAP": request.app.state.config.CHUNK_OVERLAP,
        # File upload settings
        "FILE_MAX_SIZE": request.app.state.config.FILE_MAX_SIZE,
@@ -699,6 +700,7 @@ class ConfigForm(BaseModel):
    TEXT_SPLITTER: Optional[str] = None
    ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER: Optional[bool] = None
    CHUNK_SIZE: Optional[int] = None
+    CHUNK_MIN_SIZE_TARGET: Optional[int] = None
    CHUNK_OVERLAP: Optional[int] = None

    # File upload settings
@@ -1006,6 +1008,11 @@ async def update_rag_config(
        if form_data.CHUNK_SIZE is not None
        else request.app.state.config.CHUNK_SIZE
    )
+    request.app.state.config.CHUNK_MIN_SIZE_TARGET = (
+        form_data.CHUNK_MIN_SIZE_TARGET
+        if form_data.CHUNK_MIN_SIZE_TARGET is not None
+        else request.app.state.config.CHUNK_MIN_SIZE_TARGET
+    )
    request.app.state.config.CHUNK_OVERLAP = (
        form_data.CHUNK_OVERLAP
        if form_data.CHUNK_OVERLAP is not None
@@ -1205,6 +1212,7 @@ async def update_rag_config(
        # Chunking settings
        "TEXT_SPLITTER": request.app.state.config.TEXT_SPLITTER,
        "CHUNK_SIZE": request.app.state.config.CHUNK_SIZE,
+        "CHUNK_MIN_SIZE_TARGET": request.app.state.config.CHUNK_MIN_SIZE_TARGET,
        "ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER": request.app.state.config.ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER,
        "CHUNK_OVERLAP": request.app.state.config.CHUNK_OVERLAP,
        # File upload settings
@@ -1286,6 +1294,85 @@ async def update_rag_config(
 ####################################


+def can_merge_chunks(a: Document, b: Document) -> bool:  # True when chunks a and b may be combined into one
+    if a.metadata.get("source") != b.metadata.get("source"):  # differing "source" never merges; two missing sources compare equal
+        return False
+
+    a_file_id = a.metadata.get("file_id")
+    b_file_id = b.metadata.get("file_id")
+
+    if a_file_id is not None and b_file_id is not None:  # file ids are compared only when both chunks carry one
+        return a_file_id == b_file_id
+
+    return True  # same source and at least one file_id absent: treated as mergeable
+
+
+def merge_docs_to_target_size(
+    request: Request,
+    chunks: list[Document],
+) -> list[Document]:
+    """
+    Best-effort normalization of chunk sizes.
+
+    Attempts to grow small chunks up to a desired minimum size,
+    without exceeding the maximum size or crossing source/file
+    boundaries.
+    """
+    min_chunk_size_target = request.app.state.config.CHUNK_MIN_SIZE_TARGET
+    max_chunk_size = request.app.state.config.CHUNK_SIZE
+
+    if min_chunk_size_target <= 0:  # merging disabled: the input list is returned as-is (same object)
+        return chunks
+
+    measure_chunk_size = len  # default unit is characters
+    if request.app.state.config.TEXT_SPLITTER == "token":  # token mode: sizes are counted in tiktoken tokens
+        encoding = tiktoken.get_encoding(
+            str(request.app.state.config.TIKTOKEN_ENCODING_NAME)
+        )
+        measure_chunk_size = lambda text: len(encoding.encode(text))
+
+    processed_chunks: list[Document] = []
+
+    current_chunk: Document | None = None  # first chunk of the run being accumulated; supplies the metadata
+    current_content: str = ""  # text accumulated so far for the current run
+
+    for next_chunk in chunks:
+        if current_chunk is None:
+            current_chunk = next_chunk
+            current_content = next_chunk.page_content
+            continue  # First chunk initialization
+
+        proposed_content = f"{current_content}\n\n{next_chunk.page_content}"  # candidate merge, joined by a blank line
+
+        can_absorb = (  # all three conditions must hold for next_chunk to be appended to the run
+            can_merge_chunks(current_chunk, next_chunk)  # compared against the first chunk of the run
+            and measure_chunk_size(current_content) < min_chunk_size_target  # only runs still below the target grow
+            and measure_chunk_size(proposed_content) <= max_chunk_size  # merged text (separator included) must fit CHUNK_SIZE
+        )
+
+        if can_absorb:
+            current_content = proposed_content
+        else:
+            processed_chunks.append(  # flush the finished run as a new Document
+                Document(
+                    page_content=current_content,
+                    metadata={**current_chunk.metadata},  # shallow copy of the first chunk's metadata; absorbed chunks' metadata is dropped
+                )
+            )
+            current_chunk = next_chunk  # next_chunk starts a new run
+            current_content = next_chunk.page_content
+
+    if current_chunk is not None:  # flush the trailing run; skipped only when chunks was empty
+        processed_chunks.append(
+            Document(
+                page_content=current_content,
+                metadata={**current_chunk.metadata},
+            )
+        )
+
+    return processed_chunks
+
+
 def save_docs_to_vector_db(
    request: Request,
    docs,
@@ -1332,7 +1419,6 @@ def save_docs_to_vector_db(
    if split:
        if request.app.state.config.ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER:
            log.info("Using markdown header text splitter")
-
            # Define headers to split on - covering most common markdown header levels
            markdown_splitter = MarkdownHeaderTextSplitter(
                headers_to_split_on=[
@@ -1361,6 +1447,8 @@ def save_docs_to_vector_db(
                )

            docs = split_docs
+            if request.app.state.config.CHUNK_MIN_SIZE_TARGET > 0:
+                docs = merge_docs_to_target_size(request, docs)

        if request.app.state.config.TEXT_SPLITTER in ["", "character"]:
            text_splitter = RecursiveCharacterTextSplitter(
--- a/src/lib/components/admin/Settings/Documents.svelte
+++ b/src/lib/components/admin/Settings/Documents.svelte
@@ -666,12 +666,12 @@
 								/>
 							</div>

-								<div class="flex w-full mt-2">
-									<SensitiveInput
-										placeholder={$i18n.t('Enter MinerU API Key')}
-										bind:value={RAGConfig.MINERU_API_KEY}
-									/>
-								</div>
+							<div class="flex w-full mt-2">
+								<SensitiveInput
+									placeholder={$i18n.t('Enter MinerU API Key')}
+									bind:value={RAGConfig.MINERU_API_KEY}
+								/>
+							</div>

 							<div class="flex w-full mt-2">
 								<div class="flex-1 flex justify-between">
@@ -798,6 +798,35 @@
 								</div>
 							</div>
 						</div>
+
+						{#if RAGConfig.ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER}
+							<div class="  mb-2.5 flex w-full justify-between">
+								<div class=" flex gap-1.5 w-full">
+									<div class="w-full">
+										<div class="self-center text-xs font-medium min-w-fit mb-1">
+											<Tooltip
+												placement="top-start"
+												content={$i18n.t(
+													'Chunks smaller than this threshold will be merged with neighboring chunks when possible. Set to 0 to disable merging.'
+												)}
+											>
+												{$i18n.t('Chunk Min Size Target')}
+											</Tooltip>
+										</div>
+										<div class="self-center">
+											<input
+												class="w-full rounded-lg py-1.5 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
+												type="number"
+												placeholder={$i18n.t('Enter Chunk Min Size Target')}
+												bind:value={RAGConfig.CHUNK_MIN_SIZE_TARGET}
+												autocomplete="off"
+												min="0"
+											/>
+										</div>
+									</div>
+								</div>
+							</div>
+						{/if}
 					{/if}
 				</div>