mirror of
https://github.com/open-webui/open-webui.git
synced 2026-02-24 12:11:56 +01:00
feat: chunk min size target for md header splitter
Co-Authored-By: Classic298 <27028174+Classic298@users.noreply.github.com>
This commit is contained in:
@@ -2886,6 +2886,13 @@ TIKTOKEN_ENCODING_NAME = PersistentConfig(
|
||||
CHUNK_SIZE = PersistentConfig(
|
||||
"CHUNK_SIZE", "rag.chunk_size", int(os.environ.get("CHUNK_SIZE", "1000"))
|
||||
)
|
||||
|
||||
CHUNK_MIN_SIZE_TARGET = PersistentConfig(
|
||||
"CHUNK_MIN_SIZE_TARGET",
|
||||
"rag.chunk_min_size_target",
|
||||
int(os.environ.get("CHUNK_MIN_SIZE_TARGET", "0")),
|
||||
)
|
||||
|
||||
CHUNK_OVERLAP = PersistentConfig(
|
||||
"CHUNK_OVERLAP",
|
||||
"rag.chunk_overlap",
|
||||
|
||||
@@ -253,6 +253,7 @@ from open_webui.config import (
|
||||
RAG_OLLAMA_BASE_URL,
|
||||
RAG_OLLAMA_API_KEY,
|
||||
CHUNK_OVERLAP,
|
||||
CHUNK_MIN_SIZE_TARGET,
|
||||
CHUNK_SIZE,
|
||||
CONTENT_EXTRACTION_ENGINE,
|
||||
DATALAB_MARKER_API_KEY,
|
||||
@@ -900,8 +901,10 @@ app.state.config.ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER = (
|
||||
app.state.config.TIKTOKEN_ENCODING_NAME = TIKTOKEN_ENCODING_NAME
|
||||
|
||||
app.state.config.CHUNK_SIZE = CHUNK_SIZE
|
||||
app.state.config.CHUNK_MIN_SIZE_TARGET = CHUNK_MIN_SIZE_TARGET
|
||||
app.state.config.CHUNK_OVERLAP = CHUNK_OVERLAP
|
||||
|
||||
|
||||
app.state.config.RAG_EMBEDDING_ENGINE = RAG_EMBEDDING_ENGINE
|
||||
app.state.config.RAG_EMBEDDING_MODEL = RAG_EMBEDDING_MODEL
|
||||
app.state.config.RAG_EMBEDDING_BATCH_SIZE = RAG_EMBEDDING_BATCH_SIZE
|
||||
|
||||
@@ -505,6 +505,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
|
||||
"TEXT_SPLITTER": request.app.state.config.TEXT_SPLITTER,
|
||||
"ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER": request.app.state.config.ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER,
|
||||
"CHUNK_SIZE": request.app.state.config.CHUNK_SIZE,
|
||||
"CHUNK_MIN_SIZE_TARGET": request.app.state.config.CHUNK_MIN_SIZE_TARGET,
|
||||
"CHUNK_OVERLAP": request.app.state.config.CHUNK_OVERLAP,
|
||||
# File upload settings
|
||||
"FILE_MAX_SIZE": request.app.state.config.FILE_MAX_SIZE,
|
||||
@@ -699,6 +700,7 @@ class ConfigForm(BaseModel):
|
||||
TEXT_SPLITTER: Optional[str] = None
|
||||
ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER: Optional[bool] = None
|
||||
CHUNK_SIZE: Optional[int] = None
|
||||
CHUNK_MIN_SIZE_TARGET: Optional[int] = None
|
||||
CHUNK_OVERLAP: Optional[int] = None
|
||||
|
||||
# File upload settings
|
||||
@@ -1006,6 +1008,11 @@ async def update_rag_config(
|
||||
if form_data.CHUNK_SIZE is not None
|
||||
else request.app.state.config.CHUNK_SIZE
|
||||
)
|
||||
request.app.state.config.CHUNK_MIN_SIZE_TARGET = (
|
||||
form_data.CHUNK_MIN_SIZE_TARGET
|
||||
if form_data.CHUNK_MIN_SIZE_TARGET is not None
|
||||
else request.app.state.config.CHUNK_MIN_SIZE_TARGET
|
||||
)
|
||||
request.app.state.config.CHUNK_OVERLAP = (
|
||||
form_data.CHUNK_OVERLAP
|
||||
if form_data.CHUNK_OVERLAP is not None
|
||||
@@ -1205,6 +1212,7 @@ async def update_rag_config(
|
||||
# Chunking settings
|
||||
"TEXT_SPLITTER": request.app.state.config.TEXT_SPLITTER,
|
||||
"CHUNK_SIZE": request.app.state.config.CHUNK_SIZE,
|
||||
"CHUNK_MIN_SIZE_TARGET": request.app.state.config.CHUNK_MIN_SIZE_TARGET,
|
||||
"ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER": request.app.state.config.ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER,
|
||||
"CHUNK_OVERLAP": request.app.state.config.CHUNK_OVERLAP,
|
||||
# File upload settings
|
||||
@@ -1286,6 +1294,85 @@ async def update_rag_config(
|
||||
####################################
|
||||
|
||||
|
||||
def can_merge_chunks(a: Document, b: Document) -> bool:
    """
    Tell whether two chunks may be combined into a single chunk.

    The chunks must agree on their ``source`` metadata value (two missing
    values count as agreeing). If both chunks additionally carry a
    non-``None`` ``file_id``, those ids must be equal as well; when either
    side has no ``file_id`` the source match alone is sufficient.
    """
    meta_a, meta_b = a.metadata, b.metadata

    # Different origin: never merge.
    if meta_a.get("source") != meta_b.get("source"):
        return False

    id_a, id_b = meta_a.get("file_id"), meta_b.get("file_id")

    # A file id that is unknown on either side cannot veto the merge.
    either_unknown = id_a is None or id_b is None
    return True if either_unknown else id_a == id_b
|
||||
|
||||
|
||||
def merge_docs_to_target_size(
    request: Request,
    chunks: list[Document],
) -> list[Document]:
    """
    Best-effort normalization of chunk sizes.

    Walks ``chunks`` in order and greedily appends each chunk to the one
    currently being accumulated for as long as all of the following hold:

    * the accumulated text is still smaller than ``CHUNK_MIN_SIZE_TARGET``,
    * the combined text does not exceed ``CHUNK_SIZE``,
    * ``can_merge_chunks`` allows it (no crossing of source/file boundaries).

    Merged texts are joined with a blank line (``"\\n\\n"``). Sizes are
    measured in characters, or in tokens of the ``TIKTOKEN_ENCODING_NAME``
    encoding when ``TEXT_SPLITTER`` is ``"token"``.

    Args:
        request: Request whose ``app.state.config`` supplies
            ``CHUNK_MIN_SIZE_TARGET``, ``CHUNK_SIZE``, ``TEXT_SPLITTER`` and
            ``TIKTOKEN_ENCODING_NAME``.
        chunks: Chunks to normalize. Only adjacent chunks are ever merged,
            so their order matters.

    Returns:
        ``chunks`` itself, untouched, when the minimum target is ``<= 0``
        (merging disabled). Otherwise a new list of new ``Document``
        objects, each carrying a shallow copy of the metadata of the first
        chunk it was built from; metadata of absorbed chunks is not kept.
        A chunk can still end up below the target when nothing can be
        absorbed into it.
    """
    config = request.app.state.config
    min_chunk_size_target = config.CHUNK_MIN_SIZE_TARGET
    max_chunk_size = config.CHUNK_SIZE

    if min_chunk_size_target <= 0:
        return chunks

    if config.TEXT_SPLITTER == "token":
        encoding = tiktoken.get_encoding(str(config.TIKTOKEN_ENCODING_NAME))

        def measure_chunk_size(text: str) -> int:
            return len(encoding.encode(text))

    else:
        measure_chunk_size = len

    def build_document(template: Document, content: str) -> Document:
        # Copy the metadata so the result does not alias the input chunk's dict.
        return Document(page_content=content, metadata={**template.metadata})

    processed_chunks: list[Document] = []

    current_chunk: Document | None = None
    current_content: str = ""
    # Size of ``current_content``, cached so the accumulated text is not
    # re-measured (re-tokenized in token mode) on every iteration.
    current_size: int = 0

    for next_chunk in chunks:
        if current_chunk is not None:
            # Cheap checks first; the candidate text is only built and
            # measured when a merge is still possible.
            if current_size < min_chunk_size_target and can_merge_chunks(
                current_chunk, next_chunk
            ):
                proposed_content = f"{current_content}\n\n{next_chunk.page_content}"
                proposed_size = measure_chunk_size(proposed_content)

                if proposed_size <= max_chunk_size:
                    current_content = proposed_content
                    current_size = proposed_size
                    continue

            # Could not absorb: emit what has been accumulated so far.
            processed_chunks.append(build_document(current_chunk, current_content))

        # Start a new accumulation from this chunk (also handles the first one).
        current_chunk = next_chunk
        current_content = next_chunk.page_content
        current_size = measure_chunk_size(current_content)

    if current_chunk is not None:
        processed_chunks.append(build_document(current_chunk, current_content))

    return processed_chunks
|
||||
|
||||
|
||||
def save_docs_to_vector_db(
|
||||
request: Request,
|
||||
docs,
|
||||
@@ -1332,7 +1419,6 @@ def save_docs_to_vector_db(
|
||||
if split:
|
||||
if request.app.state.config.ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER:
|
||||
log.info("Using markdown header text splitter")
|
||||
|
||||
# Define headers to split on - covering most common markdown header levels
|
||||
markdown_splitter = MarkdownHeaderTextSplitter(
|
||||
headers_to_split_on=[
|
||||
@@ -1361,6 +1447,8 @@ def save_docs_to_vector_db(
|
||||
)
|
||||
|
||||
docs = split_docs
|
||||
if request.app.state.config.CHUNK_MIN_SIZE_TARGET > 0:
|
||||
docs = merge_docs_to_target_size(request, docs)
|
||||
|
||||
if request.app.state.config.TEXT_SPLITTER in ["", "character"]:
|
||||
text_splitter = RecursiveCharacterTextSplitter(
|
||||
|
||||
@@ -666,12 +666,12 @@
|
||||
/>
|
||||
</div>
|
||||
|
||||
<div class="flex w-full mt-2">
|
||||
<SensitiveInput
|
||||
placeholder={$i18n.t('Enter MinerU API Key')}
|
||||
bind:value={RAGConfig.MINERU_API_KEY}
|
||||
/>
|
||||
</div>
|
||||
<div class="flex w-full mt-2">
|
||||
<SensitiveInput
|
||||
placeholder={$i18n.t('Enter MinerU API Key')}
|
||||
bind:value={RAGConfig.MINERU_API_KEY}
|
||||
/>
|
||||
</div>
|
||||
|
||||
<div class="flex w-full mt-2">
|
||||
<div class="flex-1 flex justify-between">
|
||||
@@ -798,6 +798,35 @@
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{#if RAGConfig.ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER}
|
||||
<div class=" mb-2.5 flex w-full justify-between">
|
||||
<div class=" flex gap-1.5 w-full">
|
||||
<div class="w-full">
|
||||
<div class="self-center text-xs font-medium min-w-fit mb-1">
|
||||
<Tooltip
|
||||
placement="top-start"
|
||||
content={$i18n.t(
|
||||
'Chunks smaller than this threshold will be merged with neighboring chunks when possible. Set to 0 to disable merging.'
|
||||
)}
|
||||
>
|
||||
{$i18n.t('Chunk Min Size Target')}
|
||||
</Tooltip>
|
||||
</div>
|
||||
<div class="self-center">
|
||||
<input
|
||||
class="w-full rounded-lg py-1.5 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
|
||||
type="number"
|
||||
placeholder={$i18n.t('Enter Chunk Min Size Target')}
|
||||
bind:value={RAGConfig.CHUNK_MIN_SIZE_TARGET}
|
||||
autocomplete="off"
|
||||
min="0"
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{/if}
|
||||
{/if}
|
||||
</div>
|
||||
|
||||
|
||||
Reference in New Issue
Block a user