From d9fd2a3f30481efa24cc54193bf2f67fd0299b52 Mon Sep 17 00:00:00 2001
From: Timothy Jaeryang Baek
Date: Sun, 22 Feb 2026 18:42:25 -0600
Subject: [PATCH] refac

---
 backend/open_webui/retrieval/utils.py | 32 ++++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/backend/open_webui/retrieval/utils.py b/backend/open_webui/retrieval/utils.py
index d328ba51a8..82896f00f9 100644
--- a/backend/open_webui/retrieval/utils.py
+++ b/backend/open_webui/retrieval/utils.py
@@ -88,6 +88,14 @@ def get_content_from_url(request, url: str) -> str:
     return content, docs
 
 
+CHUNK_HASH_KEY = "_chunk_hash"
+
+
+def _content_hash(text: str) -> str:
+    """SHA-256 hash of text, used as a stable chunk identifier for RRF dedup."""
+    return hashlib.sha256(text.encode()).hexdigest()
+
+
 class VectorSearchRetriever(BaseRetriever):
     collection_name: Any
     embedding_function: Any
@@ -126,9 +134,11 @@ class VectorSearchRetriever(BaseRetriever):
 
         results = []
         for idx in range(len(ids)):
+            metadata = dict(metadatas[idx])  # copy: do not mutate the search result's metadata
+            metadata[CHUNK_HASH_KEY] = _content_hash(documents[idx])
             results.append(
                 Document(
-                    metadata=metadatas[idx],
+                    metadata=metadata,
                     page_content=documents[idx],
                 )
             )
@@ -240,15 +250,21 @@ async def query_doc_with_hybrid_search(
 
         log.debug(f"query_doc_with_hybrid_search:doc {collection_name}")
 
+        original_texts = collection_result.documents[0]
+        bm25_metadatas = [
+            {**meta, CHUNK_HASH_KEY: _content_hash(original_texts[idx])}
+            for idx, meta in enumerate(collection_result.metadatas[0])
+        ]
+
         bm25_texts = (
             get_enriched_texts(collection_result)
             if enable_enriched_texts
-            else collection_result.documents[0]
+            else original_texts
         )
 
         bm25_retriever = BM25Retriever.from_texts(
             texts=bm25_texts,
-            metadatas=collection_result.metadatas[0],
+            metadatas=bm25_metadatas,
         )
         bm25_retriever.k = k
 
@@ -258,18 +274,24 @@ async def query_doc_with_hybrid_search(
             top_k=k,
         )
 
+        # Use CHUNK_HASH_KEY for dedup so enriched BM25 texts don't defeat RRF
         if hybrid_bm25_weight <= 0:
             ensemble_retriever = EnsembleRetriever(
-                retrievers=[vector_search_retriever], weights=[1.0]
+                retrievers=[vector_search_retriever],
+                weights=[1.0],
+                id_key=CHUNK_HASH_KEY,
             )
         elif hybrid_bm25_weight >= 1:
             ensemble_retriever = EnsembleRetriever(
-                retrievers=[bm25_retriever], weights=[1.0]
+                retrievers=[bm25_retriever],
+                weights=[1.0],
+                id_key=CHUNK_HASH_KEY,
             )
         else:
             ensemble_retriever = EnsembleRetriever(
                 retrievers=[bm25_retriever, vector_search_retriever],
                 weights=[hybrid_bm25_weight, 1.0 - hybrid_bm25_weight],
+                id_key=CHUNK_HASH_KEY,
             )
 
         compressor = RerankCompressor(