feat: Adding file metadata to hybrid search (#19095)

* Added metadata to hybrid search

* And config and env plus refac

* consistency

---------

Co-authored-by: Tim Baek <tim@openwebui.com>
This commit is contained in:
Jacob Leksan
2025-11-18 15:29:07 -05:00
committed by GitHub
parent 42071cb8e8
commit 07ef295a77
5 changed files with 88 additions and 1 deletions

View File

@@ -148,6 +148,44 @@ def get_doc(collection_name: str, user: UserModel = None):
raise e
def get_enriched_texts(collection_result: GetResult) -> list[str]:
enriched_texts = []
for idx, text in enumerate(collection_result.documents[0]):
metadata = collection_result.metadatas[0][idx]
metadata_parts = [text]
# Add filename (repeat twice for extra weight in BM25 scoring)
if metadata.get("name"):
filename = metadata["name"]
filename_tokens = (
filename.replace("_", " ").replace("-", " ").replace(".", " ")
)
metadata_parts.append(
f"Filename: {filename} {filename_tokens} {filename_tokens}"
)
# Add title if available
if metadata.get("title"):
metadata_parts.append(f"Title: {metadata['title']}")
# Add document section headings if available (from markdown splitter)
if metadata.get("headings") and isinstance(metadata["headings"], list):
headings = " > ".join(str(h) for h in metadata["headings"])
metadata_parts.append(f"Section: {headings}")
# Add source URL/path if available
if metadata.get("source"):
metadata_parts.append(f"Source: {metadata['source']}")
# Add snippet for web search results
if metadata.get("snippet"):
metadata_parts.append(f"Snippet: {metadata['snippet']}")
enriched_texts.append(" ".join(metadata_parts))
return enriched_texts
def query_doc_with_hybrid_search(
collection_name: str,
collection_result: GetResult,
@@ -158,6 +196,7 @@ def query_doc_with_hybrid_search(
k_reranker: int,
r: float,
hybrid_bm25_weight: float,
enable_enriched_texts: bool = False,
) -> dict:
try:
# First check if collection_result has the required attributes
@@ -180,8 +219,14 @@ def query_doc_with_hybrid_search(
log.debug(f"query_doc_with_hybrid_search:doc {collection_name}")
bm25_texts = (
get_enriched_texts(collection_result)
if enable_enriched_texts
else collection_result.documents[0]
)
bm25_retriever = BM25Retriever.from_texts(
texts=collection_result.documents[0],
texts=bm25_texts,
metadatas=collection_result.metadatas[0],
)
bm25_retriever.k = k
@@ -397,6 +442,7 @@ def query_collection_with_hybrid_search(
k_reranker: int,
r: float,
hybrid_bm25_weight: float,
enable_enriched_texts: bool = False,
) -> dict:
results = []
error = False
@@ -431,6 +477,7 @@ def query_collection_with_hybrid_search(
k_reranker=k_reranker,
r=r,
hybrid_bm25_weight=hybrid_bm25_weight,
enable_enriched_texts=enable_enriched_texts,
)
return result, None
except Exception as e:
@@ -762,6 +809,7 @@ def get_sources_from_items(
k_reranker=k_reranker,
r=r,
hybrid_bm25_weight=hybrid_bm25_weight,
enable_enriched_texts=request.app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS,
)
except Exception as e:
log.debug(