mirror of
https://github.com/open-webui/open-webui.git
synced 2025-12-16 20:07:49 +01:00
feat: Adding file metadata to hybrid search (#19095)
* Added metadata to hybrid search * And config and env plus refac * consistency --------- Co-authored-by: Tim Baek <tim@openwebui.com>
This commit is contained in:
@@ -148,6 +148,44 @@ def get_doc(collection_name: str, user: UserModel = None):
|
||||
raise e
|
||||
|
||||
|
||||
def get_enriched_texts(collection_result: GetResult) -> list[str]:
|
||||
enriched_texts = []
|
||||
for idx, text in enumerate(collection_result.documents[0]):
|
||||
metadata = collection_result.metadatas[0][idx]
|
||||
metadata_parts = [text]
|
||||
|
||||
# Add filename (repeat twice for extra weight in BM25 scoring)
|
||||
if metadata.get("name"):
|
||||
filename = metadata["name"]
|
||||
filename_tokens = (
|
||||
filename.replace("_", " ").replace("-", " ").replace(".", " ")
|
||||
)
|
||||
metadata_parts.append(
|
||||
f"Filename: {filename} {filename_tokens} {filename_tokens}"
|
||||
)
|
||||
|
||||
# Add title if available
|
||||
if metadata.get("title"):
|
||||
metadata_parts.append(f"Title: {metadata['title']}")
|
||||
|
||||
# Add document section headings if available (from markdown splitter)
|
||||
if metadata.get("headings") and isinstance(metadata["headings"], list):
|
||||
headings = " > ".join(str(h) for h in metadata["headings"])
|
||||
metadata_parts.append(f"Section: {headings}")
|
||||
|
||||
# Add source URL/path if available
|
||||
if metadata.get("source"):
|
||||
metadata_parts.append(f"Source: {metadata['source']}")
|
||||
|
||||
# Add snippet for web search results
|
||||
if metadata.get("snippet"):
|
||||
metadata_parts.append(f"Snippet: {metadata['snippet']}")
|
||||
|
||||
enriched_texts.append(" ".join(metadata_parts))
|
||||
|
||||
return enriched_texts
|
||||
|
||||
|
||||
def query_doc_with_hybrid_search(
|
||||
collection_name: str,
|
||||
collection_result: GetResult,
|
||||
@@ -158,6 +196,7 @@ def query_doc_with_hybrid_search(
|
||||
k_reranker: int,
|
||||
r: float,
|
||||
hybrid_bm25_weight: float,
|
||||
enable_enriched_texts: bool = False,
|
||||
) -> dict:
|
||||
try:
|
||||
# First check if collection_result has the required attributes
|
||||
@@ -180,8 +219,14 @@ def query_doc_with_hybrid_search(
|
||||
|
||||
log.debug(f"query_doc_with_hybrid_search:doc {collection_name}")
|
||||
|
||||
bm25_texts = (
|
||||
get_enriched_texts(collection_result)
|
||||
if enable_enriched_texts
|
||||
else collection_result.documents[0]
|
||||
)
|
||||
|
||||
bm25_retriever = BM25Retriever.from_texts(
|
||||
texts=collection_result.documents[0],
|
||||
texts=bm25_texts,
|
||||
metadatas=collection_result.metadatas[0],
|
||||
)
|
||||
bm25_retriever.k = k
|
||||
@@ -397,6 +442,7 @@ def query_collection_with_hybrid_search(
|
||||
k_reranker: int,
|
||||
r: float,
|
||||
hybrid_bm25_weight: float,
|
||||
enable_enriched_texts: bool = False,
|
||||
) -> dict:
|
||||
results = []
|
||||
error = False
|
||||
@@ -431,6 +477,7 @@ def query_collection_with_hybrid_search(
|
||||
k_reranker=k_reranker,
|
||||
r=r,
|
||||
hybrid_bm25_weight=hybrid_bm25_weight,
|
||||
enable_enriched_texts=enable_enriched_texts,
|
||||
)
|
||||
return result, None
|
||||
except Exception as e:
|
||||
@@ -762,6 +809,7 @@ def get_sources_from_items(
|
||||
k_reranker=k_reranker,
|
||||
r=r,
|
||||
hybrid_bm25_weight=hybrid_bm25_weight,
|
||||
enable_enriched_texts=request.app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS,
|
||||
)
|
||||
except Exception as e:
|
||||
log.debug(
|
||||
|
||||
Reference in New Issue
Block a user