enh: vector db hash collision check

This commit is contained in:
Timothy J. Baek
2024-10-03 06:53:21 -07:00
parent 78413d0c2e
commit 2fc07fd6a2
3 changed files with 50 additions and 0 deletions

View File

@@ -641,6 +641,16 @@ def save_docs_to_vector_db(
) -> bool:
log.info(f"save_docs_to_vector_db {docs} {collection_name}")
# Check whether entries with the same hash (metadata["hash"]) already exist in this collection; if so, skip saving and return early
if metadata and "hash" in metadata:
existing_docs = VECTOR_DB_CLIENT.query(
collection_name=collection_name,
filter={"hash": metadata["hash"]},
)
if existing_docs:
log.info(f"Document with hash {metadata['hash']} already exists")
return True
if split:
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=app.state.config.CHUNK_SIZE,