fix: populate page_label (page number) from pageIdx/parentPageIdx in Knet retrieval metadata

This commit is contained in:
trducng
2024-10-02 06:16:20 +00:00
parent 6ac627cc29
commit 485c6a5510

View File

@@ -91,6 +91,9 @@ class KnetRetrievalPipeline(BaseFileIndexRetriever):
chunks = yaml.safe_load(response.content)
for chunk in chunks:
metadata = chunk["node"]["metadata"]
metadata["page_label"] = metadata.get(
"pageIdx", metadata.get("parentPageIdx", "")
)
metadata["type"] = metadata_translation.get(
metadata.pop("content_type", ""), ""
)