Files

29 lines
889 B
Python
Raw Permalink Normal View History

2025-07-31 17:45:06 +04:00
from datetime import datetime
from typing import Any
2025-09-28 20:17:27 -05:00
# Metadata keys that are dropped (treated as large/redundant fields) when
# metadata dicts are filtered or processed in this module.
KEYS_TO_EXCLUDE = [
    "content",
    "pages",
    "tables",
    "paragraphs",
    "sections",
    "figures",
]
2025-07-31 17:45:06 +04:00
2025-09-28 20:17:27 -05:00
def filter_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
    """Return a new dict without the large/redundant metadata fields.

    Args:
        metadata: Mapping of metadata keys to values. It is not modified.

    Returns:
        A new dict holding every item of ``metadata`` whose key is not
        listed in ``KEYS_TO_EXCLUDE``. The copy is shallow: values are the
        same objects as in the input.
    """
    return {
        key: value
        for key, value in metadata.items()
        if key not in KEYS_TO_EXCLUDE
    }
def process_metadata(
    metadata: dict[str, Any],
) -> dict[str, Any]:
    """Drop large fields and stringify datetime, list and dict values.

    Args:
        metadata: Mapping of metadata keys to values. It is not modified.

    Returns:
        A new dict in which keys listed in ``KEYS_TO_EXCLUDE`` are omitted,
        values that are ``datetime``, ``list`` or ``dict`` instances are
        replaced by ``str(value)``, and every other value is passed through
        unchanged.
    """
    return {
        # str() yields the Python text form of the value (e.g. "[1, 2]"),
        # not a JSON encoding.
        key: str(value) if isinstance(value, (datetime, list, dict)) else value
        for key, value in metadata.items()
        if key not in KEYS_TO_EXCLUDE  # skip large fields
    }