This commit is contained in:
Timothy Jaeryang Baek
2026-02-22 17:05:39 -06:00
parent c341f97cfe
commit f651809001

View File

@@ -121,6 +121,27 @@ def has_access_to_file(
############################
def _is_text_file(file_path: str, chunk_size: int = 8192) -> bool:
"""Check if a file is likely a text file by reading a chunk and validating UTF-8.
This catches files whose extensions are mis-mapped by mimetypes/browsers
(e.g. TypeScript .ts → video/mp2t) without maintaining an extension whitelist.
"""
try:
resolved = Storage.get_file(file_path)
with open(resolved, "rb") as f:
chunk = f.read(chunk_size)
if not chunk:
return False
# Null bytes are a strong indicator of binary content
if b"\x00" in chunk:
return False
chunk.decode("utf-8")
return True
except (UnicodeDecodeError, Exception):
return False
def process_uploaded_file(
request,
file,
@@ -132,13 +153,20 @@ def process_uploaded_file(
):
def _process_handler(db_session):
try:
if file.content_type:
content_type = file.content_type
# Detect mis-labeled text files (e.g. .ts → video/mp2t)
if content_type and content_type.startswith(("image/", "video/")):
if _is_text_file(file_path):
content_type = "text/plain"
if content_type:
stt_supported_content_types = getattr(
request.app.state.config, "STT_SUPPORTED_CONTENT_TYPES", []
)
if strict_match_mime_type(
stt_supported_content_types, file.content_type
stt_supported_content_types, content_type
):
file_path_processed = Storage.get_file(file_path)
result = transcribe(
@@ -153,7 +181,7 @@ def process_uploaded_file(
user=user,
db=db_session,
)
elif (not file.content_type.startswith(("image/", "video/"))) or (
elif (not content_type.startswith(("image/", "video/"))) or (
request.app.state.config.CONTENT_EXTRACTION_ENGINE == "external"
):
process_file(
@@ -164,7 +192,7 @@ def process_uploaded_file(
)
else:
raise Exception(
f"File type {file.content_type} is not supported for processing"
f"File type {content_type} is not supported for processing"
)
else:
log.info(