diff --git a/backend/open_webui/routers/files.py b/backend/open_webui/routers/files.py index 9022ca66ff..5fbbbf3cc4 100644 --- a/backend/open_webui/routers/files.py +++ b/backend/open_webui/routers/files.py @@ -121,6 +121,27 @@ def has_access_to_file( ############################ +def _is_text_file(file_path: str, chunk_size: int = 8192) -> bool: + """Check if a file is likely a text file by reading a chunk and validating UTF-8. + + This catches files whose extensions are mis-mapped by mimetypes/browsers + (e.g. TypeScript .ts → video/mp2t) without maintaining an extension whitelist. + """ + try: + resolved = Storage.get_file(file_path) + with open(resolved, "rb") as f: + chunk = f.read(chunk_size) + if not chunk: + return False + # Null bytes are a strong indicator of binary content + if b"\x00" in chunk: + return False + chunk.decode("utf-8") + return True + except (UnicodeDecodeError, Exception): + return False + + def process_uploaded_file( request, file, @@ -132,13 +153,20 @@ def process_uploaded_file( ): def _process_handler(db_session): try: - if file.content_type: + content_type = file.content_type + + # Detect mis-labeled text files (e.g. .ts → video/mp2t) + if content_type and content_type.startswith(("image/", "video/")): + if _is_text_file(file_path): + content_type = "text/plain" + + if content_type: stt_supported_content_types = getattr( request.app.state.config, "STT_SUPPORTED_CONTENT_TYPES", [] ) if strict_match_mime_type( - stt_supported_content_types, file.content_type + stt_supported_content_types, content_type ): file_path_processed = Storage.get_file(file_path) result = transcribe( @@ -153,7 +181,7 @@ def process_uploaded_file( user=user, db=db_session, ) - elif (not file.content_type.startswith(("image/", "video/"))) or ( + elif (not content_type.startswith(("image/", "video/"))) or ( request.app.state.config.CONTENT_EXTRACTION_ENGINE == "external" ): process_file( @@ -164,7 +192,7 @@ def process_uploaded_file( ) else: raise Exception( - f"File type {file.content_type} is not supported for processing" + f"File type {content_type} is not supported for processing" ) else: log.info(