Merge pull request #9242 from NachoNoCheese/dev

feat: Add Deepgram STT Support
2025-12-16 03:47:49 +01:00 · 2025-02-07 23:06:57 -08:00
parent 79c0b45543 a004bf2a6d
commit 8c2164928a
4 changed files with 108 additions and 1 deletions
--- a/backend/open_webui/routers/audio.py
+++ b/backend/open_webui/routers/audio.py
@@ -11,6 +11,7 @@ from pydub.silence import split_on_silence
 import aiohttp
 import aiofiles
 import requests
+import mimetypes

 from fastapi import (
    Depends,
@@ -138,6 +139,7 @@ class STTConfigForm(BaseModel):
    ENGINE: str
    MODEL: str
    WHISPER_MODEL: str
+    DEEPGRAM_API_KEY: str


 class AudioConfigUpdateForm(BaseModel):
@@ -165,6 +167,7 @@ async def get_audio_config(request: Request, user=Depends(get_admin_user)):
            "ENGINE": request.app.state.config.STT_ENGINE,
            "MODEL": request.app.state.config.STT_MODEL,
            "WHISPER_MODEL": request.app.state.config.WHISPER_MODEL,
+            "DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY,
        },
    }

@@ -190,6 +193,7 @@ async def update_audio_config(
    request.app.state.config.STT_ENGINE = form_data.stt.ENGINE
    request.app.state.config.STT_MODEL = form_data.stt.MODEL
    request.app.state.config.WHISPER_MODEL = form_data.stt.WHISPER_MODEL
+    request.app.state.config.DEEPGRAM_API_KEY = form_data.stt.DEEPGRAM_API_KEY

    if request.app.state.config.STT_ENGINE == "":
        request.app.state.faster_whisper_model = set_faster_whisper_model(
@@ -214,6 +218,7 @@ async def update_audio_config(
            "ENGINE": request.app.state.config.STT_ENGINE,
            "MODEL": request.app.state.config.STT_MODEL,
            "WHISPER_MODEL": request.app.state.config.WHISPER_MODEL,
+            "DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY,
        },
    }

@@ -521,6 +526,65 @@ def transcribe(request: Request, file_path):

            raise Exception(detail if detail else "Open WebUI: Server Connection Error")

+    elif request.app.state.config.STT_ENGINE == "deepgram":
+        try:
+            # Determine the MIME type of the file
+            mime, _ = mimetypes.guess_type(file_path)
+            if not mime:
+                mime = "audio/wav"  # fallback to wav if undetectable
+
+            # Read the audio file
+            with open(file_path, "rb") as f:
+                file_data = f.read()
+
+            # Build headers and parameters
+            headers = {
+                "Authorization": f"Token {request.app.state.config.DEEPGRAM_API_KEY}",
+                "Content-Type": mime,
+            }
+
+            # Add model if specified
+            params = {}
+            if request.app.state.config.STT_MODEL:
+                params["model"] = request.app.state.config.STT_MODEL
+
+            # Make request to Deepgram API
+            r = requests.post(
+                "https://api.deepgram.com/v1/listen",
+                headers=headers,
+                params=params,
+                data=file_data,
+            )
+            r.raise_for_status()
+            response_data = r.json()
+
+            # Extract transcript from Deepgram response
+            try:
+                transcript = response_data["results"]["channels"][0]["alternatives"][0].get("transcript", "")
+            except (KeyError, IndexError) as e:
+                log.error(f"Malformed response from Deepgram: {str(e)}")
+                raise Exception("Failed to parse Deepgram response - unexpected response format")
+            data = {"text": transcript.strip()}
+
+            # Save transcript
+            transcript_file = f"{file_dir}/{id}.json"
+            with open(transcript_file, "w") as f:
+                json.dump(data, f)
+
+            return data
+
+        except Exception as e:
+            log.exception(e)
+            detail = None
+            if r is not None:
+                try:
+                    res = r.json()
+                    if "error" in res:
+                        detail = f"External: {res['error'].get('message', '')}"
+                except Exception:
+                    detail = f"External: {e}"
+            raise Exception(detail if detail else "Open WebUI: Server Connection Error")
+

 def compress_audio(file_path):
    if os.path.getsize(file_path) > MAX_FILE_SIZE: