diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index 01f91ec64..3da1334b2 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -2012,6 +2012,12 @@ WHISPER_MODEL_AUTO_UPDATE = ( and os.environ.get("WHISPER_MODEL_AUTO_UPDATE", "").lower() == "true" ) +# Add Deepgram configuration +DEEPGRAM_API_KEY = PersistentConfig( + "DEEPGRAM_API_KEY", + "audio.stt.deepgram.api_key", + os.getenv("DEEPGRAM_API_KEY", ""), +) AUDIO_STT_OPENAI_API_BASE_URL = PersistentConfig( "AUDIO_STT_OPENAI_API_BASE_URL", diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index 863f58dea..07b06d328 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -130,6 +130,7 @@ from open_webui.config import ( AUDIO_TTS_AZURE_SPEECH_REGION, AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT, WHISPER_MODEL, + DEEPGRAM_API_KEY, WHISPER_MODEL_AUTO_UPDATE, WHISPER_MODEL_DIR, # Retrieval @@ -611,6 +612,7 @@ app.state.config.STT_ENGINE = AUDIO_STT_ENGINE app.state.config.STT_MODEL = AUDIO_STT_MODEL app.state.config.WHISPER_MODEL = WHISPER_MODEL +app.state.config.DEEPGRAM_API_KEY = DEEPGRAM_API_KEY app.state.config.TTS_OPENAI_API_BASE_URL = AUDIO_TTS_OPENAI_API_BASE_URL app.state.config.TTS_OPENAI_API_KEY = AUDIO_TTS_OPENAI_API_KEY diff --git a/backend/open_webui/routers/audio.py b/backend/open_webui/routers/audio.py index c1b15772b..7242042e2 100644 --- a/backend/open_webui/routers/audio.py +++ b/backend/open_webui/routers/audio.py @@ -11,6 +11,7 @@ from pydub.silence import split_on_silence import aiohttp import aiofiles import requests +import mimetypes from fastapi import ( Depends, @@ -138,6 +139,7 @@ class STTConfigForm(BaseModel): ENGINE: str MODEL: str WHISPER_MODEL: str + DEEPGRAM_API_KEY: str class AudioConfigUpdateForm(BaseModel): @@ -165,6 +167,7 @@ async def get_audio_config(request: Request, user=Depends(get_admin_user)): "ENGINE": request.app.state.config.STT_ENGINE, "MODEL": request.app.state.config.STT_MODEL, "WHISPER_MODEL": request.app.state.config.WHISPER_MODEL, + "DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY, }, } @@ -190,6 +193,7 @@ async def update_audio_config( request.app.state.config.STT_ENGINE = form_data.stt.ENGINE request.app.state.config.STT_MODEL = form_data.stt.MODEL request.app.state.config.WHISPER_MODEL = form_data.stt.WHISPER_MODEL + request.app.state.config.DEEPGRAM_API_KEY = form_data.stt.DEEPGRAM_API_KEY if request.app.state.config.STT_ENGINE == "": request.app.state.faster_whisper_model = set_faster_whisper_model( @@ -214,6 +218,7 @@ async def update_audio_config( "ENGINE": request.app.state.config.STT_ENGINE, "MODEL": request.app.state.config.STT_MODEL, "WHISPER_MODEL": request.app.state.config.WHISPER_MODEL, + "DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY, }, } @@ -521,6 +526,65 @@ def transcribe(request: Request, file_path): raise Exception(detail if detail else "Open WebUI: Server Connection Error") + elif request.app.state.config.STT_ENGINE == "deepgram": + try: + # Determine the MIME type of the file + mime, _ = mimetypes.guess_type(file_path) + if not mime: + mime = "audio/wav" # fallback to wav if undetectable + + # Read the audio file + with open(file_path, "rb") as f: + file_data = f.read() + + # Build headers and parameters + headers = { + "Authorization": f"Token {request.app.state.config.DEEPGRAM_API_KEY}", + "Content-Type": mime, + } + + # Add model if specified + params = {} + if request.app.state.config.STT_MODEL: + params["model"] = request.app.state.config.STT_MODEL + + # Make request to Deepgram API + r = requests.post( + "https://api.deepgram.com/v1/listen", + headers=headers, + params=params, + data=file_data, + ) + r.raise_for_status() + response_data = r.json() + + # Extract transcript from Deepgram response + try: + transcript = response_data["results"]["channels"][0]["alternatives"][0].get("transcript", "") + except (KeyError, IndexError) as e: + log.error(f"Malformed response from Deepgram: {str(e)}") + raise Exception("Failed to parse Deepgram response - unexpected response format") + data = {"text": transcript.strip()} + + # Save transcript + transcript_file = f"{file_dir}/{id}.json" + with open(transcript_file, "w") as f: + json.dump(data, f) + + return data + + except Exception as e: + log.exception(e) + detail = None + if r is not None: + try: + res = r.json() + if "error" in res: + detail = f"External: {res['error'].get('message', '')}" + except Exception: + detail = f"External: {e}" + raise Exception(detail if detail else "Open WebUI: Server Connection Error") + def compress_audio(file_path): if os.path.getsize(file_path) > MAX_FILE_SIZE: diff --git a/src/lib/components/admin/Settings/Audio.svelte b/src/lib/components/admin/Settings/Audio.svelte index f3b7e982c..69dcb55fa 100644 --- a/src/lib/components/admin/Settings/Audio.svelte +++ b/src/lib/components/admin/Settings/Audio.svelte @@ -39,6 +39,7 @@ let STT_ENGINE = ''; let STT_MODEL = ''; let STT_WHISPER_MODEL = ''; + let STT_DEEPGRAM_API_KEY = ''; let STT_WHISPER_MODEL_LOADING = false; @@ -103,7 +104,8 @@ OPENAI_API_KEY: STT_OPENAI_API_KEY, ENGINE: STT_ENGINE, MODEL: STT_MODEL, - WHISPER_MODEL: STT_WHISPER_MODEL + WHISPER_MODEL: STT_WHISPER_MODEL, + DEEPGRAM_API_KEY: STT_DEEPGRAM_API_KEY } }); @@ -143,6 +145,7 @@ STT_ENGINE = res.stt.ENGINE; STT_MODEL = res.stt.MODEL; STT_WHISPER_MODEL = res.stt.WHISPER_MODEL; + STT_DEEPGRAM_API_KEY = res.stt.DEEPGRAM_API_KEY; } await getVoices(); @@ -173,6 +176,7 @@ + @@ -210,6 +214,37 @@ + {:else if STT_ENGINE === 'deepgram'} +