diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index 8238f8a87..e5fda64a4 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -2472,6 +2472,24 @@ AUDIO_STT_MODEL = PersistentConfig( os.getenv("AUDIO_STT_MODEL", ""), ) +AUDIO_STT_AZURE_API_KEY = PersistentConfig( + "AUDIO_STT_AZURE_API_KEY", + "audio.stt.azure.api_key", + os.getenv("AUDIO_STT_AZURE_API_KEY", ""), +) + +AUDIO_STT_AZURE_REGION = PersistentConfig( + "AUDIO_STT_AZURE_REGION", + "audio.stt.azure.region", + os.getenv("AUDIO_STT_AZURE_REGION", ""), +) + +AUDIO_STT_AZURE_LOCALES = PersistentConfig( + "AUDIO_STT_AZURE_LOCALES", + "audio.stt.azure.locales", + os.getenv("AUDIO_STT_AZURE_LOCALES", ""), +) + AUDIO_TTS_OPENAI_API_BASE_URL = PersistentConfig( "AUDIO_TTS_OPENAI_API_BASE_URL", "audio.tts.openai.api_base_url", diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index c9ca059c2..5565ef6fe 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -148,6 +148,9 @@ from open_webui.config import ( AUDIO_STT_MODEL, AUDIO_STT_OPENAI_API_BASE_URL, AUDIO_STT_OPENAI_API_KEY, + AUDIO_STT_AZURE_API_KEY, + AUDIO_STT_AZURE_REGION, + AUDIO_STT_AZURE_LOCALES, AUDIO_TTS_API_KEY, AUDIO_TTS_ENGINE, AUDIO_TTS_MODEL, @@ -778,6 +781,10 @@ app.state.config.STT_MODEL = AUDIO_STT_MODEL app.state.config.WHISPER_MODEL = WHISPER_MODEL app.state.config.DEEPGRAM_API_KEY = DEEPGRAM_API_KEY +app.state.config.AUDIO_STT_AZURE_API_KEY = AUDIO_STT_AZURE_API_KEY +app.state.config.AUDIO_STT_AZURE_REGION = AUDIO_STT_AZURE_REGION +app.state.config.AUDIO_STT_AZURE_LOCALES = AUDIO_STT_AZURE_LOCALES + app.state.config.TTS_OPENAI_API_BASE_URL = AUDIO_TTS_OPENAI_API_BASE_URL app.state.config.TTS_OPENAI_API_KEY = AUDIO_TTS_OPENAI_API_KEY app.state.config.TTS_ENGINE = AUDIO_TTS_ENGINE diff --git a/backend/open_webui/routers/audio.py b/backend/open_webui/routers/audio.py index ea1372623..cbe005511 100644 --- a/backend/open_webui/routers/audio.py +++ 
b/backend/open_webui/routers/audio.py @@ -50,6 +50,8 @@ router = APIRouter() # Constants MAX_FILE_SIZE_MB = 25 MAX_FILE_SIZE = MAX_FILE_SIZE_MB * 1024 * 1024 # Convert MB to bytes +AZURE_MAX_FILE_SIZE_MB = 200 +AZURE_MAX_FILE_SIZE = AZURE_MAX_FILE_SIZE_MB * 1024 * 1024 # Convert MB to bytes log = logging.getLogger(__name__) log.setLevel(SRC_LOG_LEVELS["AUDIO"]) @@ -141,6 +143,9 @@ class STTConfigForm(BaseModel): MODEL: str WHISPER_MODEL: str DEEPGRAM_API_KEY: str + AZURE_API_KEY: str + AZURE_REGION: str + AZURE_LOCALES: str class AudioConfigUpdateForm(BaseModel): @@ -169,6 +174,9 @@ async def get_audio_config(request: Request, user=Depends(get_admin_user)): "MODEL": request.app.state.config.STT_MODEL, "WHISPER_MODEL": request.app.state.config.WHISPER_MODEL, "DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY, + "AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY, + "AZURE_REGION": request.app.state.config.AUDIO_STT_AZURE_REGION, + "AZURE_LOCALES": request.app.state.config.AUDIO_STT_AZURE_LOCALES, }, } @@ -195,6 +203,9 @@ async def update_audio_config( request.app.state.config.STT_MODEL = form_data.stt.MODEL request.app.state.config.WHISPER_MODEL = form_data.stt.WHISPER_MODEL request.app.state.config.DEEPGRAM_API_KEY = form_data.stt.DEEPGRAM_API_KEY + request.app.state.config.AUDIO_STT_AZURE_API_KEY = form_data.stt.AZURE_API_KEY + request.app.state.config.AUDIO_STT_AZURE_REGION = form_data.stt.AZURE_REGION + request.app.state.config.AUDIO_STT_AZURE_LOCALES = form_data.stt.AZURE_LOCALES if request.app.state.config.STT_ENGINE == "": request.app.state.faster_whisper_model = set_faster_whisper_model( @@ -220,6 +231,9 @@ async def update_audio_config( "MODEL": request.app.state.config.STT_MODEL, "WHISPER_MODEL": request.app.state.config.WHISPER_MODEL, "DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY, + "AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY, + "AZURE_REGION": request.app.state.config.AUDIO_STT_AZURE_REGION, 
+ "AZURE_LOCALES": request.app.state.config.AUDIO_STT_AZURE_LOCALES, }, } @@ -598,6 +612,107 @@ def transcribe(request: Request, file_path): detail = f"External: {e}" raise Exception(detail if detail else "Open WebUI: Server Connection Error") + elif request.app.state.config.STT_ENGINE == "azure": + # Check file exists and size + if not os.path.exists(file_path): + raise HTTPException( + status_code=400, + detail="Audio file not found" + ) + + # Check file size (Azure has a larger limit of 200MB) + file_size = os.path.getsize(file_path) + if file_size > AZURE_MAX_FILE_SIZE: + raise HTTPException( + status_code=400, + detail=f"File size exceeds Azure's limit of {AZURE_MAX_FILE_SIZE_MB}MB", + ) + + api_key = request.app.state.config.AUDIO_STT_AZURE_API_KEY + region = request.app.state.config.AUDIO_STT_AZURE_REGION + locales = request.app.state.config.AUDIO_STT_AZURE_LOCALES + + # IF NO LOCALES, USE DEFAULTS + if len(locales) < 2: + locales = ['en-US', 'es-ES', 'es-MX', 'fr-FR', 'hi-IN', + 'it-IT','de-DE', 'en-GB', 'en-IN', 'ja-JP', + 'ko-KR', 'pt-BR', 'zh-CN'] + locales = ','.join(locales) + + + if not api_key or not region: + raise HTTPException( + status_code=400, + detail="Azure API key and region are required for Azure STT", + ) + + r = None + try: + # Prepare the request + data = {'definition': json.dumps({ + "locales": [loc.strip() for loc in locales.split(',') if loc.strip()], # tolerate "en-US, ja-JP" and trailing commas in the admin setting + "diarization": {"maxSpeakers": 3,"enabled": True} + } if locales else {} + ) + } + url = f"https://{region}.api.cognitive.microsoft.com/speechtotext/transcriptions:transcribe?api-version=2024-11-15" + + # Use context manager to ensure file is properly closed + with open(file_path, 'rb') as audio_file: + r = requests.post( + url=url, + files={'audio': audio_file}, + data=data, + headers={ + 'Ocp-Apim-Subscription-Key': api_key, + }, + ) + + r.raise_for_status() + response = r.json() + + # Extract transcript from response + if not response.get('combinedPhrases'): + raise ValueError("No transcription found in response") + + # 
Get the full transcript from combinedPhrases + transcript = response['combinedPhrases'][0].get('text', '').strip() + if not transcript: + raise ValueError("Empty transcript in response") + + data = {"text": transcript} + + # Save transcript to json file (consistent with other providers) + transcript_file = f"{file_dir}/{id}.json" + with open(transcript_file, "w") as f: + json.dump(data, f) + + log.debug(data) + return data + + except (KeyError, IndexError, ValueError) as e: + log.exception("Error parsing Azure response") + raise HTTPException( + status_code=500, + detail=f"Failed to parse Azure response: {str(e)}", + ) + except requests.exceptions.RequestException as e: + log.exception(e) + detail = None + + try: + if r is not None and r.status_code != 200: + res = r.json() + if "error" in res: + detail = f"External: {res['error'].get('message', '')}" + except Exception: + detail = f"External: {e}" + + raise HTTPException( + status_code=r.status_code if r is not None else 500, # a 4xx/5xx Response is falsy (bool(r) == r.ok), so compare with None to keep Azure's real status + detail=detail if detail else "Open WebUI: Server Connection Error", + ) + def compress_audio(file_path): if os.path.getsize(file_path) > MAX_FILE_SIZE: diff --git a/src/lib/components/admin/Settings/Audio.svelte b/src/lib/components/admin/Settings/Audio.svelte index d36f4af6c..cda6cf03c 100644 --- a/src/lib/components/admin/Settings/Audio.svelte +++ b/src/lib/components/admin/Settings/Audio.svelte @@ -39,6 +39,9 @@ let STT_ENGINE = ''; let STT_MODEL = ''; let STT_WHISPER_MODEL = ''; + let STT_AZURE_API_KEY = ''; + let STT_AZURE_REGION = ''; + let STT_AZURE_LOCALES = ''; let STT_DEEPGRAM_API_KEY = ''; let STT_WHISPER_MODEL_LOADING = false; @@ -103,12 +106,15 @@ AZURE_SPEECH_OUTPUT_FORMAT: TTS_AZURE_SPEECH_OUTPUT_FORMAT }, stt: { - OPENAI_API_BASE_URL: STT_OPENAI_API_BASE_URL, - OPENAI_API_KEY: STT_OPENAI_API_KEY, - ENGINE: STT_ENGINE, - MODEL: STT_MODEL, - WHISPER_MODEL: STT_WHISPER_MODEL, - DEEPGRAM_API_KEY: STT_DEEPGRAM_API_KEY + OPENAI_API_BASE_URL: STT_OPENAI_API_BASE_URL, + 
OPENAI_API_KEY: STT_OPENAI_API_KEY, + ENGINE: STT_ENGINE, + MODEL: STT_MODEL, + WHISPER_MODEL: STT_WHISPER_MODEL, + DEEPGRAM_API_KEY: STT_DEEPGRAM_API_KEY, + AZURE_API_KEY: STT_AZURE_API_KEY, + AZURE_REGION: STT_AZURE_REGION, + AZURE_LOCALES: STT_AZURE_LOCALES } }); @@ -144,10 +150,13 @@ STT_OPENAI_API_BASE_URL = res.stt.OPENAI_API_BASE_URL; STT_OPENAI_API_KEY = res.stt.OPENAI_API_KEY; - + STT_ENGINE = res.stt.ENGINE; STT_MODEL = res.stt.MODEL; STT_WHISPER_MODEL = res.stt.WHISPER_MODEL; + STT_AZURE_API_KEY = res.stt.AZURE_API_KEY; + STT_AZURE_REGION = res.stt.AZURE_REGION; + STT_AZURE_LOCALES = res.stt.AZURE_LOCALES; STT_DEEPGRAM_API_KEY = res.stt.DEEPGRAM_API_KEY; } @@ -180,7 +189,8 @@ - + + @@ -248,8 +258,35 @@ + {:else if STT_ENGINE === 'azure'} +
+
+ + +
+ +
+ +
+
{$i18n.t('Language Locales')}
+
+
+ +
+
+
+
{:else if STT_ENGINE === ''} -
+
{$i18n.t('STT Model')}