diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index 4d9b96215..7dc4be9f0 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -2650,6 +2650,18 @@ AUDIO_STT_AZURE_LOCALES = PersistentConfig( os.getenv("AUDIO_STT_AZURE_LOCALES", ""), ) +AUDIO_STT_AZURE_BASE_URL = PersistentConfig( + "AUDIO_STT_AZURE_BASE_URL", + "audio.stt.azure.base_url", + os.getenv("AUDIO_STT_AZURE_BASE_URL", ""), +) + +AUDIO_STT_AZURE_MAX_SPEAKERS = PersistentConfig( + "AUDIO_STT_AZURE_MAX_SPEAKERS", + "audio.stt.azure.max_speakers", + os.getenv("AUDIO_STT_AZURE_MAX_SPEAKERS", "3"), +) + AUDIO_TTS_OPENAI_API_BASE_URL = PersistentConfig( "AUDIO_TTS_OPENAI_API_BASE_URL", "audio.tts.openai.api_base_url", diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index 27cdb8691..862f554f4 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -155,6 +155,8 @@ from open_webui.config import ( AUDIO_STT_AZURE_API_KEY, AUDIO_STT_AZURE_REGION, AUDIO_STT_AZURE_LOCALES, + AUDIO_STT_AZURE_BASE_URL, + AUDIO_STT_AZURE_MAX_SPEAKERS, AUDIO_TTS_API_KEY, AUDIO_TTS_ENGINE, AUDIO_TTS_MODEL, @@ -829,6 +831,8 @@ app.state.config.DEEPGRAM_API_KEY = DEEPGRAM_API_KEY app.state.config.AUDIO_STT_AZURE_API_KEY = AUDIO_STT_AZURE_API_KEY app.state.config.AUDIO_STT_AZURE_REGION = AUDIO_STT_AZURE_REGION app.state.config.AUDIO_STT_AZURE_LOCALES = AUDIO_STT_AZURE_LOCALES +app.state.config.AUDIO_STT_AZURE_BASE_URL = AUDIO_STT_AZURE_BASE_URL +app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS = AUDIO_STT_AZURE_MAX_SPEAKERS app.state.config.TTS_OPENAI_API_BASE_URL = AUDIO_TTS_OPENAI_API_BASE_URL app.state.config.TTS_OPENAI_API_KEY = AUDIO_TTS_OPENAI_API_KEY diff --git a/backend/open_webui/routers/audio.py b/backend/open_webui/routers/audio.py index da51d1ecf..e4d945dc1 100644 --- a/backend/open_webui/routers/audio.py +++ b/backend/open_webui/routers/audio.py @@ -150,7 +150,8 @@ class STTConfigForm(BaseModel): AZURE_API_KEY: str AZURE_REGION: str 
AZURE_LOCALES: str - + AZURE_BASE_URL: str + AZURE_MAX_SPEAKERS: str class AudioConfigUpdateForm(BaseModel): tts: TTSConfigForm @@ -181,6 +182,8 @@ async def get_audio_config(request: Request, user=Depends(get_admin_user)): "AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY, "AZURE_REGION": request.app.state.config.AUDIO_STT_AZURE_REGION, "AZURE_LOCALES": request.app.state.config.AUDIO_STT_AZURE_LOCALES, + "AZURE_BASE_URL": request.app.state.config.AUDIO_STT_AZURE_BASE_URL, + "AZURE_MAX_SPEAKERS": request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS, }, } @@ -210,6 +213,8 @@ async def update_audio_config( request.app.state.config.AUDIO_STT_AZURE_API_KEY = form_data.stt.AZURE_API_KEY request.app.state.config.AUDIO_STT_AZURE_REGION = form_data.stt.AZURE_REGION request.app.state.config.AUDIO_STT_AZURE_LOCALES = form_data.stt.AZURE_LOCALES + request.app.state.config.AUDIO_STT_AZURE_BASE_URL = form_data.stt.AZURE_BASE_URL + request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS = form_data.stt.AZURE_MAX_SPEAKERS if request.app.state.config.STT_ENGINE == "": request.app.state.faster_whisper_model = set_faster_whisper_model( @@ -238,6 +243,8 @@ async def update_audio_config( "AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY, "AZURE_REGION": request.app.state.config.AUDIO_STT_AZURE_REGION, "AZURE_LOCALES": request.app.state.config.AUDIO_STT_AZURE_LOCALES, + "AZURE_BASE_URL": request.app.state.config.AUDIO_STT_AZURE_BASE_URL, + "AZURE_MAX_SPEAKERS": request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS, }, } @@ -641,6 +648,8 @@ def transcribe(request: Request, file_path): api_key = request.app.state.config.AUDIO_STT_AZURE_API_KEY region = request.app.state.config.AUDIO_STT_AZURE_REGION locales = request.app.state.config.AUDIO_STT_AZURE_LOCALES + base_url = request.app.state.config.AUDIO_STT_AZURE_BASE_URL + max_speakers = request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS # IF NO LOCALES, USE DEFAULTS if len(locales) < 2: @@ -664,7 +673,13 
@@ def transcribe(request: Request, file_path): - if not api_key or not region: + if not api_key:  # region is validated below together with base_url, so a base-url-only setup is accepted raise HTTPException( status_code=400, - detail="Azure API key and region are required for Azure STT", + detail="Azure API key is required for Azure STT", + ) + + if not base_url and not region: + raise HTTPException( + status_code=400, + detail="Azure region or base url is required for Azure STT", ) r = None @@ -674,13 +689,14 @@ def transcribe(request: Request, file_path): "definition": json.dumps( { "locales": locales.split(","), - "diarization": {"maxSpeakers": 3, "enabled": True}, + "diarization": {"maxSpeakers": max_speakers, "enabled": True}, } if locales else {} ) } - url = f"https://{region}.api.cognitive.microsoft.com/speechtotext/transcriptions:transcribe?api-version=2024-11-15" + + url = base_url or f"https://{region}.api.cognitive.microsoft.com/speechtotext/transcriptions:transcribe?api-version=2024-11-15" # Use context manager to ensure file is properly closed with open(file_path, "rb") as audio_file: diff --git a/src/lib/components/admin/Settings/Audio.svelte b/src/lib/components/admin/Settings/Audio.svelte index 52b874935..29c3569b5 100644 --- a/src/lib/components/admin/Settings/Audio.svelte +++ b/src/lib/components/admin/Settings/Audio.svelte @@ -42,6 +42,8 @@ let STT_AZURE_API_KEY = ''; let STT_AZURE_REGION = ''; let STT_AZURE_LOCALES = ''; + let STT_AZURE_BASE_URL = ''; + let STT_AZURE_MAX_SPEAKERS = ''; let STT_DEEPGRAM_API_KEY = ''; let STT_WHISPER_MODEL_LOADING = false; @@ -114,7 +116,9 @@ DEEPGRAM_API_KEY: STT_DEEPGRAM_API_KEY, AZURE_API_KEY: STT_AZURE_API_KEY, AZURE_REGION: STT_AZURE_REGION, - AZURE_LOCALES: STT_AZURE_LOCALES + AZURE_LOCALES: STT_AZURE_LOCALES, + AZURE_BASE_URL: STT_AZURE_BASE_URL, + AZURE_MAX_SPEAKERS: STT_AZURE_MAX_SPEAKERS } }); @@ -157,6 +161,8 @@ STT_AZURE_API_KEY = res.stt.AZURE_API_KEY; STT_AZURE_REGION = res.stt.AZURE_REGION; STT_AZURE_LOCALES = res.stt.AZURE_LOCALES; + STT_AZURE_BASE_URL = res.stt.AZURE_BASE_URL; + STT_AZURE_MAX_SPEAKERS = 
res.stt.AZURE_MAX_SPEAKERS; STT_DEEPGRAM_API_KEY = res.stt.DEEPGRAM_API_KEY; } @@ -287,6 +293,32 @@ /> + + +