diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index a6cffeecd..e6851cfe7 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -2689,7 +2689,7 @@ AUDIO_STT_AZURE_BASE_URL = PersistentConfig( AUDIO_STT_AZURE_MAX_SPEAKERS = PersistentConfig( "AUDIO_STT_AZURE_MAX_SPEAKERS", "audio.stt.azure.max_speakers", - os.getenv("AUDIO_STT_AZURE_MAX_SPEAKERS", "3"), + os.getenv("AUDIO_STT_AZURE_MAX_SPEAKERS", ""), ) AUDIO_TTS_OPENAI_API_BASE_URL = PersistentConfig( @@ -2737,7 +2737,13 @@ AUDIO_TTS_SPLIT_ON = PersistentConfig( AUDIO_TTS_AZURE_SPEECH_REGION = PersistentConfig( "AUDIO_TTS_AZURE_SPEECH_REGION", "audio.tts.azure.speech_region", - os.getenv("AUDIO_TTS_AZURE_SPEECH_REGION", "eastus"), + os.getenv("AUDIO_TTS_AZURE_SPEECH_REGION", ""), +) + +AUDIO_TTS_AZURE_SPEECH_BASE_URL = PersistentConfig( + "AUDIO_TTS_AZURE_SPEECH_BASE_URL", + "audio.tts.azure.speech_base_url", + os.getenv("AUDIO_TTS_AZURE_SPEECH_BASE_URL", ""), ) AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT = PersistentConfig( diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index db124bedd..aa066a8c1 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -166,6 +166,7 @@ from open_webui.config import ( AUDIO_TTS_SPLIT_ON, AUDIO_TTS_VOICE, AUDIO_TTS_AZURE_SPEECH_REGION, + AUDIO_TTS_AZURE_SPEECH_BASE_URL, AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT, PLAYWRIGHT_WS_URL, PLAYWRIGHT_TIMEOUT, @@ -852,6 +853,7 @@ app.state.config.TTS_SPLIT_ON = AUDIO_TTS_SPLIT_ON app.state.config.TTS_AZURE_SPEECH_REGION = AUDIO_TTS_AZURE_SPEECH_REGION +app.state.config.TTS_AZURE_SPEECH_BASE_URL = AUDIO_TTS_AZURE_SPEECH_BASE_URL app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT = AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT diff --git a/backend/open_webui/routers/audio.py b/backend/open_webui/routers/audio.py index fba41af4c..981b41d88 100644 --- a/backend/open_webui/routers/audio.py +++ b/backend/open_webui/routers/audio.py @@ -138,6 +138,7 @@ class TTSConfigForm(BaseModel): VOICE: str SPLIT_ON: str AZURE_SPEECH_REGION: str + AZURE_SPEECH_BASE_URL: str AZURE_SPEECH_OUTPUT_FORMAT: str @@ -172,6 +173,7 @@ async def get_audio_config(request: Request, user=Depends(get_admin_user)): "VOICE": request.app.state.config.TTS_VOICE, "SPLIT_ON": request.app.state.config.TTS_SPLIT_ON, "AZURE_SPEECH_REGION": request.app.state.config.TTS_AZURE_SPEECH_REGION, + "AZURE_SPEECH_BASE_URL": request.app.state.config.TTS_AZURE_SPEECH_BASE_URL, "AZURE_SPEECH_OUTPUT_FORMAT": request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT, }, "stt": { @@ -202,6 +204,9 @@ async def update_audio_config( request.app.state.config.TTS_VOICE = form_data.tts.VOICE request.app.state.config.TTS_SPLIT_ON = form_data.tts.SPLIT_ON request.app.state.config.TTS_AZURE_SPEECH_REGION = form_data.tts.AZURE_SPEECH_REGION + request.app.state.config.TTS_AZURE_SPEECH_BASE_URL = ( + form_data.tts.AZURE_SPEECH_BASE_URL + ) request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT = ( form_data.tts.AZURE_SPEECH_OUTPUT_FORMAT ) @@ -235,6 +240,7 @@ async def update_audio_config( "VOICE": request.app.state.config.TTS_VOICE, "SPLIT_ON": request.app.state.config.TTS_SPLIT_ON, "AZURE_SPEECH_REGION": request.app.state.config.TTS_AZURE_SPEECH_REGION, + "AZURE_SPEECH_BASE_URL": request.app.state.config.TTS_AZURE_SPEECH_BASE_URL, "AZURE_SPEECH_OUTPUT_FORMAT": request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT, }, "stt": { @@ -406,7 +412,8 @@ async def speech(request: Request, user=Depends(get_verified_user)): log.exception(e) raise HTTPException(status_code=400, detail="Invalid JSON payload") - region = request.app.state.config.TTS_AZURE_SPEECH_REGION + region = request.app.state.config.TTS_AZURE_SPEECH_REGION or "eastus" + base_url = request.app.state.config.TTS_AZURE_SPEECH_BASE_URL language = request.app.state.config.TTS_VOICE locale = "-".join(request.app.state.config.TTS_VOICE.split("-")[:1]) output_format = request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT @@ -420,7 +427,8 @@ async def speech(request: Request, user=Depends(get_verified_user)): timeout=timeout, trust_env=True ) as session: async with session.post( - f"https://{region}.tts.speech.microsoft.com/cognitiveservices/v1", + (base_url or f"https://{region}.tts.speech.microsoft.com") + + "/cognitiveservices/v1", headers={ "Ocp-Apim-Subscription-Key": request.app.state.config.TTS_API_KEY, "Content-Type": "application/ssml+xml", @@ -651,10 +659,10 @@ def transcribe(request: Request, file_path): ) api_key = request.app.state.config.AUDIO_STT_AZURE_API_KEY - region = request.app.state.config.AUDIO_STT_AZURE_REGION + region = request.app.state.config.AUDIO_STT_AZURE_REGION or "eastus" locales = request.app.state.config.AUDIO_STT_AZURE_LOCALES base_url = request.app.state.config.AUDIO_STT_AZURE_BASE_URL - max_speakers = request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS + max_speakers = request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS or 3 # IF NO LOCALES, USE DEFAULTS if len(locales) < 2: @@ -681,12 +689,6 @@ def transcribe(request: Request, file_path): detail="Azure API key is required for Azure STT", ) - if not base_url and not region: - raise HTTPException( - status_code=400, - detail="Azure region or base url is required for Azure STT", - ) - r = None try: # Prepare the request @@ -702,9 +704,8 @@ def transcribe(request: Request, file_path): } url = ( - base_url - or f"https://{region}.api.cognitive.microsoft.com/speechtotext/transcriptions:transcribe?api-version=2024-11-15" - ) + base_url or f"https://{region}.api.cognitive.microsoft.com" + ) + "/speechtotext/transcriptions:transcribe?api-version=2024-11-15" # Use context manager to ensure file is properly closed with open(file_path, "rb") as audio_file: @@ -933,7 +934,10 @@ def get_available_voices(request) -> dict: elif request.app.state.config.TTS_ENGINE == "azure": try: region = request.app.state.config.TTS_AZURE_SPEECH_REGION - url = f"https://{region}.tts.speech.microsoft.com/cognitiveservices/voices/list" + base_url = request.app.state.config.TTS_AZURE_SPEECH_BASE_URL + url = ( + base_url or f"https://{region}.tts.speech.microsoft.com" + ) + "/cognitiveservices/voices/list" headers = { "Ocp-Apim-Subscription-Key": request.app.state.config.TTS_API_KEY } diff --git a/src/lib/components/admin/Settings/Audio.svelte b/src/lib/components/admin/Settings/Audio.svelte index 070ed9b69..60cb00700 100644 --- a/src/lib/components/admin/Settings/Audio.svelte +++ b/src/lib/components/admin/Settings/Audio.svelte @@ -32,6 +32,7 @@ let TTS_VOICE = ''; let TTS_SPLIT_ON: TTS_RESPONSE_SPLIT = TTS_RESPONSE_SPLIT.PUNCTUATION; let TTS_AZURE_SPEECH_REGION = ''; + let TTS_AZURE_SPEECH_BASE_URL = ''; let TTS_AZURE_SPEECH_OUTPUT_FORMAT = ''; let STT_OPENAI_API_BASE_URL = ''; @@ -105,6 +106,7 @@ VOICE: TTS_VOICE, SPLIT_ON: TTS_SPLIT_ON, AZURE_SPEECH_REGION: TTS_AZURE_SPEECH_REGION, + AZURE_SPEECH_BASE_URL: TTS_AZURE_SPEECH_BASE_URL, AZURE_SPEECH_OUTPUT_FORMAT: TTS_AZURE_SPEECH_OUTPUT_FORMAT }, stt: { @@ -149,8 +151,9 @@ TTS_SPLIT_ON = res.tts.SPLIT_ON || TTS_RESPONSE_SPLIT.PUNCTUATION; - TTS_AZURE_SPEECH_OUTPUT_FORMAT = res.tts.AZURE_SPEECH_OUTPUT_FORMAT; TTS_AZURE_SPEECH_REGION = res.tts.AZURE_SPEECH_REGION; + TTS_AZURE_SPEECH_BASE_URL = res.tts.AZURE_SPEECH_BASE_URL; + TTS_AZURE_SPEECH_OUTPUT_FORMAT = res.tts.AZURE_SPEECH_OUTPUT_FORMAT; STT_OPENAI_API_BASE_URL = res.stt.OPENAI_API_BASE_URL; STT_OPENAI_API_KEY = res.stt.OPENAI_API_KEY; @@ -272,16 +275,23 @@ bind:value={STT_AZURE_API_KEY} required /> -