Merge pull request #13540 from NoMoreFood/dev

feat: allow a custom base URL for Azure TTS
2025-06-26 18:26:48 +00:00 · 2025-05-07 00:49:57 +04:00
parent e0e151f4f4 5c055b73a3
commit 2a4dfc02a2
4 changed files with 76 additions and 37 deletions
--- a/backend/open_webui/routers/audio.py
+++ b/backend/open_webui/routers/audio.py
@@ -138,6 +138,7 @@ class TTSConfigForm(BaseModel):
    VOICE: str
    SPLIT_ON: str
    AZURE_SPEECH_REGION: str
+    AZURE_SPEECH_BASE_URL: str
    AZURE_SPEECH_OUTPUT_FORMAT: str


@@ -172,6 +173,7 @@ async def get_audio_config(request: Request, user=Depends(get_admin_user)):
            "VOICE": request.app.state.config.TTS_VOICE,
            "SPLIT_ON": request.app.state.config.TTS_SPLIT_ON,
            "AZURE_SPEECH_REGION": request.app.state.config.TTS_AZURE_SPEECH_REGION,
+            "AZURE_SPEECH_BASE_URL": request.app.state.config.TTS_AZURE_SPEECH_BASE_URL,
            "AZURE_SPEECH_OUTPUT_FORMAT": request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT,
        },
        "stt": {
@@ -202,6 +204,9 @@ async def update_audio_config(
    request.app.state.config.TTS_VOICE = form_data.tts.VOICE
    request.app.state.config.TTS_SPLIT_ON = form_data.tts.SPLIT_ON
    request.app.state.config.TTS_AZURE_SPEECH_REGION = form_data.tts.AZURE_SPEECH_REGION
+    request.app.state.config.TTS_AZURE_SPEECH_BASE_URL = (
+        form_data.tts.AZURE_SPEECH_BASE_URL
+    )
    request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT = (
        form_data.tts.AZURE_SPEECH_OUTPUT_FORMAT
    )
@@ -235,6 +240,7 @@ async def update_audio_config(
            "VOICE": request.app.state.config.TTS_VOICE,
            "SPLIT_ON": request.app.state.config.TTS_SPLIT_ON,
            "AZURE_SPEECH_REGION": request.app.state.config.TTS_AZURE_SPEECH_REGION,
+            "AZURE_SPEECH_BASE_URL": request.app.state.config.TTS_AZURE_SPEECH_BASE_URL,
            "AZURE_SPEECH_OUTPUT_FORMAT": request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT,
        },
        "stt": {
@@ -406,7 +412,8 @@ async def speech(request: Request, user=Depends(get_verified_user)):
            log.exception(e)
            raise HTTPException(status_code=400, detail="Invalid JSON payload")

-        region = request.app.state.config.TTS_AZURE_SPEECH_REGION
+        region = request.app.state.config.TTS_AZURE_SPEECH_REGION or "eastus"
+        base_url = request.app.state.config.TTS_AZURE_SPEECH_BASE_URL
        language = request.app.state.config.TTS_VOICE
        locale = "-".join(request.app.state.config.TTS_VOICE.split("-")[:1])
        output_format = request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT
@@ -420,7 +427,8 @@ async def speech(request: Request, user=Depends(get_verified_user)):
                timeout=timeout, trust_env=True
            ) as session:
                async with session.post(
-                    f"https://{region}.tts.speech.microsoft.com/cognitiveservices/v1",
+                    (base_url or f"https://{region}.tts.speech.microsoft.com")
+                    + "/cognitiveservices/v1",
                    headers={
                        "Ocp-Apim-Subscription-Key": request.app.state.config.TTS_API_KEY,
                        "Content-Type": "application/ssml+xml",
@@ -651,10 +659,10 @@ def transcribe(request: Request, file_path):
            )

        api_key = request.app.state.config.AUDIO_STT_AZURE_API_KEY
-        region = request.app.state.config.AUDIO_STT_AZURE_REGION
+        region = request.app.state.config.AUDIO_STT_AZURE_REGION or "eastus"
        locales = request.app.state.config.AUDIO_STT_AZURE_LOCALES
        base_url = request.app.state.config.AUDIO_STT_AZURE_BASE_URL
-        max_speakers = request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS
+        max_speakers = request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS or 3

        # IF NO LOCALES, USE DEFAULTS
        if len(locales) < 2:
@@ -681,12 +689,6 @@ def transcribe(request: Request, file_path):
                detail="Azure API key is required for Azure STT",
            )

-        if not base_url and not region:
-            raise HTTPException(
-                status_code=400,
-                detail="Azure region or base url is required for Azure STT",
-            )
-
        r = None
        try:
            # Prepare the request
@@ -702,9 +704,8 @@ def transcribe(request: Request, file_path):
            }

            url = (
-                base_url
-                or f"https://{region}.api.cognitive.microsoft.com/speechtotext/transcriptions:transcribe?api-version=2024-11-15"
-            )
+                base_url or f"https://{region}.api.cognitive.microsoft.com"
+            ) + "/speechtotext/transcriptions:transcribe?api-version=2024-11-15"

            # Use context manager to ensure file is properly closed
            with open(file_path, "rb") as audio_file:
@@ -939,7 +940,10 @@ def get_available_voices(request) -> dict:
    elif request.app.state.config.TTS_ENGINE == "azure":
        try:
            region = request.app.state.config.TTS_AZURE_SPEECH_REGION
-            url = f"https://{region}.tts.speech.microsoft.com/cognitiveservices/voices/list"
+            base_url = request.app.state.config.TTS_AZURE_SPEECH_BASE_URL
+            url = (
+                base_url or f"https://{region}.tts.speech.microsoft.com"
+            ) + "/cognitiveservices/voices/list"
            headers = {
                "Ocp-Apim-Subscription-Key": request.app.state.config.TTS_API_KEY
            }