Merge pull request #13379 from NoMoreFood/dev

feat: Allow configuring the base URL and max speakers setting for Azure STT
2025-06-26 18:26:48 +00:00 · 2025-04-30 22:26:18 -07:00
parent f3b38e9777 6c8a9d000e
commit e57f2c928a
4 changed files with 69 additions and 5 deletions
--- a/backend/open_webui/routers/audio.py
+++ b/backend/open_webui/routers/audio.py
@@ -150,7 +150,8 @@ class STTConfigForm(BaseModel):
    AZURE_API_KEY: str
    AZURE_REGION: str
    AZURE_LOCALES: str
-
+    AZURE_BASE_URL: str
+    AZURE_MAX_SPEAKERS: str

 class AudioConfigUpdateForm(BaseModel):
    tts: TTSConfigForm
@@ -181,6 +182,8 @@ async def get_audio_config(request: Request, user=Depends(get_admin_user)):
            "AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
            "AZURE_REGION": request.app.state.config.AUDIO_STT_AZURE_REGION,
            "AZURE_LOCALES": request.app.state.config.AUDIO_STT_AZURE_LOCALES,
+            "AZURE_BASE_URL": request.app.state.config.AUDIO_STT_AZURE_BASE_URL,
+            "AZURE_MAX_SPEAKERS": request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS,            
        },
    }

@@ -210,6 +213,8 @@ async def update_audio_config(
    request.app.state.config.AUDIO_STT_AZURE_API_KEY = form_data.stt.AZURE_API_KEY
    request.app.state.config.AUDIO_STT_AZURE_REGION = form_data.stt.AZURE_REGION
    request.app.state.config.AUDIO_STT_AZURE_LOCALES = form_data.stt.AZURE_LOCALES
+    request.app.state.config.AUDIO_STT_AZURE_BASE_URL = form_data.stt.AZURE_BASE_URL
+    request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS = form_data.stt.AZURE_MAX_SPEAKERS

    if request.app.state.config.STT_ENGINE == "":
        request.app.state.faster_whisper_model = set_faster_whisper_model(
@@ -238,6 +243,8 @@ async def update_audio_config(
            "AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
            "AZURE_REGION": request.app.state.config.AUDIO_STT_AZURE_REGION,
            "AZURE_LOCALES": request.app.state.config.AUDIO_STT_AZURE_LOCALES,
+            "AZURE_BASE_URL": request.app.state.config.AUDIO_STT_AZURE_BASE_URL,
+            "AZURE_MAX_SPEAKERS": request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS,    
        },
    }

@@ -641,6 +648,8 @@ def transcribe(request: Request, file_path):
        api_key = request.app.state.config.AUDIO_STT_AZURE_API_KEY
        region = request.app.state.config.AUDIO_STT_AZURE_REGION
        locales = request.app.state.config.AUDIO_STT_AZURE_LOCALES
+        base_url = request.app.state.config.AUDIO_STT_AZURE_BASE_URL
+        max_speakers = request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS

        # IF NO LOCALES, USE DEFAULTS
        if len(locales) < 2:
@@ -664,7 +673,13 @@ def transcribe(request: Request, file_path):
        if not api_key or not region:
            raise HTTPException(
                status_code=400,
-                detail="Azure API key and region are required for Azure STT",
+                detail="Azure API key is required for Azure STT",
+            )
+
+        if not base_url and not region:
+            raise HTTPException(
+                status_code=400,
+                detail="Azure region or base url is required for Azure STT",
            )

        r = None
@@ -674,13 +689,14 @@ def transcribe(request: Request, file_path):
                "definition": json.dumps(
                    {
                        "locales": locales.split(","),
-                        "diarization": {"maxSpeakers": 3, "enabled": True},
+                        "diarization": {"maxSpeakers": max_speakers, "enabled": True},
                    }
                    if locales
                    else {}
                )
            }
-            url = f"https://{region}.api.cognitive.microsoft.com/speechtotext/transcriptions:transcribe?api-version=2024-11-15"
+
+            url = base_url or f"https://{region}.api.cognitive.microsoft.com/speechtotext/transcriptions:transcribe?api-version=2024-11-15"

            # Use context manager to ensure file is properly closed
            with open(file_path, "rb") as audio_file: