Merge pull request #13379 from NoMoreFood/dev

feat: Azure STT Allow Base URL & Max Speaker Setting
This commit is contained in:
Tim Jaeryang Baek 2025-04-30 22:26:18 -07:00 committed by GitHub
commit e57f2c928a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 69 additions and 5 deletions

View File

@ -2650,6 +2650,18 @@ AUDIO_STT_AZURE_LOCALES = PersistentConfig(
os.getenv("AUDIO_STT_AZURE_LOCALES", ""),
)
# Optional override for the Azure speech-to-text endpoint URL.
# The initial value is read from the AUDIO_STT_AZURE_BASE_URL environment
# variable and is the empty string when that variable is unset.
# NOTE(review): the arguments look like (env name, config path, initial value)
# — confirm against the PersistentConfig definition.
AUDIO_STT_AZURE_BASE_URL = PersistentConfig(
"AUDIO_STT_AZURE_BASE_URL",
"audio.stt.azure.base_url",
os.getenv("AUDIO_STT_AZURE_BASE_URL", ""),
)
# Maximum number of speakers for Azure speech-to-text diarization.
# The initial value is read from the AUDIO_STT_AZURE_MAX_SPEAKERS environment
# variable and falls back to "3" when that variable is unset.
# NOTE(review): os.getenv returns a str and the fallback is the string "3",
# so this setting is text, not an int. Confirm that every consumer either
# accepts a string or converts it (and handles an empty value) before
# sending it to Azure.
AUDIO_STT_AZURE_MAX_SPEAKERS = PersistentConfig(
"AUDIO_STT_AZURE_MAX_SPEAKERS",
"audio.stt.azure.max_speakers",
os.getenv("AUDIO_STT_AZURE_MAX_SPEAKERS", "3"),
)
AUDIO_TTS_OPENAI_API_BASE_URL = PersistentConfig(
"AUDIO_TTS_OPENAI_API_BASE_URL",
"audio.tts.openai.api_base_url",

View File

@ -155,6 +155,8 @@ from open_webui.config import (
AUDIO_STT_AZURE_API_KEY,
AUDIO_STT_AZURE_REGION,
AUDIO_STT_AZURE_LOCALES,
AUDIO_STT_AZURE_BASE_URL,
AUDIO_STT_AZURE_MAX_SPEAKERS,
AUDIO_TTS_API_KEY,
AUDIO_TTS_ENGINE,
AUDIO_TTS_MODEL,
@ -829,6 +831,8 @@ app.state.config.DEEPGRAM_API_KEY = DEEPGRAM_API_KEY
app.state.config.AUDIO_STT_AZURE_API_KEY = AUDIO_STT_AZURE_API_KEY
app.state.config.AUDIO_STT_AZURE_REGION = AUDIO_STT_AZURE_REGION
app.state.config.AUDIO_STT_AZURE_LOCALES = AUDIO_STT_AZURE_LOCALES
app.state.config.AUDIO_STT_AZURE_BASE_URL = AUDIO_STT_AZURE_BASE_URL
app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS = AUDIO_STT_AZURE_MAX_SPEAKERS
app.state.config.TTS_OPENAI_API_BASE_URL = AUDIO_TTS_OPENAI_API_BASE_URL
app.state.config.TTS_OPENAI_API_KEY = AUDIO_TTS_OPENAI_API_KEY

View File

@ -150,7 +150,8 @@ class STTConfigForm(BaseModel):
AZURE_API_KEY: str
AZURE_REGION: str
AZURE_LOCALES: str
AZURE_BASE_URL: str
AZURE_MAX_SPEAKERS: str
class AudioConfigUpdateForm(BaseModel):
tts: TTSConfigForm
@ -181,6 +182,8 @@ async def get_audio_config(request: Request, user=Depends(get_admin_user)):
"AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
"AZURE_REGION": request.app.state.config.AUDIO_STT_AZURE_REGION,
"AZURE_LOCALES": request.app.state.config.AUDIO_STT_AZURE_LOCALES,
"AZURE_BASE_URL": request.app.state.config.AUDIO_STT_AZURE_BASE_URL,
"AZURE_MAX_SPEAKERS": request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS,
},
}
@ -210,6 +213,8 @@ async def update_audio_config(
request.app.state.config.AUDIO_STT_AZURE_API_KEY = form_data.stt.AZURE_API_KEY
request.app.state.config.AUDIO_STT_AZURE_REGION = form_data.stt.AZURE_REGION
request.app.state.config.AUDIO_STT_AZURE_LOCALES = form_data.stt.AZURE_LOCALES
request.app.state.config.AUDIO_STT_AZURE_BASE_URL = form_data.stt.AZURE_BASE_URL
request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS = form_data.stt.AZURE_MAX_SPEAKERS
if request.app.state.config.STT_ENGINE == "":
request.app.state.faster_whisper_model = set_faster_whisper_model(
@ -238,6 +243,8 @@ async def update_audio_config(
"AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
"AZURE_REGION": request.app.state.config.AUDIO_STT_AZURE_REGION,
"AZURE_LOCALES": request.app.state.config.AUDIO_STT_AZURE_LOCALES,
"AZURE_BASE_URL": request.app.state.config.AUDIO_STT_AZURE_BASE_URL,
"AZURE_MAX_SPEAKERS": request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS,
},
}
@ -641,6 +648,8 @@ def transcribe(request: Request, file_path):
api_key = request.app.state.config.AUDIO_STT_AZURE_API_KEY
region = request.app.state.config.AUDIO_STT_AZURE_REGION
locales = request.app.state.config.AUDIO_STT_AZURE_LOCALES
base_url = request.app.state.config.AUDIO_STT_AZURE_BASE_URL
max_speakers = request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS
# IF NO LOCALES, USE DEFAULTS
if len(locales) < 2:
@ -664,7 +673,13 @@ def transcribe(request: Request, file_path):
if not api_key or not region:
raise HTTPException(
status_code=400,
detail="Azure API key and region are required for Azure STT",
detail="Azure API key is required for Azure STT",
)
if not base_url and not region:
raise HTTPException(
status_code=400,
detail="Azure region or base url is required for Azure STT",
)
r = None
@ -674,13 +689,14 @@ def transcribe(request: Request, file_path):
"definition": json.dumps(
{
"locales": locales.split(","),
"diarization": {"maxSpeakers": 3, "enabled": True},
"diarization": {"maxSpeakers": max_speakers, "enabled": True},
}
if locales
else {}
)
}
url = f"https://{region}.api.cognitive.microsoft.com/speechtotext/transcriptions:transcribe?api-version=2024-11-15"
url = base_url or f"https://{region}.api.cognitive.microsoft.com/speechtotext/transcriptions:transcribe?api-version=2024-11-15"
# Use context manager to ensure file is properly closed
with open(file_path, "rb") as audio_file:

View File

@ -42,6 +42,8 @@
let STT_AZURE_API_KEY = '';
let STT_AZURE_REGION = '';
let STT_AZURE_LOCALES = '';
let STT_AZURE_BASE_URL = '';
let STT_AZURE_MAX_SPEAKERS = '';
let STT_DEEPGRAM_API_KEY = '';
let STT_WHISPER_MODEL_LOADING = false;
@ -114,7 +116,9 @@
DEEPGRAM_API_KEY: STT_DEEPGRAM_API_KEY,
AZURE_API_KEY: STT_AZURE_API_KEY,
AZURE_REGION: STT_AZURE_REGION,
AZURE_LOCALES: STT_AZURE_LOCALES
AZURE_LOCALES: STT_AZURE_LOCALES,
AZURE_BASE_URL: STT_AZURE_BASE_URL,
AZURE_MAX_SPEAKERS: STT_AZURE_MAX_SPEAKERS
}
});
@ -157,6 +161,8 @@
STT_AZURE_API_KEY = res.stt.AZURE_API_KEY;
STT_AZURE_REGION = res.stt.AZURE_REGION;
STT_AZURE_LOCALES = res.stt.AZURE_LOCALES;
STT_AZURE_BASE_URL = res.stt.AZURE_BASE_URL;
STT_AZURE_MAX_SPEAKERS = res.stt.AZURE_MAX_SPEAKERS;
STT_DEEPGRAM_API_KEY = res.stt.DEEPGRAM_API_KEY;
}
@ -287,6 +293,32 @@
/>
</div>
</div>
</div>
<div>
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Base URL')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
bind:value={STT_AZURE_BASE_URL}
placeholder={$i18n.t('(leave blank for Azure Commercial URL auto-generation)')}
/>
</div>
</div>
</div>
<div>
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Max Speakers')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
bind:value={STT_AZURE_MAX_SPEAKERS}
placeholder={$i18n.t('e.g., 3, 4, 5 (leave blank for default)')}
/>
</div>
</div>
</div>
</div>
{:else if STT_ENGINE === ''}