Merge pull request #13379 from NoMoreFood/dev

feat: Azure STT Allow Base URL & Max Speaker Setting
This commit is contained in:
Tim Jaeryang Baek
2025-04-30 22:26:18 -07:00
committed by GitHub
4 changed files with 69 additions and 5 deletions

View File

@@ -150,7 +150,8 @@ class STTConfigForm(BaseModel):
AZURE_API_KEY: str
AZURE_REGION: str
AZURE_LOCALES: str
AZURE_BASE_URL: str
AZURE_MAX_SPEAKERS: str
class AudioConfigUpdateForm(BaseModel):
tts: TTSConfigForm
@@ -181,6 +182,8 @@ async def get_audio_config(request: Request, user=Depends(get_admin_user)):
"AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
"AZURE_REGION": request.app.state.config.AUDIO_STT_AZURE_REGION,
"AZURE_LOCALES": request.app.state.config.AUDIO_STT_AZURE_LOCALES,
"AZURE_BASE_URL": request.app.state.config.AUDIO_STT_AZURE_BASE_URL,
"AZURE_MAX_SPEAKERS": request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS,
},
}
@@ -210,6 +213,8 @@ async def update_audio_config(
request.app.state.config.AUDIO_STT_AZURE_API_KEY = form_data.stt.AZURE_API_KEY
request.app.state.config.AUDIO_STT_AZURE_REGION = form_data.stt.AZURE_REGION
request.app.state.config.AUDIO_STT_AZURE_LOCALES = form_data.stt.AZURE_LOCALES
request.app.state.config.AUDIO_STT_AZURE_BASE_URL = form_data.stt.AZURE_BASE_URL
request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS = form_data.stt.AZURE_MAX_SPEAKERS
if request.app.state.config.STT_ENGINE == "":
request.app.state.faster_whisper_model = set_faster_whisper_model(
@@ -238,6 +243,8 @@ async def update_audio_config(
"AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
"AZURE_REGION": request.app.state.config.AUDIO_STT_AZURE_REGION,
"AZURE_LOCALES": request.app.state.config.AUDIO_STT_AZURE_LOCALES,
"AZURE_BASE_URL": request.app.state.config.AUDIO_STT_AZURE_BASE_URL,
"AZURE_MAX_SPEAKERS": request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS,
},
}
@@ -641,6 +648,8 @@ def transcribe(request: Request, file_path):
api_key = request.app.state.config.AUDIO_STT_AZURE_API_KEY
region = request.app.state.config.AUDIO_STT_AZURE_REGION
locales = request.app.state.config.AUDIO_STT_AZURE_LOCALES
base_url = request.app.state.config.AUDIO_STT_AZURE_BASE_URL
max_speakers = request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS
# IF NO LOCALES, USE DEFAULTS
if len(locales) < 2:
@@ -664,7 +673,13 @@ def transcribe(request: Request, file_path):
if not api_key or not region:
raise HTTPException(
status_code=400,
detail="Azure API key and region are required for Azure STT",
detail="Azure API key is required for Azure STT",
)
if not base_url and not region:
raise HTTPException(
status_code=400,
detail="Azure region or base url is required for Azure STT",
)
r = None
@@ -674,13 +689,14 @@ def transcribe(request: Request, file_path):
"definition": json.dumps(
{
"locales": locales.split(","),
"diarization": {"maxSpeakers": 3, "enabled": True},
"diarization": {"maxSpeakers": max_speakers, "enabled": True},
}
if locales
else {}
)
}
url = f"https://{region}.api.cognitive.microsoft.com/speechtotext/transcriptions:transcribe?api-version=2024-11-15"
url = base_url or f"https://{region}.api.cognitive.microsoft.com/speechtotext/transcriptions:transcribe?api-version=2024-11-15"
# Use context manager to ensure file is properly closed
with open(file_path, "rb") as audio_file: