feat: add Azure AI Speech STT provider

- Add Azure STT configuration variables for the API key, region, and locales
- Implement Azure STT transcription endpoint with 200MB file size limit
- Update Audio settings UI to include Azure STT configuration fields
- Handle Azure API responses and error cases consistently
This commit is contained in:
priten 2025-04-10 15:38:59 -05:00
parent 63533c9e3a
commit 9a55257c5b
4 changed files with 186 additions and 9 deletions

View File

@ -2472,6 +2472,24 @@ AUDIO_STT_MODEL = PersistentConfig(
os.getenv("AUDIO_STT_MODEL", ""),
)
# Azure AI Speech (STT) settings. Each PersistentConfig registers a value
# under the "audio.stt.azure.*" namespace, seeded from the environment
# variable of the same name (empty string when unset).
AUDIO_STT_AZURE_API_KEY = PersistentConfig(
    "AUDIO_STT_AZURE_API_KEY",
    "audio.stt.azure.api_key",
    os.getenv("AUDIO_STT_AZURE_API_KEY", ""),
)

# Azure region hosting the Speech resource; used as the region segment of the
# Azure endpoint URL. Required together with the API key for Azure STT calls.
AUDIO_STT_AZURE_REGION = PersistentConfig(
    "AUDIO_STT_AZURE_REGION",
    "audio.stt.azure.region",
    os.getenv("AUDIO_STT_AZURE_REGION", ""),
)

# Comma-separated candidate locales for transcription (e.g. "en-US,ja-JP");
# the transcription endpoint substitutes a built-in default set when empty.
AUDIO_STT_AZURE_LOCALES = PersistentConfig(
    "AUDIO_STT_AZURE_LOCALES",
    "audio.stt.azure.locales",
    os.getenv("AUDIO_STT_AZURE_LOCALES", ""),
)
AUDIO_TTS_OPENAI_API_BASE_URL = PersistentConfig(
"AUDIO_TTS_OPENAI_API_BASE_URL",
"audio.tts.openai.api_base_url",

View File

@ -148,6 +148,9 @@ from open_webui.config import (
AUDIO_STT_MODEL,
AUDIO_STT_OPENAI_API_BASE_URL,
AUDIO_STT_OPENAI_API_KEY,
AUDIO_STT_AZURE_API_KEY,
AUDIO_STT_AZURE_REGION,
AUDIO_STT_AZURE_LOCALES,
AUDIO_TTS_API_KEY,
AUDIO_TTS_ENGINE,
AUDIO_TTS_MODEL,
@ -778,6 +781,10 @@ app.state.config.STT_MODEL = AUDIO_STT_MODEL
app.state.config.WHISPER_MODEL = WHISPER_MODEL
app.state.config.DEEPGRAM_API_KEY = DEEPGRAM_API_KEY
app.state.config.AUDIO_STT_AZURE_API_KEY = AUDIO_STT_AZURE_API_KEY
app.state.config.AUDIO_STT_AZURE_REGION = AUDIO_STT_AZURE_REGION
app.state.config.AUDIO_STT_AZURE_LOCALES = AUDIO_STT_AZURE_LOCALES
app.state.config.TTS_OPENAI_API_BASE_URL = AUDIO_TTS_OPENAI_API_BASE_URL
app.state.config.TTS_OPENAI_API_KEY = AUDIO_TTS_OPENAI_API_KEY
app.state.config.TTS_ENGINE = AUDIO_TTS_ENGINE

View File

@ -50,6 +50,8 @@ router = APIRouter()
# Constants
# Upload size limit applied on the default (non-Azure) STT path.
MAX_FILE_SIZE_MB = 25
MAX_FILE_SIZE = MAX_FILE_SIZE_MB * 1024 * 1024  # Convert MB to bytes
# Azure's transcription endpoint accepts larger uploads, so it gets its own
# 200MB limit, checked separately in the azure branch of transcribe().
AZURE_MAX_FILE_SIZE_MB = 200
AZURE_MAX_FILE_SIZE = AZURE_MAX_FILE_SIZE_MB * 1024 * 1024  # Convert MB to bytes

# Module-level logger, scoped to the audio subsystem's configured log level.
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["AUDIO"])
@ -141,6 +143,9 @@ class STTConfigForm(BaseModel):
MODEL: str
WHISPER_MODEL: str
DEEPGRAM_API_KEY: str
AZURE_API_KEY: str
AZURE_REGION: str
AZURE_LOCALES: str
class AudioConfigUpdateForm(BaseModel):
@ -169,6 +174,9 @@ async def get_audio_config(request: Request, user=Depends(get_admin_user)):
"MODEL": request.app.state.config.STT_MODEL,
"WHISPER_MODEL": request.app.state.config.WHISPER_MODEL,
"DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY,
"AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
"AZURE_REGION": request.app.state.config.AUDIO_STT_AZURE_REGION,
"AZURE_LOCALES": request.app.state.config.AUDIO_STT_AZURE_LOCALES,
},
}
@ -195,6 +203,9 @@ async def update_audio_config(
request.app.state.config.STT_MODEL = form_data.stt.MODEL
request.app.state.config.WHISPER_MODEL = form_data.stt.WHISPER_MODEL
request.app.state.config.DEEPGRAM_API_KEY = form_data.stt.DEEPGRAM_API_KEY
request.app.state.config.AUDIO_STT_AZURE_API_KEY = form_data.stt.AZURE_API_KEY
request.app.state.config.AUDIO_STT_AZURE_REGION = form_data.stt.AZURE_REGION
request.app.state.config.AUDIO_STT_AZURE_LOCALES = form_data.stt.AZURE_LOCALES
if request.app.state.config.STT_ENGINE == "":
request.app.state.faster_whisper_model = set_faster_whisper_model(
@ -220,6 +231,9 @@ async def update_audio_config(
"MODEL": request.app.state.config.STT_MODEL,
"WHISPER_MODEL": request.app.state.config.WHISPER_MODEL,
"DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY,
"AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
"AZURE_REGION": request.app.state.config.AUDIO_STT_AZURE_REGION,
"AZURE_LOCALES": request.app.state.config.AUDIO_STT_AZURE_LOCALES,
},
}
@ -598,6 +612,107 @@ def transcribe(request: Request, file_path):
detail = f"External: {e}"
raise Exception(detail if detail else "Open WebUI: Server Connection Error")
elif request.app.state.config.STT_ENGINE == "azure":
# Check file exists and size
if not os.path.exists(file_path):
raise HTTPException(
status_code=400,
detail="Audio file not found"
)
# Check file size (Azure has a larger limit of 200MB)
file_size = os.path.getsize(file_path)
if file_size > AZURE_MAX_FILE_SIZE:
raise HTTPException(
status_code=400,
detail=f"File size exceeds Azure's limit of {AZURE_MAX_FILE_SIZE_MB}MB",
)
api_key = request.app.state.config.AUDIO_STT_AZURE_API_KEY
region = request.app.state.config.AUDIO_STT_AZURE_REGION
locales = request.app.state.config.AUDIO_STT_AZURE_LOCALES
# IF NO LOCALES, USE DEFAULTS
if len(locales) < 2:
locales = ['en-US', 'es-ES', 'es-MX', 'fr-FR', 'hi-IN',
'it-IT','de-DE', 'en-GB', 'en-IN', 'ja-JP',
'ko-KR', 'pt-BR', 'zh-CN']
locales = ','.join(locales)
if not api_key or not region:
raise HTTPException(
status_code=400,
detail="Azure API key and region are required for Azure STT",
)
r = None
try:
# Prepare the request
data = {'definition': json.dumps({
"locales": locales.split(','),
"diarization": {"maxSpeakers": 3,"enabled": True}
} if locales else {}
)
}
url = f"https://{region}.api.cognitive.microsoft.com/speechtotext/transcriptions:transcribe?api-version=2024-11-15"
# Use context manager to ensure file is properly closed
with open(file_path, 'rb') as audio_file:
r = requests.post(
url=url,
files={'audio': audio_file},
data=data,
headers={
'Ocp-Apim-Subscription-Key': api_key,
},
)
r.raise_for_status()
response = r.json()
# Extract transcript from response
if not response.get('combinedPhrases'):
raise ValueError("No transcription found in response")
# Get the full transcript from combinedPhrases
transcript = response['combinedPhrases'][0].get('text', '').strip()
if not transcript:
raise ValueError("Empty transcript in response")
data = {"text": transcript}
# Save transcript to json file (consistent with other providers)
transcript_file = f"{file_dir}/{id}.json"
with open(transcript_file, "w") as f:
json.dump(data, f)
log.debug(data)
return data
except (KeyError, IndexError, ValueError) as e:
log.exception("Error parsing Azure response")
raise HTTPException(
status_code=500,
detail=f"Failed to parse Azure response: {str(e)}",
)
except requests.exceptions.RequestException as e:
log.exception(e)
detail = None
try:
if r is not None and r.status_code != 200:
res = r.json()
if "error" in res:
detail = f"External: {res['error'].get('message', '')}"
except Exception:
detail = f"External: {e}"
raise HTTPException(
status_code=getattr(r, 'status_code', 500) if r else 500,
detail=detail if detail else "Open WebUI: Server Connection Error",
)
def compress_audio(file_path):
if os.path.getsize(file_path) > MAX_FILE_SIZE:

View File

@ -39,6 +39,9 @@
let STT_ENGINE = '';
let STT_MODEL = '';
let STT_WHISPER_MODEL = '';
let STT_AZURE_API_KEY = '';
let STT_AZURE_REGION = '';
let STT_AZURE_LOCALES = '';
let STT_DEEPGRAM_API_KEY = '';
let STT_WHISPER_MODEL_LOADING = false;
@ -103,12 +106,15 @@
AZURE_SPEECH_OUTPUT_FORMAT: TTS_AZURE_SPEECH_OUTPUT_FORMAT
},
stt: {
OPENAI_API_BASE_URL: STT_OPENAI_API_BASE_URL,
OPENAI_API_KEY: STT_OPENAI_API_KEY,
ENGINE: STT_ENGINE,
MODEL: STT_MODEL,
WHISPER_MODEL: STT_WHISPER_MODEL,
DEEPGRAM_API_KEY: STT_DEEPGRAM_API_KEY
OPENAI_API_BASE_URL: STT_OPENAI_API_BASE_URL,
OPENAI_API_KEY: STT_OPENAI_API_KEY,
ENGINE: STT_ENGINE,
MODEL: STT_MODEL,
WHISPER_MODEL: STT_WHISPER_MODEL,
DEEPGRAM_API_KEY: STT_DEEPGRAM_API_KEY,
AZURE_API_KEY: STT_AZURE_API_KEY,
AZURE_REGION: STT_AZURE_REGION,
AZURE_LOCALES: STT_AZURE_LOCALES
}
});
@ -144,10 +150,13 @@
STT_OPENAI_API_BASE_URL = res.stt.OPENAI_API_BASE_URL;
STT_OPENAI_API_KEY = res.stt.OPENAI_API_KEY;
STT_ENGINE = res.stt.ENGINE;
STT_MODEL = res.stt.MODEL;
STT_WHISPER_MODEL = res.stt.WHISPER_MODEL;
STT_AZURE_API_KEY = res.stt.AZURE_API_KEY;
STT_AZURE_REGION = res.stt.AZURE_REGION;
STT_AZURE_LOCALES = res.stt.AZURE_LOCALES;
STT_DEEPGRAM_API_KEY = res.stt.DEEPGRAM_API_KEY;
}
@ -180,7 +189,8 @@
<option value="openai">OpenAI</option>
<option value="web">{$i18n.t('Web API')}</option>
<option value="deepgram">Deepgram</option>
</select>
<option value="azure">Azure AI Speech</option>
</select>
</div>
</div>
@ -248,8 +258,35 @@
</a>
</div>
</div>
{:else if STT_ENGINE === 'azure'}
<div>
<div class="mt-1 flex gap-2 mb-1">
<SensitiveInput placeholder={$i18n.t('API Key')} bind:value={STT_AZURE_API_KEY} required />
<input
class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
placeholder={$i18n.t('Azure Region')}
bind:value={STT_AZURE_REGION}
required
/>
</div>
<hr class="border-gray-100 dark:border-gray-850 my-2" />
<div>
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Language Locales')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
bind:value={STT_AZURE_LOCALES}
placeholder={$i18n.t('e.g., en-US,ja-JP (leave blank for auto-detect)')}
/>
</div>
</div>
</div>
</div>
{:else if STT_ENGINE === ''}
<div>
<div>
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('STT Model')}</div>
<div class="flex w-full">