mirror of
https://github.com/open-webui/open-webui
synced 2025-04-29 10:42:55 +00:00
feat: add Azure AI Speech STT provider
- Add Azure STT configuration variables for API key, region and locales
- Implement Azure STT transcription endpoint with 200MB file size limit
- Update Audio settings UI to include Azure STT configuration fields
- Handle Azure API responses and error cases consistently
This commit is contained in:
parent
63533c9e3a
commit
9a55257c5b
@ -2472,6 +2472,24 @@ AUDIO_STT_MODEL = PersistentConfig(
|
||||
os.getenv("AUDIO_STT_MODEL", ""),
|
||||
)
|
||||
|
||||
AUDIO_STT_AZURE_API_KEY = PersistentConfig(
|
||||
"AUDIO_STT_AZURE_API_KEY",
|
||||
"audio.stt.azure.api_key",
|
||||
os.getenv("AUDIO_STT_AZURE_API_KEY", ""),
|
||||
)
|
||||
|
||||
AUDIO_STT_AZURE_REGION = PersistentConfig(
|
||||
"AUDIO_STT_AZURE_REGION",
|
||||
"audio.stt.azure.region",
|
||||
os.getenv("AUDIO_STT_AZURE_REGION", ""),
|
||||
)
|
||||
|
||||
AUDIO_STT_AZURE_LOCALES = PersistentConfig(
|
||||
"AUDIO_STT_AZURE_LOCALES",
|
||||
"audio.stt.azure.locales",
|
||||
os.getenv("AUDIO_STT_AZURE_LOCALES", ""),
|
||||
)
|
||||
|
||||
AUDIO_TTS_OPENAI_API_BASE_URL = PersistentConfig(
|
||||
"AUDIO_TTS_OPENAI_API_BASE_URL",
|
||||
"audio.tts.openai.api_base_url",
|
||||
|
@ -148,6 +148,9 @@ from open_webui.config import (
|
||||
AUDIO_STT_MODEL,
|
||||
AUDIO_STT_OPENAI_API_BASE_URL,
|
||||
AUDIO_STT_OPENAI_API_KEY,
|
||||
AUDIO_STT_AZURE_API_KEY,
|
||||
AUDIO_STT_AZURE_REGION,
|
||||
AUDIO_STT_AZURE_LOCALES,
|
||||
AUDIO_TTS_API_KEY,
|
||||
AUDIO_TTS_ENGINE,
|
||||
AUDIO_TTS_MODEL,
|
||||
@ -778,6 +781,10 @@ app.state.config.STT_MODEL = AUDIO_STT_MODEL
|
||||
app.state.config.WHISPER_MODEL = WHISPER_MODEL
|
||||
app.state.config.DEEPGRAM_API_KEY = DEEPGRAM_API_KEY
|
||||
|
||||
app.state.config.AUDIO_STT_AZURE_API_KEY = AUDIO_STT_AZURE_API_KEY
|
||||
app.state.config.AUDIO_STT_AZURE_REGION = AUDIO_STT_AZURE_REGION
|
||||
app.state.config.AUDIO_STT_AZURE_LOCALES = AUDIO_STT_AZURE_LOCALES
|
||||
|
||||
app.state.config.TTS_OPENAI_API_BASE_URL = AUDIO_TTS_OPENAI_API_BASE_URL
|
||||
app.state.config.TTS_OPENAI_API_KEY = AUDIO_TTS_OPENAI_API_KEY
|
||||
app.state.config.TTS_ENGINE = AUDIO_TTS_ENGINE
|
||||
|
@ -50,6 +50,8 @@ router = APIRouter()
|
||||
# Constants
|
||||
MAX_FILE_SIZE_MB = 25
|
||||
MAX_FILE_SIZE = MAX_FILE_SIZE_MB * 1024 * 1024 # Convert MB to bytes
|
||||
AZURE_MAX_FILE_SIZE_MB = 200
|
||||
AZURE_MAX_FILE_SIZE = AZURE_MAX_FILE_SIZE_MB * 1024 * 1024 # Convert MB to bytes
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
log.setLevel(SRC_LOG_LEVELS["AUDIO"])
|
||||
@ -141,6 +143,9 @@ class STTConfigForm(BaseModel):
|
||||
MODEL: str
|
||||
WHISPER_MODEL: str
|
||||
DEEPGRAM_API_KEY: str
|
||||
AZURE_API_KEY: str
|
||||
AZURE_REGION: str
|
||||
AZURE_LOCALES: str
|
||||
|
||||
|
||||
class AudioConfigUpdateForm(BaseModel):
|
||||
@ -169,6 +174,9 @@ async def get_audio_config(request: Request, user=Depends(get_admin_user)):
|
||||
"MODEL": request.app.state.config.STT_MODEL,
|
||||
"WHISPER_MODEL": request.app.state.config.WHISPER_MODEL,
|
||||
"DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY,
|
||||
"AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
|
||||
"AZURE_REGION": request.app.state.config.AUDIO_STT_AZURE_REGION,
|
||||
"AZURE_LOCALES": request.app.state.config.AUDIO_STT_AZURE_LOCALES,
|
||||
},
|
||||
}
|
||||
|
||||
@ -195,6 +203,9 @@ async def update_audio_config(
|
||||
request.app.state.config.STT_MODEL = form_data.stt.MODEL
|
||||
request.app.state.config.WHISPER_MODEL = form_data.stt.WHISPER_MODEL
|
||||
request.app.state.config.DEEPGRAM_API_KEY = form_data.stt.DEEPGRAM_API_KEY
|
||||
request.app.state.config.AUDIO_STT_AZURE_API_KEY = form_data.stt.AZURE_API_KEY
|
||||
request.app.state.config.AUDIO_STT_AZURE_REGION = form_data.stt.AZURE_REGION
|
||||
request.app.state.config.AUDIO_STT_AZURE_LOCALES = form_data.stt.AZURE_LOCALES
|
||||
|
||||
if request.app.state.config.STT_ENGINE == "":
|
||||
request.app.state.faster_whisper_model = set_faster_whisper_model(
|
||||
@ -220,6 +231,9 @@ async def update_audio_config(
|
||||
"MODEL": request.app.state.config.STT_MODEL,
|
||||
"WHISPER_MODEL": request.app.state.config.WHISPER_MODEL,
|
||||
"DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY,
|
||||
"AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
|
||||
"AZURE_REGION": request.app.state.config.AUDIO_STT_AZURE_REGION,
|
||||
"AZURE_LOCALES": request.app.state.config.AUDIO_STT_AZURE_LOCALES,
|
||||
},
|
||||
}
|
||||
|
||||
@ -598,6 +612,107 @@ def transcribe(request: Request, file_path):
|
||||
detail = f"External: {e}"
|
||||
raise Exception(detail if detail else "Open WebUI: Server Connection Error")
|
||||
|
||||
elif request.app.state.config.STT_ENGINE == "azure":
|
||||
# Check file exists and size
|
||||
if not os.path.exists(file_path):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="Audio file not found"
|
||||
)
|
||||
|
||||
# Check file size (Azure has a larger limit of 200MB)
|
||||
file_size = os.path.getsize(file_path)
|
||||
if file_size > AZURE_MAX_FILE_SIZE:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"File size exceeds Azure's limit of {AZURE_MAX_FILE_SIZE_MB}MB",
|
||||
)
|
||||
|
||||
api_key = request.app.state.config.AUDIO_STT_AZURE_API_KEY
|
||||
region = request.app.state.config.AUDIO_STT_AZURE_REGION
|
||||
locales = request.app.state.config.AUDIO_STT_AZURE_LOCALES
|
||||
|
||||
# IF NO LOCALES, USE DEFAULTS
|
||||
if len(locales) < 2:
|
||||
locales = ['en-US', 'es-ES', 'es-MX', 'fr-FR', 'hi-IN',
|
||||
'it-IT','de-DE', 'en-GB', 'en-IN', 'ja-JP',
|
||||
'ko-KR', 'pt-BR', 'zh-CN']
|
||||
locales = ','.join(locales)
|
||||
|
||||
|
||||
if not api_key or not region:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="Azure API key and region are required for Azure STT",
|
||||
)
|
||||
|
||||
r = None
|
||||
try:
|
||||
# Prepare the request
|
||||
data = {'definition': json.dumps({
|
||||
"locales": locales.split(','),
|
||||
"diarization": {"maxSpeakers": 3,"enabled": True}
|
||||
} if locales else {}
|
||||
)
|
||||
}
|
||||
url = f"https://{region}.api.cognitive.microsoft.com/speechtotext/transcriptions:transcribe?api-version=2024-11-15"
|
||||
|
||||
# Use context manager to ensure file is properly closed
|
||||
with open(file_path, 'rb') as audio_file:
|
||||
r = requests.post(
|
||||
url=url,
|
||||
files={'audio': audio_file},
|
||||
data=data,
|
||||
headers={
|
||||
'Ocp-Apim-Subscription-Key': api_key,
|
||||
},
|
||||
)
|
||||
|
||||
r.raise_for_status()
|
||||
response = r.json()
|
||||
|
||||
# Extract transcript from response
|
||||
if not response.get('combinedPhrases'):
|
||||
raise ValueError("No transcription found in response")
|
||||
|
||||
# Get the full transcript from combinedPhrases
|
||||
transcript = response['combinedPhrases'][0].get('text', '').strip()
|
||||
if not transcript:
|
||||
raise ValueError("Empty transcript in response")
|
||||
|
||||
data = {"text": transcript}
|
||||
|
||||
# Save transcript to json file (consistent with other providers)
|
||||
transcript_file = f"{file_dir}/{id}.json"
|
||||
with open(transcript_file, "w") as f:
|
||||
json.dump(data, f)
|
||||
|
||||
log.debug(data)
|
||||
return data
|
||||
|
||||
except (KeyError, IndexError, ValueError) as e:
|
||||
log.exception("Error parsing Azure response")
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail=f"Failed to parse Azure response: {str(e)}",
|
||||
)
|
||||
except requests.exceptions.RequestException as e:
|
||||
log.exception(e)
|
||||
detail = None
|
||||
|
||||
try:
|
||||
if r is not None and r.status_code != 200:
|
||||
res = r.json()
|
||||
if "error" in res:
|
||||
detail = f"External: {res['error'].get('message', '')}"
|
||||
except Exception:
|
||||
detail = f"External: {e}"
|
||||
|
||||
raise HTTPException(
|
||||
status_code=getattr(r, 'status_code', 500) if r else 500,
|
||||
detail=detail if detail else "Open WebUI: Server Connection Error",
|
||||
)
|
||||
|
||||
|
||||
def compress_audio(file_path):
|
||||
if os.path.getsize(file_path) > MAX_FILE_SIZE:
|
||||
|
@ -39,6 +39,9 @@
|
||||
let STT_ENGINE = '';
|
||||
let STT_MODEL = '';
|
||||
let STT_WHISPER_MODEL = '';
|
||||
let STT_AZURE_API_KEY = '';
|
||||
let STT_AZURE_REGION = '';
|
||||
let STT_AZURE_LOCALES = '';
|
||||
let STT_DEEPGRAM_API_KEY = '';
|
||||
|
||||
let STT_WHISPER_MODEL_LOADING = false;
|
||||
@ -103,12 +106,15 @@
|
||||
AZURE_SPEECH_OUTPUT_FORMAT: TTS_AZURE_SPEECH_OUTPUT_FORMAT
|
||||
},
|
||||
stt: {
|
||||
OPENAI_API_BASE_URL: STT_OPENAI_API_BASE_URL,
|
||||
OPENAI_API_KEY: STT_OPENAI_API_KEY,
|
||||
ENGINE: STT_ENGINE,
|
||||
MODEL: STT_MODEL,
|
||||
WHISPER_MODEL: STT_WHISPER_MODEL,
|
||||
DEEPGRAM_API_KEY: STT_DEEPGRAM_API_KEY
|
||||
OPENAI_API_BASE_URL: STT_OPENAI_API_BASE_URL,
|
||||
OPENAI_API_KEY: STT_OPENAI_API_KEY,
|
||||
ENGINE: STT_ENGINE,
|
||||
MODEL: STT_MODEL,
|
||||
WHISPER_MODEL: STT_WHISPER_MODEL,
|
||||
DEEPGRAM_API_KEY: STT_DEEPGRAM_API_KEY,
|
||||
AZURE_API_KEY: STT_AZURE_API_KEY,
|
||||
AZURE_REGION: STT_AZURE_REGION,
|
||||
AZURE_LOCALES: STT_AZURE_LOCALES
|
||||
}
|
||||
});
|
||||
|
||||
@ -148,6 +154,9 @@
|
||||
STT_ENGINE = res.stt.ENGINE;
|
||||
STT_MODEL = res.stt.MODEL;
|
||||
STT_WHISPER_MODEL = res.stt.WHISPER_MODEL;
|
||||
STT_AZURE_API_KEY = res.stt.AZURE_API_KEY;
|
||||
STT_AZURE_REGION = res.stt.AZURE_REGION;
|
||||
STT_AZURE_LOCALES = res.stt.AZURE_LOCALES;
|
||||
STT_DEEPGRAM_API_KEY = res.stt.DEEPGRAM_API_KEY;
|
||||
}
|
||||
|
||||
@ -180,7 +189,8 @@
|
||||
<option value="openai">OpenAI</option>
|
||||
<option value="web">{$i18n.t('Web API')}</option>
|
||||
<option value="deepgram">Deepgram</option>
|
||||
</select>
|
||||
<option value="azure">Azure AI Speech</option>
|
||||
</select>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@ -248,8 +258,35 @@
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
{:else if STT_ENGINE === 'azure'}
|
||||
<div>
|
||||
<div class="mt-1 flex gap-2 mb-1">
|
||||
<SensitiveInput placeholder={$i18n.t('API Key')} bind:value={STT_AZURE_API_KEY} required />
|
||||
<input
|
||||
class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
|
||||
placeholder={$i18n.t('Azure Region')}
|
||||
bind:value={STT_AZURE_REGION}
|
||||
required
|
||||
/>
|
||||
</div>
|
||||
|
||||
<hr class="border-gray-100 dark:border-gray-850 my-2" />
|
||||
|
||||
<div>
|
||||
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Language Locales')}</div>
|
||||
<div class="flex w-full">
|
||||
<div class="flex-1">
|
||||
<input
|
||||
class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
|
||||
bind:value={STT_AZURE_LOCALES}
|
||||
placeholder={$i18n.t('e.g., en-US,ja-JP (leave blank for auto-detect)')}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{:else if STT_ENGINE === ''}
|
||||
<div>
|
||||
<div>
|
||||
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('STT Model')}</div>
|
||||
|
||||
<div class="flex w-full">
|
||||
|
Loading…
Reference in New Issue
Block a user