feat: add Azure AI Speech STT provider
- Add Azure STT configuration variables for API key, region and locales
- Implement Azure STT transcription endpoint with 200MB file size limit
- Update Audio settings UI to include Azure STT configuration fields
- Handle Azure API responses and error cases consistently
Parent: 63533c9e3a · Commit: 9a55257c5b
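For context, a minimal configuration sketch (not part of the commit): the new provider reads three settings, each backed by an environment variable of the same name, as defined in the config hunk below. The dict here is purely illustrative; the commit itself stores the values as PersistentConfig entries, and a blank (or near-empty) locales string makes the transcription endpoint fall back to its built-in locale list.

import os

# Hypothetical helper dict, only to show how the new settings resolve;
# variable names and "" defaults mirror the os.getenv calls added below.
azure_stt = {
    "api_key": os.getenv("AUDIO_STT_AZURE_API_KEY", ""),
    "region": os.getenv("AUDIO_STT_AZURE_REGION", ""),
    # Comma-separated locales, e.g. "en-US,ja-JP"; if left (nearly) empty,
    # the new transcription branch substitutes its default locale list.
    "locales": os.getenv("AUDIO_STT_AZURE_LOCALES", ""),
}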
@@ -2472,6 +2472,24 @@ AUDIO_STT_MODEL = PersistentConfig(
     os.getenv("AUDIO_STT_MODEL", ""),
 )
 
+AUDIO_STT_AZURE_API_KEY = PersistentConfig(
+    "AUDIO_STT_AZURE_API_KEY",
+    "audio.stt.azure.api_key",
+    os.getenv("AUDIO_STT_AZURE_API_KEY", ""),
+)
+
+AUDIO_STT_AZURE_REGION = PersistentConfig(
+    "AUDIO_STT_AZURE_REGION",
+    "audio.stt.azure.region",
+    os.getenv("AUDIO_STT_AZURE_REGION", ""),
+)
+
+AUDIO_STT_AZURE_LOCALES = PersistentConfig(
+    "AUDIO_STT_AZURE_LOCALES",
+    "audio.stt.azure.locales",
+    os.getenv("AUDIO_STT_AZURE_LOCALES", ""),
+)
+
 AUDIO_TTS_OPENAI_API_BASE_URL = PersistentConfig(
     "AUDIO_TTS_OPENAI_API_BASE_URL",
     "audio.tts.openai.api_base_url",
@@ -148,6 +148,9 @@ from open_webui.config import (
     AUDIO_STT_MODEL,
     AUDIO_STT_OPENAI_API_BASE_URL,
     AUDIO_STT_OPENAI_API_KEY,
+    AUDIO_STT_AZURE_API_KEY,
+    AUDIO_STT_AZURE_REGION,
+    AUDIO_STT_AZURE_LOCALES,
     AUDIO_TTS_API_KEY,
     AUDIO_TTS_ENGINE,
     AUDIO_TTS_MODEL,
@@ -778,6 +781,10 @@ app.state.config.STT_MODEL = AUDIO_STT_MODEL
 app.state.config.WHISPER_MODEL = WHISPER_MODEL
 app.state.config.DEEPGRAM_API_KEY = DEEPGRAM_API_KEY
+
+app.state.config.AUDIO_STT_AZURE_API_KEY = AUDIO_STT_AZURE_API_KEY
+app.state.config.AUDIO_STT_AZURE_REGION = AUDIO_STT_AZURE_REGION
+app.state.config.AUDIO_STT_AZURE_LOCALES = AUDIO_STT_AZURE_LOCALES
 
 app.state.config.TTS_OPENAI_API_BASE_URL = AUDIO_TTS_OPENAI_API_BASE_URL
 app.state.config.TTS_OPENAI_API_KEY = AUDIO_TTS_OPENAI_API_KEY
 app.state.config.TTS_ENGINE = AUDIO_TTS_ENGINE
@@ -50,6 +50,8 @@ router = APIRouter()
 # Constants
 MAX_FILE_SIZE_MB = 25
 MAX_FILE_SIZE = MAX_FILE_SIZE_MB * 1024 * 1024  # Convert MB to bytes
+AZURE_MAX_FILE_SIZE_MB = 200
+AZURE_MAX_FILE_SIZE = AZURE_MAX_FILE_SIZE_MB * 1024 * 1024  # Convert MB to bytes
 
 log = logging.getLogger(__name__)
 log.setLevel(SRC_LOG_LEVELS["AUDIO"])
@@ -141,6 +143,9 @@ class STTConfigForm(BaseModel):
     MODEL: str
     WHISPER_MODEL: str
     DEEPGRAM_API_KEY: str
+    AZURE_API_KEY: str
+    AZURE_REGION: str
+    AZURE_LOCALES: str
 
 
 class AudioConfigUpdateForm(BaseModel):
@@ -169,6 +174,9 @@ async def get_audio_config(request: Request, user=Depends(get_admin_user)):
             "MODEL": request.app.state.config.STT_MODEL,
             "WHISPER_MODEL": request.app.state.config.WHISPER_MODEL,
             "DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY,
+            "AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
+            "AZURE_REGION": request.app.state.config.AUDIO_STT_AZURE_REGION,
+            "AZURE_LOCALES": request.app.state.config.AUDIO_STT_AZURE_LOCALES,
         },
     }
 
@@ -195,6 +203,9 @@ async def update_audio_config(
     request.app.state.config.STT_MODEL = form_data.stt.MODEL
     request.app.state.config.WHISPER_MODEL = form_data.stt.WHISPER_MODEL
     request.app.state.config.DEEPGRAM_API_KEY = form_data.stt.DEEPGRAM_API_KEY
+    request.app.state.config.AUDIO_STT_AZURE_API_KEY = form_data.stt.AZURE_API_KEY
+    request.app.state.config.AUDIO_STT_AZURE_REGION = form_data.stt.AZURE_REGION
+    request.app.state.config.AUDIO_STT_AZURE_LOCALES = form_data.stt.AZURE_LOCALES
 
     if request.app.state.config.STT_ENGINE == "":
         request.app.state.faster_whisper_model = set_faster_whisper_model(
@@ -220,6 +231,9 @@ async def update_audio_config(
             "MODEL": request.app.state.config.STT_MODEL,
             "WHISPER_MODEL": request.app.state.config.WHISPER_MODEL,
             "DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY,
+            "AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
+            "AZURE_REGION": request.app.state.config.AUDIO_STT_AZURE_REGION,
+            "AZURE_LOCALES": request.app.state.config.AUDIO_STT_AZURE_LOCALES,
         },
     }
 
@@ -598,6 +612,107 @@ def transcribe(request: Request, file_path):
                 detail = f"External: {e}"
         raise Exception(detail if detail else "Open WebUI: Server Connection Error")
 
+    elif request.app.state.config.STT_ENGINE == "azure":
+        # Check file exists and size
+        if not os.path.exists(file_path):
+            raise HTTPException(
+                status_code=400,
+                detail="Audio file not found"
+            )
+
+        # Check file size (Azure has a larger limit of 200MB)
+        file_size = os.path.getsize(file_path)
+        if file_size > AZURE_MAX_FILE_SIZE:
+            raise HTTPException(
+                status_code=400,
+                detail=f"File size exceeds Azure's limit of {AZURE_MAX_FILE_SIZE_MB}MB",
+            )
+
+        api_key = request.app.state.config.AUDIO_STT_AZURE_API_KEY
+        region = request.app.state.config.AUDIO_STT_AZURE_REGION
+        locales = request.app.state.config.AUDIO_STT_AZURE_LOCALES
+
+        # IF NO LOCALES, USE DEFAULTS
+        if len(locales) < 2:
+            locales = ['en-US', 'es-ES', 'es-MX', 'fr-FR', 'hi-IN',
+                       'it-IT', 'de-DE', 'en-GB', 'en-IN', 'ja-JP',
+                       'ko-KR', 'pt-BR', 'zh-CN']
+            locales = ','.join(locales)
+
+        if not api_key or not region:
+            raise HTTPException(
+                status_code=400,
+                detail="Azure API key and region are required for Azure STT",
+            )
+
+        r = None
+        try:
+            # Prepare the request
+            data = {'definition': json.dumps({
+                "locales": locales.split(','),
+                "diarization": {"maxSpeakers": 3, "enabled": True}
+                } if locales else {}
+                )
+            }
+            url = f"https://{region}.api.cognitive.microsoft.com/speechtotext/transcriptions:transcribe?api-version=2024-11-15"
+
+            # Use context manager to ensure file is properly closed
+            with open(file_path, 'rb') as audio_file:
+                r = requests.post(
+                    url=url,
+                    files={'audio': audio_file},
+                    data=data,
+                    headers={
+                        'Ocp-Apim-Subscription-Key': api_key,
+                    },
+                )
+
+            r.raise_for_status()
+            response = r.json()
+
+            # Extract transcript from response
+            if not response.get('combinedPhrases'):
+                raise ValueError("No transcription found in response")
+
+            # Get the full transcript from combinedPhrases
+            transcript = response['combinedPhrases'][0].get('text', '').strip()
+            if not transcript:
+                raise ValueError("Empty transcript in response")
+
+            data = {"text": transcript}
+
+            # Save transcript to json file (consistent with other providers)
+            transcript_file = f"{file_dir}/{id}.json"
+            with open(transcript_file, "w") as f:
+                json.dump(data, f)
+
+            log.debug(data)
+            return data
+
+        except (KeyError, IndexError, ValueError) as e:
+            log.exception("Error parsing Azure response")
+            raise HTTPException(
+                status_code=500,
+                detail=f"Failed to parse Azure response: {str(e)}",
+            )
+        except requests.exceptions.RequestException as e:
+            log.exception(e)
+            detail = None
+
+            try:
+                if r is not None and r.status_code != 200:
+                    res = r.json()
+                    if "error" in res:
+                        detail = f"External: {res['error'].get('message', '')}"
+            except Exception:
+                detail = f"External: {e}"
+
+            raise HTTPException(
+                status_code=getattr(r, 'status_code', 500) if r else 500,
+                detail=detail if detail else "Open WebUI: Server Connection Error",
+            )
+
 
 def compress_audio(file_path):
     if os.path.getsize(file_path) > MAX_FILE_SIZE:
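For reference, a standalone sketch (not part of the commit) of the same Azure fast-transcription REST call that the "azure" branch above builds; it can help verify an API key and region before configuring Open WebUI. The region, key, audio file name, and locales list are placeholders, and diarization is omitted for brevity.

import json
import requests

region = "eastus"                   # placeholder: the Speech resource's Azure region
api_key = "YOUR_AZURE_SPEECH_KEY"   # placeholder subscription key
url = (
    f"https://{region}.api.cognitive.microsoft.com/speechtotext/"
    "transcriptions:transcribe?api-version=2024-11-15"
)
# Same 'definition' payload shape the endpoint above sends (locales only)
definition = {"locales": ["en-US", "ja-JP"]}

with open("sample.wav", "rb") as audio_file:
    r = requests.post(
        url,
        headers={"Ocp-Apim-Subscription-Key": api_key},
        files={"audio": audio_file},
        data={"definition": json.dumps(definition)},
    )
r.raise_for_status()

# The transcript comes back under combinedPhrases, as parsed by the new branch
print(r.json()["combinedPhrases"][0]["text"])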
@@ -39,6 +39,9 @@
 	let STT_ENGINE = '';
 	let STT_MODEL = '';
 	let STT_WHISPER_MODEL = '';
+	let STT_AZURE_API_KEY = '';
+	let STT_AZURE_REGION = '';
+	let STT_AZURE_LOCALES = '';
 	let STT_DEEPGRAM_API_KEY = '';
 
 	let STT_WHISPER_MODEL_LOADING = false;
@@ -108,7 +111,10 @@
 			ENGINE: STT_ENGINE,
 			MODEL: STT_MODEL,
 			WHISPER_MODEL: STT_WHISPER_MODEL,
-			DEEPGRAM_API_KEY: STT_DEEPGRAM_API_KEY
+			DEEPGRAM_API_KEY: STT_DEEPGRAM_API_KEY,
+			AZURE_API_KEY: STT_AZURE_API_KEY,
+			AZURE_REGION: STT_AZURE_REGION,
+			AZURE_LOCALES: STT_AZURE_LOCALES
 		}
 	});
 
@@ -148,6 +154,9 @@
 		STT_ENGINE = res.stt.ENGINE;
 		STT_MODEL = res.stt.MODEL;
 		STT_WHISPER_MODEL = res.stt.WHISPER_MODEL;
+		STT_AZURE_API_KEY = res.stt.AZURE_API_KEY;
+		STT_AZURE_REGION = res.stt.AZURE_REGION;
+		STT_AZURE_LOCALES = res.stt.AZURE_LOCALES;
 		STT_DEEPGRAM_API_KEY = res.stt.DEEPGRAM_API_KEY;
 	}
 
@@ -180,6 +189,7 @@
 					<option value="openai">OpenAI</option>
 					<option value="web">{$i18n.t('Web API')}</option>
 					<option value="deepgram">Deepgram</option>
+					<option value="azure">Azure AI Speech</option>
 				</select>
 			</div>
 		</div>
@@ -248,6 +258,33 @@
 					</a>
 				</div>
 			</div>
+		{:else if STT_ENGINE === 'azure'}
+			<div>
+				<div class="mt-1 flex gap-2 mb-1">
+					<SensitiveInput placeholder={$i18n.t('API Key')} bind:value={STT_AZURE_API_KEY} required />
+					<input
+						class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
+						placeholder={$i18n.t('Azure Region')}
+						bind:value={STT_AZURE_REGION}
+						required
+					/>
+				</div>
+
+				<hr class="border-gray-100 dark:border-gray-850 my-2" />
+
+				<div>
+					<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Language Locales')}</div>
+					<div class="flex w-full">
+						<div class="flex-1">
+							<input
+								class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
+								bind:value={STT_AZURE_LOCALES}
+								placeholder={$i18n.t('e.g., en-US,ja-JP (leave blank for auto-detect)')}
+							/>
+						</div>
+					</div>
+				</div>
+			</div>
 		{:else if STT_ENGINE === ''}
 			<div>
 				<div class=" mb-1.5 text-sm font-medium">{$i18n.t('STT Model')}</div>