feat: custom stt content type

Co-Authored-By: Bryan Berns <berns@uwalumni.com>
This commit is contained in:
Timothy Jaeryang Baek 2025-06-16 16:13:40 +04:00
parent 6a5aac43df
commit 7a1afa9c66
5 changed files with 232 additions and 187 deletions

View File

@ -2906,6 +2906,12 @@ AUDIO_STT_MODEL = PersistentConfig(
os.getenv("AUDIO_STT_MODEL", ""),
)
AUDIO_STT_SUPPORTED_CONTENT_TYPES = PersistentConfig(
"AUDIO_STT_SUPPORTED_CONTENT_TYPES",
"audio.stt.supported_content_types",
os.getenv("AUDIO_STT_SUPPORTED_CONTENT_TYPES", "").split(","),
)
AUDIO_STT_AZURE_API_KEY = PersistentConfig(
"AUDIO_STT_AZURE_API_KEY",
"audio.stt.azure.api_key",

View File

@ -159,6 +159,7 @@ from open_webui.config import (
# Audio
AUDIO_STT_ENGINE,
AUDIO_STT_MODEL,
AUDIO_STT_SUPPORTED_CONTENT_TYPES,
AUDIO_STT_OPENAI_API_BASE_URL,
AUDIO_STT_OPENAI_API_KEY,
AUDIO_STT_AZURE_API_KEY,
@ -959,10 +960,12 @@ app.state.config.IMAGE_STEPS = IMAGE_STEPS
#
########################################
app.state.config.STT_OPENAI_API_BASE_URL = AUDIO_STT_OPENAI_API_BASE_URL
app.state.config.STT_OPENAI_API_KEY = AUDIO_STT_OPENAI_API_KEY
app.state.config.STT_ENGINE = AUDIO_STT_ENGINE
app.state.config.STT_MODEL = AUDIO_STT_MODEL
app.state.config.STT_SUPPORTED_CONTENT_TYPES = AUDIO_STT_SUPPORTED_CONTENT_TYPES
app.state.config.STT_OPENAI_API_BASE_URL = AUDIO_STT_OPENAI_API_BASE_URL
app.state.config.STT_OPENAI_API_KEY = AUDIO_STT_OPENAI_API_KEY
app.state.config.WHISPER_MODEL = WHISPER_MODEL
app.state.config.WHISPER_VAD_FILTER = WHISPER_VAD_FILTER

View File

@ -10,7 +10,7 @@ from pydub.silence import split_on_silence
from concurrent.futures import ThreadPoolExecutor
from typing import Optional
from fnmatch import fnmatch
import aiohttp
import aiofiles
import requests
@ -168,6 +168,7 @@ class STTConfigForm(BaseModel):
OPENAI_API_KEY: str
ENGINE: str
MODEL: str
SUPPORTED_CONTENT_TYPES: list[str] = []
WHISPER_MODEL: str
DEEPGRAM_API_KEY: str
AZURE_API_KEY: str
@ -202,6 +203,7 @@ async def get_audio_config(request: Request, user=Depends(get_admin_user)):
"OPENAI_API_KEY": request.app.state.config.STT_OPENAI_API_KEY,
"ENGINE": request.app.state.config.STT_ENGINE,
"MODEL": request.app.state.config.STT_MODEL,
"SUPPORTED_CONTENT_TYPES": request.app.state.config.STT_SUPPORTED_CONTENT_TYPES,
"WHISPER_MODEL": request.app.state.config.WHISPER_MODEL,
"DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY,
"AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
@ -236,6 +238,10 @@ async def update_audio_config(
request.app.state.config.STT_OPENAI_API_KEY = form_data.stt.OPENAI_API_KEY
request.app.state.config.STT_ENGINE = form_data.stt.ENGINE
request.app.state.config.STT_MODEL = form_data.stt.MODEL
request.app.state.config.STT_SUPPORTED_CONTENT_TYPES = (
form_data.stt.SUPPORTED_CONTENT_TYPES
)
request.app.state.config.WHISPER_MODEL = form_data.stt.WHISPER_MODEL
request.app.state.config.DEEPGRAM_API_KEY = form_data.stt.DEEPGRAM_API_KEY
request.app.state.config.AUDIO_STT_AZURE_API_KEY = form_data.stt.AZURE_API_KEY
@ -269,6 +275,7 @@ async def update_audio_config(
"OPENAI_API_KEY": request.app.state.config.STT_OPENAI_API_KEY,
"ENGINE": request.app.state.config.STT_ENGINE,
"MODEL": request.app.state.config.STT_MODEL,
"SUPPORTED_CONTENT_TYPES": request.app.state.config.STT_SUPPORTED_CONTENT_TYPES,
"WHISPER_MODEL": request.app.state.config.WHISPER_MODEL,
"DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY,
"AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
@ -910,10 +917,14 @@ def transcription(
):
log.info(f"file.content_type: {file.content_type}")
SUPPORTED_CONTENT_TYPES = {"video/webm"} # Extend if you add more video types!
if not (
file.content_type.startswith("audio/")
or file.content_type in SUPPORTED_CONTENT_TYPES
supported_content_types = request.app.state.config.STT_SUPPORTED_CONTENT_TYPES or [
"audio/*",
"video/webm",
]
if not any(
fnmatch(file.content_type, content_type)
for content_type in supported_content_types
):
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,

View File

@ -155,9 +155,18 @@ def upload_file(
if process:
try:
if file.content_type:
if file.content_type.startswith("audio/") or file.content_type in {
"video/webm"
}:
stt_supported_content_types = (
request.app.state.config.STT_SUPPORTED_CONTENT_TYPES
or [
"audio/*",
"video/webm",
]
)
if any(
fnmatch(file.content_type, content_type)
for content_type in stt_supported_content_types
):
file_path = Storage.get_file(file_path)
result = transcribe(request, file_path, file_metadata)

View File

@ -39,6 +39,7 @@
let STT_OPENAI_API_KEY = '';
let STT_ENGINE = '';
let STT_MODEL = '';
let STT_SUPPORTED_CONTENT_TYPES = '';
let STT_WHISPER_MODEL = '';
let STT_AZURE_API_KEY = '';
let STT_AZURE_REGION = '';
@ -114,6 +115,7 @@
OPENAI_API_KEY: STT_OPENAI_API_KEY,
ENGINE: STT_ENGINE,
MODEL: STT_MODEL,
SUPPORTED_CONTENT_TYPES: STT_SUPPORTED_CONTENT_TYPES.split(','),
WHISPER_MODEL: STT_WHISPER_MODEL,
DEEPGRAM_API_KEY: STT_DEEPGRAM_API_KEY,
AZURE_API_KEY: STT_AZURE_API_KEY,
@ -160,6 +162,7 @@
STT_ENGINE = res.stt.ENGINE;
STT_MODEL = res.stt.MODEL;
STT_SUPPORTED_CONTENT_TYPES = (res?.stt?.SUPPORTED_CONTENT_TYPES ?? []).join(',');
STT_WHISPER_MODEL = res.stt.WHISPER_MODEL;
STT_AZURE_API_KEY = res.stt.AZURE_API_KEY;
STT_AZURE_REGION = res.stt.AZURE_REGION;
@ -184,9 +187,11 @@
<div class=" space-y-3 overflow-y-scroll scrollbar-hidden h-full">
<div class="flex flex-col gap-3">
<div>
<div class=" mb-1 text-sm font-medium">{$i18n.t('STT Settings')}</div>
<div class=" mb-2.5 text-base font-medium">{$i18n.t('Speech-to-Text')}</div>
<div class=" py-0.5 flex w-full justify-between">
<hr class=" border-gray-100 dark:border-gray-850 my-2" />
<div class="mb-2 py-0.5 flex w-full justify-between">
<div class=" self-center text-xs font-medium">{$i18n.t('Speech-to-Text Engine')}</div>
<div class="flex items-center relative">
<select
@ -203,6 +208,19 @@
</div>
</div>
<div class="mb-2">
<div class=" mb-1.5 text-xs font-medium">{$i18n.t('Supported MIME Types')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
bind:value={STT_SUPPORTED_CONTENT_TYPES}
placeholder={$i18n.t('e.g., audio/wav,audio/mpeg (leave blank for defaults)')}
/>
</div>
</div>
</div>
{#if STT_ENGINE === 'openai'}
<div>
<div class="mt-1 flex gap-2 mb-1">
@ -220,7 +238,7 @@
<hr class="border-gray-100 dark:border-gray-850 my-2" />
<div>
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('STT Model')}</div>
<div class=" mb-1.5 text-xs font-medium">{$i18n.t('STT Model')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
@ -246,7 +264,7 @@
<hr class="border-gray-100 dark:border-gray-850 my-2" />
<div>
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('STT Model')}</div>
<div class=" mb-1.5 text-xs font-medium">{$i18n.t('STT Model')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
@ -280,7 +298,7 @@
<hr class="border-gray-100 dark:border-gray-850 my-2" />
<div>
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Azure Region')}</div>
<div class=" mb-1.5 text-xs font-medium">{$i18n.t('Azure Region')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
@ -293,7 +311,7 @@
</div>
<div>
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Language Locales')}</div>
<div class=" mb-1.5 text-xs font-medium">{$i18n.t('Language Locales')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
@ -306,7 +324,7 @@
</div>
<div>
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Endpoint URL')}</div>
<div class=" mb-1.5 text-xs font-medium">{$i18n.t('Endpoint URL')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
@ -319,7 +337,7 @@
</div>
<div>
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Max Speakers')}</div>
<div class=" mb-1.5 text-xs font-medium">{$i18n.t('Max Speakers')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
@ -333,7 +351,7 @@
</div>
{:else if STT_ENGINE === ''}
<div>
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('STT Model')}</div>
<div class=" mb-1.5 text-xs font-medium">{$i18n.t('STT Model')}</div>
<div class="flex w-full">
<div class="flex-1 mr-2">
@ -416,12 +434,12 @@
{/if}
</div>
<hr class="border-gray-100 dark:border-gray-850" />
<div>
<div class=" mb-1 text-sm font-medium">{$i18n.t('TTS Settings')}</div>
<div class=" mb-2.5 text-base font-medium">{$i18n.t('Text-to-Speech')}</div>
<div class=" py-0.5 flex w-full justify-between">
<hr class=" border-gray-100 dark:border-gray-850 my-2" />
<div class="mb-2 py-0.5 flex w-full justify-between">
<div class=" self-center text-xs font-medium">{$i18n.t('Text-to-Speech Engine')}</div>
<div class="flex items-center relative">
<select
@ -484,7 +502,7 @@
<hr class="border-gray-100 dark:border-gray-850 my-2" />
<div>
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Azure Region')}</div>
<div class=" mb-1.5 text-xs font-medium">{$i18n.t('Azure Region')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
@ -497,7 +515,7 @@
</div>
<div>
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Endpoint URL')}</div>
<div class=" mb-1.5 text-xs font-medium">{$i18n.t('Endpoint URL')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
@ -511,198 +529,196 @@
</div>
{/if}
<hr class="border-gray-100 dark:border-gray-850 my-2" />
{#if TTS_ENGINE === ''}
<div>
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
<div class="flex w-full">
<div class="flex-1">
<select
class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
bind:value={TTS_VOICE}
>
<option value="" selected={TTS_VOICE !== ''}>{$i18n.t('Default')}</option>
{#each voices as voice}
<option
value={voice.voiceURI}
class="bg-gray-100 dark:bg-gray-700"
selected={TTS_VOICE === voice.voiceURI}>{voice.name}</option
>
{/each}
</select>
</div>
</div>
</div>
{:else if TTS_ENGINE === 'transformers'}
<div>
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Model')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
list="model-list"
class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
bind:value={TTS_MODEL}
placeholder="CMU ARCTIC speaker embedding name"
/>
<datalist id="model-list">
<option value="tts-1" />
</datalist>
</div>
</div>
<div class="mt-2 mb-1 text-xs text-gray-400 dark:text-gray-500">
{$i18n.t(`Open WebUI uses SpeechT5 and CMU Arctic speaker embeddings.`)}
To learn more about SpeechT5,
<a
class=" hover:underline dark:text-gray-200 text-gray-800"
href="https://github.com/microsoft/SpeechT5"
target="_blank"
>
{$i18n.t(`click here`, {
name: 'SpeechT5'
})}.
</a>
To see the available CMU Arctic speaker embeddings,
<a
class=" hover:underline dark:text-gray-200 text-gray-800"
href="https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors"
target="_blank"
>
{$i18n.t(`click here`)}.
</a>
</div>
</div>
{:else if TTS_ENGINE === 'openai'}
<div class=" flex gap-2">
<div class="w-full">
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
<div class="mb-2">
{#if TTS_ENGINE === ''}
<div>
<div class=" mb-1.5 text-xs font-medium">{$i18n.t('TTS Voice')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
list="voice-list"
<select
class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
bind:value={TTS_VOICE}
placeholder="Select a voice"
/>
<datalist id="voice-list">
>
<option value="" selected={TTS_VOICE !== ''}>{$i18n.t('Default')}</option>
{#each voices as voice}
<option value={voice.id}>{voice.name}</option>
<option
value={voice.voiceURI}
class="bg-gray-100 dark:bg-gray-700"
selected={TTS_VOICE === voice.voiceURI}>{voice.name}</option
>
{/each}
</datalist>
</select>
</div>
</div>
</div>
<div class="w-full">
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Model')}</div>
{:else if TTS_ENGINE === 'transformers'}
<div>
<div class=" mb-1.5 text-xs font-medium">{$i18n.t('TTS Model')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
list="tts-model-list"
list="model-list"
class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
bind:value={TTS_MODEL}
placeholder="Select a model"
placeholder="CMU ARCTIC speaker embedding name"
/>
<datalist id="tts-model-list">
{#each models as model}
<option value={model.id} class="bg-gray-50 dark:bg-gray-700" />
{/each}
<datalist id="model-list">
<option value="tts-1" />
</datalist>
</div>
</div>
</div>
</div>
{:else if TTS_ENGINE === 'elevenlabs'}
<div class=" flex gap-2">
<div class="w-full">
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
list="voice-list"
class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
bind:value={TTS_VOICE}
placeholder="Select a voice"
/>
<div class="mt-2 mb-1 text-xs text-gray-400 dark:text-gray-500">
{$i18n.t(`Open WebUI uses SpeechT5 and CMU Arctic speaker embeddings.`)}
<datalist id="voice-list">
{#each voices as voice}
<option value={voice.id}>{voice.name}</option>
{/each}
</datalist>
</div>
</div>
</div>
<div class="w-full">
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Model')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
list="tts-model-list"
class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
bind:value={TTS_MODEL}
placeholder="Select a model"
/>
To learn more about SpeechT5,
<datalist id="tts-model-list">
{#each models as model}
<option value={model.id} class="bg-gray-50 dark:bg-gray-700" />
{/each}
</datalist>
</div>
</div>
</div>
</div>
{:else if TTS_ENGINE === 'azure'}
<div class=" flex gap-2">
<div class="w-full">
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
list="voice-list"
class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
bind:value={TTS_VOICE}
placeholder="Select a voice"
/>
<datalist id="voice-list">
{#each voices as voice}
<option value={voice.id}>{voice.name}</option>
{/each}
</datalist>
</div>
</div>
</div>
<div class="w-full">
<div class=" mb-1.5 text-sm font-medium">
{$i18n.t('Output format')}
<a
href="https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech?tabs=streaming#audio-outputs"
class=" hover:underline dark:text-gray-200 text-gray-800"
href="https://github.com/microsoft/SpeechT5"
target="_blank"
>
<small>{$i18n.t('Available list')}</small>
{$i18n.t(`click here`, {
name: 'SpeechT5'
})}.
</a>
To see the available CMU Arctic speaker embeddings,
<a
class=" hover:underline dark:text-gray-200 text-gray-800"
href="https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors"
target="_blank"
>
{$i18n.t(`click here`)}.
</a>
</div>
<div class="flex w-full">
<div class="flex-1">
<input
list="tts-model-list"
class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
bind:value={TTS_AZURE_SPEECH_OUTPUT_FORMAT}
placeholder="Select a output format"
/>
</div>
{:else if TTS_ENGINE === 'openai'}
<div class=" flex gap-2">
<div class="w-full">
<div class=" mb-1.5 text-xs font-medium">{$i18n.t('TTS Voice')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
list="voice-list"
class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
bind:value={TTS_VOICE}
placeholder="Select a voice"
/>
<datalist id="voice-list">
{#each voices as voice}
<option value={voice.id}>{voice.name}</option>
{/each}
</datalist>
</div>
</div>
</div>
<div class="w-full">
<div class=" mb-1.5 text-xs font-medium">{$i18n.t('TTS Model')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
list="tts-model-list"
class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
bind:value={TTS_MODEL}
placeholder="Select a model"
/>
<datalist id="tts-model-list">
{#each models as model}
<option value={model.id} class="bg-gray-50 dark:bg-gray-700" />
{/each}
</datalist>
</div>
</div>
</div>
</div>
</div>
{/if}
{:else if TTS_ENGINE === 'elevenlabs'}
<div class=" flex gap-2">
<div class="w-full">
<div class=" mb-1.5 text-xs font-medium">{$i18n.t('TTS Voice')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
list="voice-list"
class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
bind:value={TTS_VOICE}
placeholder="Select a voice"
/>
<hr class="border-gray-100 dark:border-gray-850 my-2" />
<datalist id="voice-list">
{#each voices as voice}
<option value={voice.id}>{voice.name}</option>
{/each}
</datalist>
</div>
</div>
</div>
<div class="w-full">
<div class=" mb-1.5 text-xs font-medium">{$i18n.t('TTS Model')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
list="tts-model-list"
class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
bind:value={TTS_MODEL}
placeholder="Select a model"
/>
<datalist id="tts-model-list">
{#each models as model}
<option value={model.id} class="bg-gray-50 dark:bg-gray-700" />
{/each}
</datalist>
</div>
</div>
</div>
</div>
{:else if TTS_ENGINE === 'azure'}
<div class=" flex gap-2">
<div class="w-full">
<div class=" mb-1.5 text-xs font-medium">{$i18n.t('TTS Voice')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
list="voice-list"
class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
bind:value={TTS_VOICE}
placeholder="Select a voice"
/>
<datalist id="voice-list">
{#each voices as voice}
<option value={voice.id}>{voice.name}</option>
{/each}
</datalist>
</div>
</div>
</div>
<div class="w-full">
<div class=" mb-1.5 text-xs font-medium">
{$i18n.t('Output format')}
<a
href="https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech?tabs=streaming#audio-outputs"
target="_blank"
>
<small>{$i18n.t('Available list')}</small>
</a>
</div>
<div class="flex w-full">
<div class="flex-1">
<input
list="tts-model-list"
class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
bind:value={TTS_AZURE_SPEECH_OUTPUT_FORMAT}
placeholder="Select a output format"
/>
</div>
</div>
</div>
</div>
{/if}
</div>
<div class="pt-0.5 flex w-full justify-between">
<div class="self-center text-xs font-medium">{$i18n.t('Response splitting')}</div>