feat: custom stt content type

Co-Authored-By: Bryan Berns <berns@uwalumni.com>
This commit is contained in:
Timothy Jaeryang Baek 2025-06-16 16:13:40 +04:00
parent 6a5aac43df
commit 7a1afa9c66
5 changed files with 232 additions and 187 deletions

View File

@ -2906,6 +2906,12 @@ AUDIO_STT_MODEL = PersistentConfig(
os.getenv("AUDIO_STT_MODEL", ""), os.getenv("AUDIO_STT_MODEL", ""),
) )
# Comma-separated MIME types (fnmatch-style patterns such as "audio/*" are
# allowed) that may be submitted for speech-to-text.
#
# Each entry is stripped and blank entries are dropped, so an unset or empty
# environment variable yields an empty list. A bare "".split(",") would return
# [""], which is truthy and therefore defeats the `or [...]` default fallback
# applied where this setting is read, causing every content type to be rejected.
AUDIO_STT_SUPPORTED_CONTENT_TYPES = PersistentConfig(
    "AUDIO_STT_SUPPORTED_CONTENT_TYPES",
    "audio.stt.supported_content_types",
    [
        content_type.strip()
        for content_type in os.getenv(
            "AUDIO_STT_SUPPORTED_CONTENT_TYPES", ""
        ).split(",")
        if content_type.strip()
    ],
)
AUDIO_STT_AZURE_API_KEY = PersistentConfig( AUDIO_STT_AZURE_API_KEY = PersistentConfig(
"AUDIO_STT_AZURE_API_KEY", "AUDIO_STT_AZURE_API_KEY",
"audio.stt.azure.api_key", "audio.stt.azure.api_key",

View File

@ -159,6 +159,7 @@ from open_webui.config import (
# Audio # Audio
AUDIO_STT_ENGINE, AUDIO_STT_ENGINE,
AUDIO_STT_MODEL, AUDIO_STT_MODEL,
AUDIO_STT_SUPPORTED_CONTENT_TYPES,
AUDIO_STT_OPENAI_API_BASE_URL, AUDIO_STT_OPENAI_API_BASE_URL,
AUDIO_STT_OPENAI_API_KEY, AUDIO_STT_OPENAI_API_KEY,
AUDIO_STT_AZURE_API_KEY, AUDIO_STT_AZURE_API_KEY,
@ -959,10 +960,12 @@ app.state.config.IMAGE_STEPS = IMAGE_STEPS
# #
######################################## ########################################
app.state.config.STT_OPENAI_API_BASE_URL = AUDIO_STT_OPENAI_API_BASE_URL
app.state.config.STT_OPENAI_API_KEY = AUDIO_STT_OPENAI_API_KEY
app.state.config.STT_ENGINE = AUDIO_STT_ENGINE app.state.config.STT_ENGINE = AUDIO_STT_ENGINE
app.state.config.STT_MODEL = AUDIO_STT_MODEL app.state.config.STT_MODEL = AUDIO_STT_MODEL
app.state.config.STT_SUPPORTED_CONTENT_TYPES = AUDIO_STT_SUPPORTED_CONTENT_TYPES
app.state.config.STT_OPENAI_API_BASE_URL = AUDIO_STT_OPENAI_API_BASE_URL
app.state.config.STT_OPENAI_API_KEY = AUDIO_STT_OPENAI_API_KEY
app.state.config.WHISPER_MODEL = WHISPER_MODEL app.state.config.WHISPER_MODEL = WHISPER_MODEL
app.state.config.WHISPER_VAD_FILTER = WHISPER_VAD_FILTER app.state.config.WHISPER_VAD_FILTER = WHISPER_VAD_FILTER

View File

@ -10,7 +10,7 @@ from pydub.silence import split_on_silence
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from typing import Optional from typing import Optional
from fnmatch import fnmatch
import aiohttp import aiohttp
import aiofiles import aiofiles
import requests import requests
@ -168,6 +168,7 @@ class STTConfigForm(BaseModel):
OPENAI_API_KEY: str OPENAI_API_KEY: str
ENGINE: str ENGINE: str
MODEL: str MODEL: str
SUPPORTED_CONTENT_TYPES: list[str] = []
WHISPER_MODEL: str WHISPER_MODEL: str
DEEPGRAM_API_KEY: str DEEPGRAM_API_KEY: str
AZURE_API_KEY: str AZURE_API_KEY: str
@ -202,6 +203,7 @@ async def get_audio_config(request: Request, user=Depends(get_admin_user)):
"OPENAI_API_KEY": request.app.state.config.STT_OPENAI_API_KEY, "OPENAI_API_KEY": request.app.state.config.STT_OPENAI_API_KEY,
"ENGINE": request.app.state.config.STT_ENGINE, "ENGINE": request.app.state.config.STT_ENGINE,
"MODEL": request.app.state.config.STT_MODEL, "MODEL": request.app.state.config.STT_MODEL,
"SUPPORTED_CONTENT_TYPES": request.app.state.config.STT_SUPPORTED_CONTENT_TYPES,
"WHISPER_MODEL": request.app.state.config.WHISPER_MODEL, "WHISPER_MODEL": request.app.state.config.WHISPER_MODEL,
"DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY, "DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY,
"AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY, "AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
@ -236,6 +238,10 @@ async def update_audio_config(
request.app.state.config.STT_OPENAI_API_KEY = form_data.stt.OPENAI_API_KEY request.app.state.config.STT_OPENAI_API_KEY = form_data.stt.OPENAI_API_KEY
request.app.state.config.STT_ENGINE = form_data.stt.ENGINE request.app.state.config.STT_ENGINE = form_data.stt.ENGINE
request.app.state.config.STT_MODEL = form_data.stt.MODEL request.app.state.config.STT_MODEL = form_data.stt.MODEL
request.app.state.config.STT_SUPPORTED_CONTENT_TYPES = (
form_data.stt.SUPPORTED_CONTENT_TYPES
)
request.app.state.config.WHISPER_MODEL = form_data.stt.WHISPER_MODEL request.app.state.config.WHISPER_MODEL = form_data.stt.WHISPER_MODEL
request.app.state.config.DEEPGRAM_API_KEY = form_data.stt.DEEPGRAM_API_KEY request.app.state.config.DEEPGRAM_API_KEY = form_data.stt.DEEPGRAM_API_KEY
request.app.state.config.AUDIO_STT_AZURE_API_KEY = form_data.stt.AZURE_API_KEY request.app.state.config.AUDIO_STT_AZURE_API_KEY = form_data.stt.AZURE_API_KEY
@ -269,6 +275,7 @@ async def update_audio_config(
"OPENAI_API_KEY": request.app.state.config.STT_OPENAI_API_KEY, "OPENAI_API_KEY": request.app.state.config.STT_OPENAI_API_KEY,
"ENGINE": request.app.state.config.STT_ENGINE, "ENGINE": request.app.state.config.STT_ENGINE,
"MODEL": request.app.state.config.STT_MODEL, "MODEL": request.app.state.config.STT_MODEL,
"SUPPORTED_CONTENT_TYPES": request.app.state.config.STT_SUPPORTED_CONTENT_TYPES,
"WHISPER_MODEL": request.app.state.config.WHISPER_MODEL, "WHISPER_MODEL": request.app.state.config.WHISPER_MODEL,
"DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY, "DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY,
"AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY, "AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
@ -910,10 +917,14 @@ def transcription(
): ):
log.info(f"file.content_type: {file.content_type}") log.info(f"file.content_type: {file.content_type}")
SUPPORTED_CONTENT_TYPES = {"video/webm"} # Extend if you add more video types! supported_content_types = request.app.state.config.STT_SUPPORTED_CONTENT_TYPES or [
if not ( "audio/*",
file.content_type.startswith("audio/") "video/webm",
or file.content_type in SUPPORTED_CONTENT_TYPES ]
if not any(
fnmatch(file.content_type, content_type)
for content_type in supported_content_types
): ):
raise HTTPException( raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST, status_code=status.HTTP_400_BAD_REQUEST,

View File

@ -155,9 +155,18 @@ def upload_file(
if process: if process:
try: try:
if file.content_type: if file.content_type:
if file.content_type.startswith("audio/") or file.content_type in { stt_supported_content_types = (
"video/webm" request.app.state.config.STT_SUPPORTED_CONTENT_TYPES
}: or [
"audio/*",
"video/webm",
]
)
if any(
fnmatch(file.content_type, content_type)
for content_type in stt_supported_content_types
):
file_path = Storage.get_file(file_path) file_path = Storage.get_file(file_path)
result = transcribe(request, file_path, file_metadata) result = transcribe(request, file_path, file_metadata)

View File

@ -39,6 +39,7 @@
let STT_OPENAI_API_KEY = ''; let STT_OPENAI_API_KEY = '';
let STT_ENGINE = ''; let STT_ENGINE = '';
let STT_MODEL = ''; let STT_MODEL = '';
let STT_SUPPORTED_CONTENT_TYPES = '';
let STT_WHISPER_MODEL = ''; let STT_WHISPER_MODEL = '';
let STT_AZURE_API_KEY = ''; let STT_AZURE_API_KEY = '';
let STT_AZURE_REGION = ''; let STT_AZURE_REGION = '';
@ -114,6 +115,7 @@
OPENAI_API_KEY: STT_OPENAI_API_KEY, OPENAI_API_KEY: STT_OPENAI_API_KEY,
ENGINE: STT_ENGINE, ENGINE: STT_ENGINE,
MODEL: STT_MODEL, MODEL: STT_MODEL,
SUPPORTED_CONTENT_TYPES: STT_SUPPORTED_CONTENT_TYPES.split(','),
WHISPER_MODEL: STT_WHISPER_MODEL, WHISPER_MODEL: STT_WHISPER_MODEL,
DEEPGRAM_API_KEY: STT_DEEPGRAM_API_KEY, DEEPGRAM_API_KEY: STT_DEEPGRAM_API_KEY,
AZURE_API_KEY: STT_AZURE_API_KEY, AZURE_API_KEY: STT_AZURE_API_KEY,
@ -160,6 +162,7 @@
STT_ENGINE = res.stt.ENGINE; STT_ENGINE = res.stt.ENGINE;
STT_MODEL = res.stt.MODEL; STT_MODEL = res.stt.MODEL;
STT_SUPPORTED_CONTENT_TYPES = (res?.stt?.SUPPORTED_CONTENT_TYPES ?? []).join(',');
STT_WHISPER_MODEL = res.stt.WHISPER_MODEL; STT_WHISPER_MODEL = res.stt.WHISPER_MODEL;
STT_AZURE_API_KEY = res.stt.AZURE_API_KEY; STT_AZURE_API_KEY = res.stt.AZURE_API_KEY;
STT_AZURE_REGION = res.stt.AZURE_REGION; STT_AZURE_REGION = res.stt.AZURE_REGION;
@ -184,9 +187,11 @@
<div class=" space-y-3 overflow-y-scroll scrollbar-hidden h-full"> <div class=" space-y-3 overflow-y-scroll scrollbar-hidden h-full">
<div class="flex flex-col gap-3"> <div class="flex flex-col gap-3">
<div> <div>
<div class=" mb-1 text-sm font-medium">{$i18n.t('STT Settings')}</div> <div class=" mb-2.5 text-base font-medium">{$i18n.t('Speech-to-Text')}</div>
<div class=" py-0.5 flex w-full justify-between"> <hr class=" border-gray-100 dark:border-gray-850 my-2" />
<div class="mb-2 py-0.5 flex w-full justify-between">
<div class=" self-center text-xs font-medium">{$i18n.t('Speech-to-Text Engine')}</div> <div class=" self-center text-xs font-medium">{$i18n.t('Speech-to-Text Engine')}</div>
<div class="flex items-center relative"> <div class="flex items-center relative">
<select <select
@ -203,6 +208,19 @@
</div> </div>
</div> </div>
<div class="mb-2">
<div class=" mb-1.5 text-xs font-medium">{$i18n.t('Supported MIME Types')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
bind:value={STT_SUPPORTED_CONTENT_TYPES}
placeholder={$i18n.t('e.g., audio/wav,audio/mpeg (leave blank for defaults)')}
/>
</div>
</div>
</div>
{#if STT_ENGINE === 'openai'} {#if STT_ENGINE === 'openai'}
<div> <div>
<div class="mt-1 flex gap-2 mb-1"> <div class="mt-1 flex gap-2 mb-1">
@ -220,7 +238,7 @@
<hr class="border-gray-100 dark:border-gray-850 my-2" /> <hr class="border-gray-100 dark:border-gray-850 my-2" />
<div> <div>
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('STT Model')}</div> <div class=" mb-1.5 text-xs font-medium">{$i18n.t('STT Model')}</div>
<div class="flex w-full"> <div class="flex w-full">
<div class="flex-1"> <div class="flex-1">
<input <input
@ -246,7 +264,7 @@
<hr class="border-gray-100 dark:border-gray-850 my-2" /> <hr class="border-gray-100 dark:border-gray-850 my-2" />
<div> <div>
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('STT Model')}</div> <div class=" mb-1.5 text-xs font-medium">{$i18n.t('STT Model')}</div>
<div class="flex w-full"> <div class="flex w-full">
<div class="flex-1"> <div class="flex-1">
<input <input
@ -280,7 +298,7 @@
<hr class="border-gray-100 dark:border-gray-850 my-2" /> <hr class="border-gray-100 dark:border-gray-850 my-2" />
<div> <div>
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Azure Region')}</div> <div class=" mb-1.5 text-xs font-medium">{$i18n.t('Azure Region')}</div>
<div class="flex w-full"> <div class="flex w-full">
<div class="flex-1"> <div class="flex-1">
<input <input
@ -293,7 +311,7 @@
</div> </div>
<div> <div>
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Language Locales')}</div> <div class=" mb-1.5 text-xs font-medium">{$i18n.t('Language Locales')}</div>
<div class="flex w-full"> <div class="flex w-full">
<div class="flex-1"> <div class="flex-1">
<input <input
@ -306,7 +324,7 @@
</div> </div>
<div> <div>
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Endpoint URL')}</div> <div class=" mb-1.5 text-xs font-medium">{$i18n.t('Endpoint URL')}</div>
<div class="flex w-full"> <div class="flex w-full">
<div class="flex-1"> <div class="flex-1">
<input <input
@ -319,7 +337,7 @@
</div> </div>
<div> <div>
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Max Speakers')}</div> <div class=" mb-1.5 text-xs font-medium">{$i18n.t('Max Speakers')}</div>
<div class="flex w-full"> <div class="flex w-full">
<div class="flex-1"> <div class="flex-1">
<input <input
@ -333,7 +351,7 @@
</div> </div>
{:else if STT_ENGINE === ''} {:else if STT_ENGINE === ''}
<div> <div>
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('STT Model')}</div> <div class=" mb-1.5 text-xs font-medium">{$i18n.t('STT Model')}</div>
<div class="flex w-full"> <div class="flex w-full">
<div class="flex-1 mr-2"> <div class="flex-1 mr-2">
@ -416,12 +434,12 @@
{/if} {/if}
</div> </div>
<hr class="border-gray-100 dark:border-gray-850" />
<div> <div>
<div class=" mb-1 text-sm font-medium">{$i18n.t('TTS Settings')}</div> <div class=" mb-2.5 text-base font-medium">{$i18n.t('Text-to-Speech')}</div>
<div class=" py-0.5 flex w-full justify-between"> <hr class=" border-gray-100 dark:border-gray-850 my-2" />
<div class="mb-2 py-0.5 flex w-full justify-between">
<div class=" self-center text-xs font-medium">{$i18n.t('Text-to-Speech Engine')}</div> <div class=" self-center text-xs font-medium">{$i18n.t('Text-to-Speech Engine')}</div>
<div class="flex items-center relative"> <div class="flex items-center relative">
<select <select
@ -484,7 +502,7 @@
<hr class="border-gray-100 dark:border-gray-850 my-2" /> <hr class="border-gray-100 dark:border-gray-850 my-2" />
<div> <div>
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Azure Region')}</div> <div class=" mb-1.5 text-xs font-medium">{$i18n.t('Azure Region')}</div>
<div class="flex w-full"> <div class="flex w-full">
<div class="flex-1"> <div class="flex-1">
<input <input
@ -497,7 +515,7 @@
</div> </div>
<div> <div>
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Endpoint URL')}</div> <div class=" mb-1.5 text-xs font-medium">{$i18n.t('Endpoint URL')}</div>
<div class="flex w-full"> <div class="flex w-full">
<div class="flex-1"> <div class="flex-1">
<input <input
@ -511,11 +529,10 @@
</div> </div>
{/if} {/if}
<hr class="border-gray-100 dark:border-gray-850 my-2" /> <div class="mb-2">
{#if TTS_ENGINE === ''} {#if TTS_ENGINE === ''}
<div> <div>
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div> <div class=" mb-1.5 text-xs font-medium">{$i18n.t('TTS Voice')}</div>
<div class="flex w-full"> <div class="flex w-full">
<div class="flex-1"> <div class="flex-1">
<select <select
@ -536,7 +553,7 @@
</div> </div>
{:else if TTS_ENGINE === 'transformers'} {:else if TTS_ENGINE === 'transformers'}
<div> <div>
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Model')}</div> <div class=" mb-1.5 text-xs font-medium">{$i18n.t('TTS Model')}</div>
<div class="flex w-full"> <div class="flex w-full">
<div class="flex-1"> <div class="flex-1">
<input <input
@ -578,7 +595,7 @@
{:else if TTS_ENGINE === 'openai'} {:else if TTS_ENGINE === 'openai'}
<div class=" flex gap-2"> <div class=" flex gap-2">
<div class="w-full"> <div class="w-full">
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div> <div class=" mb-1.5 text-xs font-medium">{$i18n.t('TTS Voice')}</div>
<div class="flex w-full"> <div class="flex w-full">
<div class="flex-1"> <div class="flex-1">
<input <input
@ -597,7 +614,7 @@
</div> </div>
</div> </div>
<div class="w-full"> <div class="w-full">
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Model')}</div> <div class=" mb-1.5 text-xs font-medium">{$i18n.t('TTS Model')}</div>
<div class="flex w-full"> <div class="flex w-full">
<div class="flex-1"> <div class="flex-1">
<input <input
@ -619,7 +636,7 @@
{:else if TTS_ENGINE === 'elevenlabs'} {:else if TTS_ENGINE === 'elevenlabs'}
<div class=" flex gap-2"> <div class=" flex gap-2">
<div class="w-full"> <div class="w-full">
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div> <div class=" mb-1.5 text-xs font-medium">{$i18n.t('TTS Voice')}</div>
<div class="flex w-full"> <div class="flex w-full">
<div class="flex-1"> <div class="flex-1">
<input <input
@ -638,7 +655,7 @@
</div> </div>
</div> </div>
<div class="w-full"> <div class="w-full">
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Model')}</div> <div class=" mb-1.5 text-xs font-medium">{$i18n.t('TTS Model')}</div>
<div class="flex w-full"> <div class="flex w-full">
<div class="flex-1"> <div class="flex-1">
<input <input
@ -660,7 +677,7 @@
{:else if TTS_ENGINE === 'azure'} {:else if TTS_ENGINE === 'azure'}
<div class=" flex gap-2"> <div class=" flex gap-2">
<div class="w-full"> <div class="w-full">
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div> <div class=" mb-1.5 text-xs font-medium">{$i18n.t('TTS Voice')}</div>
<div class="flex w-full"> <div class="flex w-full">
<div class="flex-1"> <div class="flex-1">
<input <input
@ -679,7 +696,7 @@
</div> </div>
</div> </div>
<div class="w-full"> <div class="w-full">
<div class=" mb-1.5 text-sm font-medium"> <div class=" mb-1.5 text-xs font-medium">
{$i18n.t('Output format')} {$i18n.t('Output format')}
<a <a
href="https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech?tabs=streaming#audio-outputs" href="https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech?tabs=streaming#audio-outputs"
@ -701,8 +718,7 @@
</div> </div>
</div> </div>
{/if} {/if}
</div>
<hr class="border-gray-100 dark:border-gray-850 my-2" />
<div class="pt-0.5 flex w-full justify-between"> <div class="pt-0.5 flex w-full justify-between">
<div class="self-center text-xs font-medium">{$i18n.t('Response splitting')}</div> <div class="self-center text-xs font-medium">{$i18n.t('Response splitting')}</div>