Mirror of https://github.com/open-webui/open-webui (synced 2025-06-23 02:16:52 +00:00)
feat: custom stt content type

Co-Authored-By: Bryan Berns <berns@uwalumni.com>

parent 6a5aac43df
commit 7a1afa9c66
@@ -2906,6 +2906,12 @@ AUDIO_STT_MODEL = PersistentConfig(
     os.getenv("AUDIO_STT_MODEL", ""),
 )
 
+AUDIO_STT_SUPPORTED_CONTENT_TYPES = PersistentConfig(
+    "AUDIO_STT_SUPPORTED_CONTENT_TYPES",
+    "audio.stt.supported_content_types",
+    os.getenv("AUDIO_STT_SUPPORTED_CONTENT_TYPES", "").split(","),
+)
+
 AUDIO_STT_AZURE_API_KEY = PersistentConfig(
     "AUDIO_STT_AZURE_API_KEY",
     "audio.stt.azure.api_key",
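Note: the new AUDIO_STT_SUPPORTED_CONTENT_TYPES setting is a plain comma-separated string that is split into a list of patterns. A minimal sketch of that parsing with a made-up example value (only the os.getenv(...).split(",") call is taken from the change itself):

    import os

    # Hypothetical value an operator might export; not part of the commit.
    os.environ["AUDIO_STT_SUPPORTED_CONTENT_TYPES"] = "audio/wav,audio/mpeg,video/webm"

    # Same parsing used for the new PersistentConfig entry above.
    patterns = os.getenv("AUDIO_STT_SUPPORTED_CONTENT_TYPES", "").split(",")
    print(patterns)  # ['audio/wav', 'audio/mpeg', 'video/webm']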
@@ -159,6 +159,7 @@ from open_webui.config import (
     # Audio
     AUDIO_STT_ENGINE,
     AUDIO_STT_MODEL,
+    AUDIO_STT_SUPPORTED_CONTENT_TYPES,
     AUDIO_STT_OPENAI_API_BASE_URL,
     AUDIO_STT_OPENAI_API_KEY,
     AUDIO_STT_AZURE_API_KEY,
@@ -959,10 +960,12 @@ app.state.config.IMAGE_STEPS = IMAGE_STEPS
 #
 ########################################
 
-app.state.config.STT_OPENAI_API_BASE_URL = AUDIO_STT_OPENAI_API_BASE_URL
-app.state.config.STT_OPENAI_API_KEY = AUDIO_STT_OPENAI_API_KEY
 app.state.config.STT_ENGINE = AUDIO_STT_ENGINE
 app.state.config.STT_MODEL = AUDIO_STT_MODEL
+app.state.config.STT_SUPPORTED_CONTENT_TYPES = AUDIO_STT_SUPPORTED_CONTENT_TYPES
+
+app.state.config.STT_OPENAI_API_BASE_URL = AUDIO_STT_OPENAI_API_BASE_URL
+app.state.config.STT_OPENAI_API_KEY = AUDIO_STT_OPENAI_API_KEY
 
 app.state.config.WHISPER_MODEL = WHISPER_MODEL
 app.state.config.WHISPER_VAD_FILTER = WHISPER_VAD_FILTER
@@ -10,7 +10,7 @@ from pydub.silence import split_on_silence
 from concurrent.futures import ThreadPoolExecutor
 from typing import Optional
 
-
+from fnmatch import fnmatch
 import aiohttp
 import aiofiles
 import requests
@@ -168,6 +168,7 @@ class STTConfigForm(BaseModel):
     OPENAI_API_KEY: str
     ENGINE: str
     MODEL: str
+    SUPPORTED_CONTENT_TYPES: list[str] = []
     WHISPER_MODEL: str
     DEEPGRAM_API_KEY: str
     AZURE_API_KEY: str

@@ -202,6 +203,7 @@ async def get_audio_config(request: Request, user=Depends(get_admin_user)):
             "OPENAI_API_KEY": request.app.state.config.STT_OPENAI_API_KEY,
             "ENGINE": request.app.state.config.STT_ENGINE,
             "MODEL": request.app.state.config.STT_MODEL,
+            "SUPPORTED_CONTENT_TYPES": request.app.state.config.STT_SUPPORTED_CONTENT_TYPES,
             "WHISPER_MODEL": request.app.state.config.WHISPER_MODEL,
             "DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY,
             "AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,

@@ -236,6 +238,10 @@ async def update_audio_config(
     request.app.state.config.STT_OPENAI_API_KEY = form_data.stt.OPENAI_API_KEY
     request.app.state.config.STT_ENGINE = form_data.stt.ENGINE
     request.app.state.config.STT_MODEL = form_data.stt.MODEL
+    request.app.state.config.STT_SUPPORTED_CONTENT_TYPES = (
+        form_data.stt.SUPPORTED_CONTENT_TYPES
+    )
+
     request.app.state.config.WHISPER_MODEL = form_data.stt.WHISPER_MODEL
     request.app.state.config.DEEPGRAM_API_KEY = form_data.stt.DEEPGRAM_API_KEY
     request.app.state.config.AUDIO_STT_AZURE_API_KEY = form_data.stt.AZURE_API_KEY

@@ -269,6 +275,7 @@ async def update_audio_config(
             "OPENAI_API_KEY": request.app.state.config.STT_OPENAI_API_KEY,
             "ENGINE": request.app.state.config.STT_ENGINE,
             "MODEL": request.app.state.config.STT_MODEL,
+            "SUPPORTED_CONTENT_TYPES": request.app.state.config.STT_SUPPORTED_CONTENT_TYPES,
             "WHISPER_MODEL": request.app.state.config.WHISPER_MODEL,
             "DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY,
             "AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
@@ -910,10 +917,14 @@ def transcription(
 ):
     log.info(f"file.content_type: {file.content_type}")
 
-    SUPPORTED_CONTENT_TYPES = {"video/webm"}  # Extend if you add more video types!
-    if not (
-        file.content_type.startswith("audio/")
-        or file.content_type in SUPPORTED_CONTENT_TYPES
+    supported_content_types = request.app.state.config.STT_SUPPORTED_CONTENT_TYPES or [
+        "audio/*",
+        "video/webm",
+    ]
+
+    if not any(
+        fnmatch(file.content_type, content_type)
+        for content_type in supported_content_types
     ):
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
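Note: the endpoint now checks the incoming content type against the configured patterns with shell-style wildcards via fnmatch, falling back to audio/* and video/webm when nothing is configured. A minimal sketch of that matching rule; the helper name is illustrative and not part of the change:

    from fnmatch import fnmatch

    def matches_any(content_type: str, patterns: list[str]) -> bool:
        # Shell-style wildcard match, the same check the route performs with any().
        return any(fnmatch(content_type, pattern) for pattern in patterns)

    defaults = ["audio/*", "video/webm"]
    print(matches_any("audio/wav", defaults))                # True
    print(matches_any("video/webm", defaults))               # True
    print(matches_any("video/mp4", defaults))                # False -> HTTP 400
    print(matches_any("video/mp4", ["audio/*", "video/*"]))  # True with a custom list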
@@ -155,9 +155,18 @@ def upload_file(
     if process:
         try:
             if file.content_type:
-                if file.content_type.startswith("audio/") or file.content_type in {
-                    "video/webm"
-                }:
+                stt_supported_content_types = (
+                    request.app.state.config.STT_SUPPORTED_CONTENT_TYPES
+                    or [
+                        "audio/*",
+                        "video/webm",
+                    ]
+                )
+
+                if any(
+                    fnmatch(file.content_type, content_type)
+                    for content_type in stt_supported_content_types
+                ):
                     file_path = Storage.get_file(file_path)
                     result = transcribe(request, file_path, file_metadata)
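Note: the upload path applies the same pattern check before deciding whether to transcribe, and both call sites keep the previous behaviour when the admin leaves the setting empty. A small sketch of that fallback; variable names are illustrative:

    # stt_supported_content_types as it might come out of the config store.
    configured: list[str] = []  # nothing configured by the admin

    # The "or" restores the old hard-coded defaults when the list is empty.
    patterns = configured or ["audio/*", "video/webm"]
    print(patterns)  # ['audio/*', 'video/webm']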
@@ -39,6 +39,7 @@
 	let STT_OPENAI_API_KEY = '';
 	let STT_ENGINE = '';
 	let STT_MODEL = '';
+	let STT_SUPPORTED_CONTENT_TYPES = '';
 	let STT_WHISPER_MODEL = '';
 	let STT_AZURE_API_KEY = '';
 	let STT_AZURE_REGION = '';

@@ -114,6 +115,7 @@
 				OPENAI_API_KEY: STT_OPENAI_API_KEY,
 				ENGINE: STT_ENGINE,
 				MODEL: STT_MODEL,
+				SUPPORTED_CONTENT_TYPES: STT_SUPPORTED_CONTENT_TYPES.split(','),
 				WHISPER_MODEL: STT_WHISPER_MODEL,
 				DEEPGRAM_API_KEY: STT_DEEPGRAM_API_KEY,
 				AZURE_API_KEY: STT_AZURE_API_KEY,

@@ -160,6 +162,7 @@
 
 			STT_ENGINE = res.stt.ENGINE;
 			STT_MODEL = res.stt.MODEL;
+			STT_SUPPORTED_CONTENT_TYPES = (res?.stt?.SUPPORTED_CONTENT_TYPES ?? []).join(',');
 			STT_WHISPER_MODEL = res.stt.WHISPER_MODEL;
 			STT_AZURE_API_KEY = res.stt.AZURE_API_KEY;
 			STT_AZURE_REGION = res.stt.AZURE_REGION;
@@ -184,9 +187,11 @@
 	<div class=" space-y-3 overflow-y-scroll scrollbar-hidden h-full">
 		<div class="flex flex-col gap-3">
 			<div>
-				<div class=" mb-1 text-sm font-medium">{$i18n.t('STT Settings')}</div>
+				<div class=" mb-2.5 text-base font-medium">{$i18n.t('Speech-to-Text')}</div>
 
-				<div class=" py-0.5 flex w-full justify-between">
+				<hr class=" border-gray-100 dark:border-gray-850 my-2" />
+
+				<div class="mb-2 py-0.5 flex w-full justify-between">
 					<div class=" self-center text-xs font-medium">{$i18n.t('Speech-to-Text Engine')}</div>
 					<div class="flex items-center relative">
 						<select
@@ -203,6 +208,19 @@
 					</div>
 				</div>
 
+				<div class="mb-2">
+					<div class=" mb-1.5 text-xs font-medium">{$i18n.t('Supported MIME Types')}</div>
+					<div class="flex w-full">
+						<div class="flex-1">
+							<input
+								class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
+								bind:value={STT_SUPPORTED_CONTENT_TYPES}
+								placeholder={$i18n.t('e.g., audio/wav,audio/mpeg (leave blank for defaults)')}
+							/>
+						</div>
+					</div>
+				</div>
+
 				{#if STT_ENGINE === 'openai'}
 					<div>
 						<div class="mt-1 flex gap-2 mb-1">
@@ -220,7 +238,7 @@
 					<hr class="border-gray-100 dark:border-gray-850 my-2" />
 
 					<div>
-						<div class=" mb-1.5 text-sm font-medium">{$i18n.t('STT Model')}</div>
+						<div class=" mb-1.5 text-xs font-medium">{$i18n.t('STT Model')}</div>
 						<div class="flex w-full">
 							<div class="flex-1">
 								<input

@@ -246,7 +264,7 @@
 					<hr class="border-gray-100 dark:border-gray-850 my-2" />
 
 					<div>
-						<div class=" mb-1.5 text-sm font-medium">{$i18n.t('STT Model')}</div>
+						<div class=" mb-1.5 text-xs font-medium">{$i18n.t('STT Model')}</div>
 						<div class="flex w-full">
 							<div class="flex-1">
 								<input

@@ -280,7 +298,7 @@
 					<hr class="border-gray-100 dark:border-gray-850 my-2" />
 
 					<div>
-						<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Azure Region')}</div>
+						<div class=" mb-1.5 text-xs font-medium">{$i18n.t('Azure Region')}</div>
 						<div class="flex w-full">
 							<div class="flex-1">
 								<input

@@ -293,7 +311,7 @@
 					</div>
 
 					<div>
-						<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Language Locales')}</div>
+						<div class=" mb-1.5 text-xs font-medium">{$i18n.t('Language Locales')}</div>
 						<div class="flex w-full">
 							<div class="flex-1">
 								<input

@@ -306,7 +324,7 @@
 					</div>
 
 					<div>
-						<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Endpoint URL')}</div>
+						<div class=" mb-1.5 text-xs font-medium">{$i18n.t('Endpoint URL')}</div>
 						<div class="flex w-full">
 							<div class="flex-1">
 								<input

@@ -319,7 +337,7 @@
 					</div>
 
 					<div>
-						<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Max Speakers')}</div>
+						<div class=" mb-1.5 text-xs font-medium">{$i18n.t('Max Speakers')}</div>
 						<div class="flex w-full">
 							<div class="flex-1">
 								<input

@@ -333,7 +351,7 @@
 					</div>
 				{:else if STT_ENGINE === ''}
 					<div>
-						<div class=" mb-1.5 text-sm font-medium">{$i18n.t('STT Model')}</div>
+						<div class=" mb-1.5 text-xs font-medium">{$i18n.t('STT Model')}</div>
 
 						<div class="flex w-full">
 							<div class="flex-1 mr-2">
@@ -416,12 +434,12 @@
 				{/if}
 			</div>
 
-			<hr class="border-gray-100 dark:border-gray-850" />
-
 			<div>
-				<div class=" mb-1 text-sm font-medium">{$i18n.t('TTS Settings')}</div>
+				<div class=" mb-2.5 text-base font-medium">{$i18n.t('Text-to-Speech')}</div>
 
-				<div class=" py-0.5 flex w-full justify-between">
+				<hr class=" border-gray-100 dark:border-gray-850 my-2" />
+
+				<div class="mb-2 py-0.5 flex w-full justify-between">
 					<div class=" self-center text-xs font-medium">{$i18n.t('Text-to-Speech Engine')}</div>
 					<div class="flex items-center relative">
 						<select
@@ -484,7 +502,7 @@
 					<hr class="border-gray-100 dark:border-gray-850 my-2" />
 
 					<div>
-						<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Azure Region')}</div>
+						<div class=" mb-1.5 text-xs font-medium">{$i18n.t('Azure Region')}</div>
 						<div class="flex w-full">
 							<div class="flex-1">
 								<input

@@ -497,7 +515,7 @@
 					</div>
 
 					<div>
-						<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Endpoint URL')}</div>
+						<div class=" mb-1.5 text-xs font-medium">{$i18n.t('Endpoint URL')}</div>
 						<div class="flex w-full">
 							<div class="flex-1">
 								<input
@@ -511,11 +529,10 @@
 					</div>
 				{/if}
 
-				<hr class="border-gray-100 dark:border-gray-850 my-2" />
-
+				<div class="mb-2">
 					{#if TTS_ENGINE === ''}
 						<div>
-							<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
+							<div class=" mb-1.5 text-xs font-medium">{$i18n.t('TTS Voice')}</div>
 							<div class="flex w-full">
 								<div class="flex-1">
 									<select
@@ -536,7 +553,7 @@
 						</div>
 					{:else if TTS_ENGINE === 'transformers'}
 						<div>
-							<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Model')}</div>
+							<div class=" mb-1.5 text-xs font-medium">{$i18n.t('TTS Model')}</div>
 							<div class="flex w-full">
 								<div class="flex-1">
 									<input

@@ -578,7 +595,7 @@
 					{:else if TTS_ENGINE === 'openai'}
 						<div class=" flex gap-2">
 							<div class="w-full">
-								<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
+								<div class=" mb-1.5 text-xs font-medium">{$i18n.t('TTS Voice')}</div>
 								<div class="flex w-full">
 									<div class="flex-1">
 										<input

@@ -597,7 +614,7 @@
 								</div>
 							</div>
 							<div class="w-full">
-								<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Model')}</div>
+								<div class=" mb-1.5 text-xs font-medium">{$i18n.t('TTS Model')}</div>
 								<div class="flex w-full">
 									<div class="flex-1">
 										<input

@@ -619,7 +636,7 @@
 					{:else if TTS_ENGINE === 'elevenlabs'}
 						<div class=" flex gap-2">
 							<div class="w-full">
-								<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
+								<div class=" mb-1.5 text-xs font-medium">{$i18n.t('TTS Voice')}</div>
 								<div class="flex w-full">
 									<div class="flex-1">
 										<input

@@ -638,7 +655,7 @@
 								</div>
 							</div>
 							<div class="w-full">
-								<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Model')}</div>
+								<div class=" mb-1.5 text-xs font-medium">{$i18n.t('TTS Model')}</div>
 								<div class="flex w-full">
 									<div class="flex-1">
 										<input

@@ -660,7 +677,7 @@
 					{:else if TTS_ENGINE === 'azure'}
 						<div class=" flex gap-2">
 							<div class="w-full">
-								<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
+								<div class=" mb-1.5 text-xs font-medium">{$i18n.t('TTS Voice')}</div>
 								<div class="flex w-full">
 									<div class="flex-1">
 										<input

@@ -679,7 +696,7 @@
 								</div>
 							</div>
 							<div class="w-full">
-								<div class=" mb-1.5 text-sm font-medium">
+								<div class=" mb-1.5 text-xs font-medium">
 									{$i18n.t('Output format')}
 									<a
 										href="https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech?tabs=streaming#audio-outputs"
@@ -701,8 +718,7 @@
 							</div>
 						</div>
 					{/if}
-
-					<hr class="border-gray-100 dark:border-gray-850 my-2" />
+				</div>
 
 				<div class="pt-0.5 flex w-full justify-between">
 					<div class="self-center text-xs font-medium">{$i18n.t('Response splitting')}</div>