feat: custom stt content type

Co-Authored-By: Bryan Berns <berns@uwalumni.com>
This commit is contained in:
Timothy Jaeryang Baek
2025-06-16 16:13:40 +04:00
parent 6a5aac43df
commit 7a1afa9c66
5 changed files with 232 additions and 187 deletions

View File

@@ -2906,6 +2906,12 @@ AUDIO_STT_MODEL = PersistentConfig(
os.getenv("AUDIO_STT_MODEL", ""),
)
# Content-type patterns accepted for speech-to-text, read from a comma-separated
# environment variable (e.g. "audio/*,video/webm").
#
# Each entry is stripped and blank entries are dropped. This matters because
# "".split(",") returns [""], a non-empty (truthy) list: with the raw split, an
# unset variable would produce a list whose only pattern is the empty string,
# which matches no real content type and also defeats any `value or [defaults]`
# fallback applied by consumers of this setting. With the filter, unset/empty
# input yields [] so such fallbacks work as intended.
AUDIO_STT_SUPPORTED_CONTENT_TYPES = PersistentConfig(
    "AUDIO_STT_SUPPORTED_CONTENT_TYPES",
    "audio.stt.supported_content_types",
    [
        content_type.strip()
        for content_type in os.getenv(
            "AUDIO_STT_SUPPORTED_CONTENT_TYPES", ""
        ).split(",")
        if content_type.strip()
    ],
)
AUDIO_STT_AZURE_API_KEY = PersistentConfig(
"AUDIO_STT_AZURE_API_KEY",
"audio.stt.azure.api_key",

View File

@@ -159,6 +159,7 @@ from open_webui.config import (
# Audio
AUDIO_STT_ENGINE,
AUDIO_STT_MODEL,
AUDIO_STT_SUPPORTED_CONTENT_TYPES,
AUDIO_STT_OPENAI_API_BASE_URL,
AUDIO_STT_OPENAI_API_KEY,
AUDIO_STT_AZURE_API_KEY,
@@ -959,10 +960,12 @@ app.state.config.IMAGE_STEPS = IMAGE_STEPS
#
########################################
# Speech-to-text settings. Each setting is assigned exactly once; the earlier
# duplicate assignments of STT_OPENAI_API_BASE_URL and STT_OPENAI_API_KEY were
# redundant (same source value assigned twice) and have been dropped.
# NOTE(review): assumes assigning the same config object twice has no extra
# side effect in app.state.config — confirm against its attribute setter.
app.state.config.STT_ENGINE = AUDIO_STT_ENGINE
app.state.config.STT_MODEL = AUDIO_STT_MODEL
app.state.config.STT_SUPPORTED_CONTENT_TYPES = AUDIO_STT_SUPPORTED_CONTENT_TYPES
app.state.config.STT_OPENAI_API_BASE_URL = AUDIO_STT_OPENAI_API_BASE_URL
app.state.config.STT_OPENAI_API_KEY = AUDIO_STT_OPENAI_API_KEY
app.state.config.WHISPER_MODEL = WHISPER_MODEL
app.state.config.WHISPER_VAD_FILTER = WHISPER_VAD_FILTER

View File

@@ -10,7 +10,7 @@ from pydub.silence import split_on_silence
from concurrent.futures import ThreadPoolExecutor
from typing import Optional
from fnmatch import fnmatch
import aiohttp
import aiofiles
import requests
@@ -168,6 +168,7 @@ class STTConfigForm(BaseModel):
OPENAI_API_KEY: str
ENGINE: str
MODEL: str
SUPPORTED_CONTENT_TYPES: list[str] = []
WHISPER_MODEL: str
DEEPGRAM_API_KEY: str
AZURE_API_KEY: str
@@ -202,6 +203,7 @@ async def get_audio_config(request: Request, user=Depends(get_admin_user)):
"OPENAI_API_KEY": request.app.state.config.STT_OPENAI_API_KEY,
"ENGINE": request.app.state.config.STT_ENGINE,
"MODEL": request.app.state.config.STT_MODEL,
"SUPPORTED_CONTENT_TYPES": request.app.state.config.STT_SUPPORTED_CONTENT_TYPES,
"WHISPER_MODEL": request.app.state.config.WHISPER_MODEL,
"DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY,
"AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
@@ -236,6 +238,10 @@ async def update_audio_config(
request.app.state.config.STT_OPENAI_API_KEY = form_data.stt.OPENAI_API_KEY
request.app.state.config.STT_ENGINE = form_data.stt.ENGINE
request.app.state.config.STT_MODEL = form_data.stt.MODEL
request.app.state.config.STT_SUPPORTED_CONTENT_TYPES = (
form_data.stt.SUPPORTED_CONTENT_TYPES
)
request.app.state.config.WHISPER_MODEL = form_data.stt.WHISPER_MODEL
request.app.state.config.DEEPGRAM_API_KEY = form_data.stt.DEEPGRAM_API_KEY
request.app.state.config.AUDIO_STT_AZURE_API_KEY = form_data.stt.AZURE_API_KEY
@@ -269,6 +275,7 @@ async def update_audio_config(
"OPENAI_API_KEY": request.app.state.config.STT_OPENAI_API_KEY,
"ENGINE": request.app.state.config.STT_ENGINE,
"MODEL": request.app.state.config.STT_MODEL,
"SUPPORTED_CONTENT_TYPES": request.app.state.config.STT_SUPPORTED_CONTENT_TYPES,
"WHISPER_MODEL": request.app.state.config.WHISPER_MODEL,
"DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY,
"AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
@@ -910,10 +917,14 @@ def transcription(
):
log.info(f"file.content_type: {file.content_type}")
SUPPORTED_CONTENT_TYPES = {"video/webm"} # Extend if you add more video types!
if not (
file.content_type.startswith("audio/")
or file.content_type in SUPPORTED_CONTENT_TYPES
supported_content_types = request.app.state.config.STT_SUPPORTED_CONTENT_TYPES or [
"audio/*",
"video/webm",
]
if not any(
fnmatch(file.content_type, content_type)
for content_type in supported_content_types
):
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,

View File

@@ -155,9 +155,18 @@ def upload_file(
if process:
try:
if file.content_type:
if file.content_type.startswith("audio/") or file.content_type in {
"video/webm"
}:
stt_supported_content_types = (
request.app.state.config.STT_SUPPORTED_CONTENT_TYPES
or [
"audio/*",
"video/webm",
]
)
if any(
fnmatch(file.content_type, content_type)
for content_type in stt_supported_content_types
):
file_path = Storage.get_file(file_path)
result = transcribe(request, file_path, file_metadata)