From b280f828b024bdbc5f124a44ea0918107643cbae Mon Sep 17 00:00:00 2001 From: Timothy Jaeryang Baek Date: Sat, 17 May 2025 02:51:28 +0400 Subject: [PATCH] enh: very long audio transcription --- backend/open_webui/routers/audio.py | 134 +++++++++++++++++++++++----- 1 file changed, 111 insertions(+), 23 deletions(-) diff --git a/backend/open_webui/routers/audio.py b/backend/open_webui/routers/audio.py index 7f67c65bd..484bc138a 100644 --- a/backend/open_webui/routers/audio.py +++ b/backend/open_webui/routers/audio.py @@ -7,6 +7,7 @@ from functools import lru_cache from pathlib import Path from pydub import AudioSegment from pydub.silence import split_on_silence +from concurrent.futures import ThreadPoolExecutor import aiohttp import aiofiles @@ -50,7 +51,7 @@ from open_webui.env import ( router = APIRouter() # Constants -MAX_FILE_SIZE_MB = 25 +MAX_FILE_SIZE_MB = 20 MAX_FILE_SIZE = MAX_FILE_SIZE_MB * 1024 * 1024 # Convert MB to bytes AZURE_MAX_FILE_SIZE_MB = 200 AZURE_MAX_FILE_SIZE = AZURE_MAX_FILE_SIZE_MB * 1024 * 1024 # Convert MB to bytes @@ -87,8 +88,6 @@ def get_audio_convert_format(file_path): and info.get("codec_tag_string") == "mp4a" ): return "mp4" - elif info.get("format_name") == "ogg": - return "ogg" except Exception as e: log.error(f"Error getting audio format: {e}") return False @@ -511,8 +510,7 @@ async def speech(request: Request, user=Depends(get_verified_user)): return FileResponse(file_path) -def transcribe(request: Request, file_path): - log.info(f"transcribe: {file_path}") +def transcription_handler(request, file_path): filename = os.path.basename(file_path) file_dir = os.path.dirname(file_path) id = filename.split(".")[0] @@ -775,24 +773,119 @@ def transcribe(request: Request, file_path): ) +def transcribe(request: Request, file_path): + log.info(f"transcribe: {file_path}") + + try: + file_path = compress_audio(file_path) + except Exception as e: + log.exception(e) + + # Always produce a list of chunk paths (could be one entry if small) + try: + chunk_paths = split_audio(file_path, MAX_FILE_SIZE) + print(f"Chunk paths: {chunk_paths}") + except Exception as e: + log.exception(e) + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=ERROR_MESSAGES.DEFAULT(e), + ) + + results = [] + try: + with ThreadPoolExecutor() as executor: + # Submit tasks for each chunk_path + futures = [ + executor.submit(transcription_handler, request, chunk_path) + for chunk_path in chunk_paths + ] + # Gather results as they complete + for future in futures: + try: + results.append(future.result()) + except Exception as transcribe_exc: + log.exception(f"Error transcribing chunk: {transcribe_exc}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Error during transcription.", + ) + finally: + # Clean up only the temporary chunks, never the original file + for chunk_path in chunk_paths: + if chunk_path != file_path and os.path.isfile(chunk_path): + try: + os.remove(chunk_path) + except Exception: + pass + + return { + "text": " ".join([result["text"] for result in results]), + } + + def compress_audio(file_path): if os.path.getsize(file_path) > MAX_FILE_SIZE: + id = os.path.splitext(os.path.basename(file_path))[ + 0 + ] # Handles names with multiple dots file_dir = os.path.dirname(file_path) + audio = AudioSegment.from_file(file_path) audio = audio.set_frame_rate(16000).set_channels(1) # Compress audio - compressed_path = f"{file_dir}/{id}_compressed.opus" - audio.export(compressed_path, format="opus", bitrate="32k") - log.debug(f"Compressed audio to {compressed_path}") - if ( - os.path.getsize(compressed_path) > MAX_FILE_SIZE - ): # Still larger than MAX_FILE_SIZE after compression - raise Exception(ERROR_MESSAGES.FILE_TOO_LARGE(size=f"{MAX_FILE_SIZE_MB}MB")) + compressed_path = os.path.join(file_dir, f"{id}_compressed.mp3") + audio.export(compressed_path, format="mp3", bitrate="32k") + # log.debug(f"Compressed audio to {compressed_path}") # Uncomment if log is defined + return compressed_path else: return file_path +def split_audio(file_path, max_bytes, format="mp3", bitrate="32k"): + """ + Splits audio into chunks not exceeding max_bytes. + Returns a list of chunk file paths. If audio fits, returns list with original path. + """ + file_size = os.path.getsize(file_path) + if file_size <= max_bytes: + return [file_path] # Nothing to split + + audio = AudioSegment.from_file(file_path) + duration_ms = len(audio) + orig_size = file_size + + approx_chunk_ms = max(int(duration_ms * (max_bytes / orig_size)) - 1000, 1000) + chunks = [] + start = 0 + i = 0 + + base, _ = os.path.splitext(file_path) + + while start < duration_ms: + end = min(start + approx_chunk_ms, duration_ms) + chunk = audio[start:end] + chunk_path = f"{base}_chunk_{i}.{format}" + chunk.export(chunk_path, format=format, bitrate=bitrate) + + # Reduce chunk duration if still too large + while os.path.getsize(chunk_path) > max_bytes and (end - start) > 5000: + end = start + ((end - start) // 2) + chunk = audio[start:end] + chunk.export(chunk_path, format=format, bitrate=bitrate) + + if os.path.getsize(chunk_path) > max_bytes: + os.remove(chunk_path) + raise Exception("Audio chunk cannot be reduced below max file size.") + + chunks.append(chunk_path) + start = end + i += 1 + + return chunks + + @router.post("/transcriptions") def transcription( request: Request, @@ -807,6 +900,7 @@ def transcription( "audio/ogg", "audio/x-m4a", "audio/webm", + "video/webm", ) if not file.content_type.startswith(supported_filetypes): @@ -830,19 +924,13 @@ def transcription( f.write(contents) try: - try: - file_path = compress_audio(file_path) - except Exception as e: - log.exception(e) + result = transcribe(request, file_path) - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=ERROR_MESSAGES.DEFAULT(e), - ) + return { + **result, + "filename": os.path.basename(file_path), + } - data = transcribe(request, file_path) - file_path = file_path.split("/")[-1] - return {**data, "filename": file_path} except Exception as e: log.exception(e)