diff --git a/backend/open_webui/apps/audio/main.py b/backend/open_webui/apps/audio/main.py
index 8f643ffd3..0eee533bd 100644
--- a/backend/open_webui/apps/audio/main.py
+++ b/backend/open_webui/apps/audio/main.py
@@ -19,6 +19,8 @@ from open_webui.config import (
     AUDIO_TTS_OPENAI_API_KEY,
     AUDIO_TTS_SPLIT_ON,
     AUDIO_TTS_VOICE,
+    AUDIO_TTS_AZURE_SPEECH_REGION,
+    AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT,
     CACHE_DIR,
     CORS_ALLOW_ORIGIN,
     WHISPER_MODEL,
@@ -62,6 +64,9 @@
 app.state.config.TTS_VOICE = AUDIO_TTS_VOICE
 app.state.config.TTS_API_KEY = AUDIO_TTS_API_KEY
 app.state.config.TTS_SPLIT_ON = AUDIO_TTS_SPLIT_ON
+app.state.config.TTS_AZURE_SPEECH_REGION = AUDIO_TTS_AZURE_SPEECH_REGION
+app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT = AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT
+
 # setting device type for whisper model
 whisper_device_type = DEVICE_TYPE if DEVICE_TYPE and DEVICE_TYPE == "cuda" else "cpu"
 log.info(f"whisper_device_type: {whisper_device_type}")
@@ -78,6 +83,8 @@ class TTSConfigForm(BaseModel):
     MODEL: str
     VOICE: str
     SPLIT_ON: str
+    AZURE_SPEECH_REGION: str
+    AZURE_SPEECH_OUTPUT_FORMAT: str
 
 
 class STTConfigForm(BaseModel):
@@ -130,6 +137,8 @@ async def get_audio_config(user=Depends(get_admin_user)):
             "MODEL": app.state.config.TTS_MODEL,
             "VOICE": app.state.config.TTS_VOICE,
             "SPLIT_ON": app.state.config.TTS_SPLIT_ON,
+            "AZURE_SPEECH_REGION": app.state.config.TTS_AZURE_SPEECH_REGION,
+            "AZURE_SPEECH_OUTPUT_FORMAT": app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT,
         },
         "stt": {
             "OPENAI_API_BASE_URL": app.state.config.STT_OPENAI_API_BASE_URL,
@@ -151,6 +160,10 @@ async def update_audio_config(
     app.state.config.TTS_MODEL = form_data.tts.MODEL
     app.state.config.TTS_VOICE = form_data.tts.VOICE
     app.state.config.TTS_SPLIT_ON = form_data.tts.SPLIT_ON
+    app.state.config.TTS_AZURE_SPEECH_REGION = form_data.tts.AZURE_SPEECH_REGION
+    app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT = (
+        form_data.tts.AZURE_SPEECH_OUTPUT_FORMAT
+    )
 
     app.state.config.STT_OPENAI_API_BASE_URL = form_data.stt.OPENAI_API_BASE_URL
     app.state.config.STT_OPENAI_API_KEY = form_data.stt.OPENAI_API_KEY
@@ -166,6 +179,8 @@
             "MODEL": app.state.config.TTS_MODEL,
             "VOICE": app.state.config.TTS_VOICE,
             "SPLIT_ON": app.state.config.TTS_SPLIT_ON,
+            "AZURE_SPEECH_REGION": app.state.config.TTS_AZURE_SPEECH_REGION,
+            "AZURE_SPEECH_OUTPUT_FORMAT": app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT,
         },
         "stt": {
             "OPENAI_API_BASE_URL": app.state.config.STT_OPENAI_API_BASE_URL,
@@ -301,6 +316,42 @@ async def speech(request: Request, user=Depends(get_verified_user)):
                 detail=error_detail,
             )
 
+    elif app.state.config.TTS_ENGINE == "azure":
+        payload = None
+        try:
+            payload = json.loads(body.decode("utf-8"))
+        except Exception as e:
+            log.exception(e)
+            raise HTTPException(status_code=400, detail="Invalid JSON payload")
+
+        region = app.state.config.TTS_AZURE_SPEECH_REGION
+        language = app.state.config.TTS_VOICE
+        locale = "-".join(app.state.config.TTS_VOICE.split("-")[:1])
+        output_format = app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT
+        url = f"https://{region}.tts.speech.microsoft.com/cognitiveservices/v1"
+
+        headers = {
+            "Ocp-Apim-Subscription-Key": app.state.config.TTS_API_KEY,
+            "Content-Type": "application/ssml+xml",
+            "X-Microsoft-OutputFormat": output_format,
+        }
+
+        data = f"""<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="{locale}">
+            <voice name="{language}">{payload["input"]}</voice>
+        </speak>"""
+
+        response = requests.post(url, headers=headers, data=data)
+
+        if response.status_code == 200:
+            with open(file_path, "wb") as f:
+                f.write(response.content)
+            return FileResponse(file_path)
+        else:
+            log.error(f"Error synthesizing speech - {response.reason}")
+            raise HTTPException(
+                status_code=500, detail=f"Error synthesizing speech - {response.reason}"
+            )
+
 
 @app.post("/transcriptions")
 def transcribe(
@@ -478,6 +529,21 @@ def get_available_voices() -> dict:
         except Exception:
             # Avoided @lru_cache with exception
             pass
+    elif app.state.config.TTS_ENGINE == "azure":
+        try:
+            region = app.state.config.TTS_AZURE_SPEECH_REGION
+            url = f"https://{region}.tts.speech.microsoft.com/cognitiveservices/voices/list"
+            headers = {"Ocp-Apim-Subscription-Key": app.state.config.TTS_API_KEY}
+
+            response = requests.get(url, headers=headers)
+            response.raise_for_status()
+            voices = response.json()
+            for voice in voices:
+                ret[voice["ShortName"]] = (
+                    f"{voice['DisplayName']} ({voice['ShortName']})"
+                )
+        except requests.RequestException as e:
+            log.error(f"Error fetching voices: {str(e)}")
 
     return ret
diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py
index 439e82e43..7ad10ccdc 100644
--- a/backend/open_webui/config.py
+++ b/backend/open_webui/config.py
@@ -1472,3 +1472,17 @@ AUDIO_TTS_SPLIT_ON = PersistentConfig(
     "audio.tts.split_on",
     os.getenv("AUDIO_TTS_SPLIT_ON", "punctuation"),
 )
+
+AUDIO_TTS_AZURE_SPEECH_REGION = PersistentConfig(
+    "AUDIO_TTS_AZURE_SPEECH_REGION",
+    "audio.tts.azure.speech_region",
+    os.getenv("AUDIO_TTS_AZURE_SPEECH_REGION", "eastus"),
+)
+
+AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT = PersistentConfig(
+    "AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT",
+    "audio.tts.azure.speech_output_format",
+    os.getenv(
+        "AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT", "audio-24khz-160kbitrate-mono-mp3"
+    ),
+)
diff --git a/src/lib/components/admin/Settings/Audio.svelte b/src/lib/components/admin/Settings/Audio.svelte
index 1c114c9dd..040bc5e1a 100644
--- a/src/lib/components/admin/Settings/Audio.svelte
+++ b/src/lib/components/admin/Settings/Audio.svelte
@@ -31,6 +31,8 @@
 	let TTS_MODEL = '';
 	let TTS_VOICE = '';
 	let TTS_SPLIT_ON: TTS_RESPONSE_SPLIT = TTS_RESPONSE_SPLIT.PUNCTUATION;
+	let TTS_AZURE_SPEECH_REGION = '';
+	let TTS_AZURE_SPEECH_OUTPUT_FORMAT = '';
 
 	let STT_OPENAI_API_BASE_URL = '';
 	let STT_OPENAI_API_KEY = '';
@@ -87,7 +89,9 @@
 			ENGINE: TTS_ENGINE,
 			MODEL: TTS_MODEL,
 			VOICE: TTS_VOICE,
-			SPLIT_ON: TTS_SPLIT_ON
+			SPLIT_ON: TTS_SPLIT_ON,
+			AZURE_SPEECH_REGION: TTS_AZURE_SPEECH_REGION,
+			AZURE_SPEECH_OUTPUT_FORMAT: TTS_AZURE_SPEECH_OUTPUT_FORMAT
 		},
 		stt: {
 			OPENAI_API_BASE_URL: STT_OPENAI_API_BASE_URL,
@@ -120,6 +124,9 @@
 			TTS_SPLIT_ON = res.tts.SPLIT_ON || TTS_RESPONSE_SPLIT.PUNCTUATION;
 
+			TTS_AZURE_SPEECH_OUTPUT_FORMAT = res.tts.AZURE_SPEECH_OUTPUT_FORMAT;
+			TTS_AZURE_SPEECH_REGION = res.tts.AZURE_SPEECH_REGION;
+
 			STT_OPENAI_API_BASE_URL = res.stt.OPENAI_API_BASE_URL;
 			STT_OPENAI_API_KEY = res.stt.OPENAI_API_KEY;
@@ -224,6 +231,7 @@
 							<option value="">{$i18n.t('Web API')}</option>
 							<option value="openai">{$i18n.t('OpenAI')}</option>
 							<option value="elevenlabs">{$i18n.t('ElevenLabs')}</option>
+							<option value="azure">{$i18n.t('Azure AI Speech')}</option>
 						</select>
 					</div>
 				</div>
@@ -252,6 +260,23 @@
 								required
 							/>
 						</div>
+					{:else if TTS_ENGINE === 'azure'}
+						<div>
+							<div class="mt-1 flex gap-2 mb-1">
+								<input
+									class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
+									placeholder={$i18n.t('API Key')}
+									bind:value={TTS_API_KEY}
+									required
+								/>
+								<input
+									class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
+									placeholder={$i18n.t('Azure Region')}
+									bind:value={TTS_AZURE_SPEECH_REGION}
+									required
+								/>
+							</div>
+						</div>
 					{/if}
 				</div>
 			</div>
@@ -359,6 +384,49 @@
 					</div>
 				</div>
 			</div>
+		{:else if TTS_ENGINE === 'azure'}
+			<div class=" flex gap-2">
+				<div class="w-full">
+					<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
+					<div class="flex w-full">
+						<div class="flex-1">
+							<input
+								list="voice-list"
+								class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
+								bind:value={TTS_VOICE}
+								placeholder="Select a voice"
+							/>
+
+							<datalist id="voice-list">
+								{#each voices as voice}
+									<option value={voice.id}>{voice.name}</option>
+								{/each}
+							</datalist>
+						</div>
+					</div>
+				</div>
+				<div class="w-full">
+					<div class=" mb-1.5 text-sm font-medium">
+						{$i18n.t('Output format')}
+						<a
+							href="https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech#audio-outputs"
+							target="_blank"
+						>
+							{$i18n.t('Available list')}
+						</a>
+					</div>
+					<div class="flex w-full">
+						<div class="flex-1">
+							<input
+								class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
+								bind:value={TTS_AZURE_SPEECH_OUTPUT_FORMAT}
+								placeholder="Select an output format"
+							/>
+						</div>
+					</div>
+				</div>
+			</div>
 		{/if}
 	</div>
 </div>
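
Review note: to sanity-check the request shape used by the new `azure` branch in `speech()` outside of Open WebUI, here is a minimal standalone sketch of the same Azure Speech REST call (SSML body, `Ocp-Apim-Subscription-Key` auth, `X-Microsoft-OutputFormat` header). The region, key, voice, and text values are placeholders, not part of this PR.

```python
# Standalone sketch of the Azure Speech TTS request made by the new "azure"
# branch in speech(). Region, key, voice, and text below are placeholder values.
import requests

region = "eastus"  # AUDIO_TTS_AZURE_SPEECH_REGION
api_key = "<azure-speech-key>"  # AUDIO_TTS_API_KEY
voice = "en-US-AvaNeural"  # AUDIO_TTS_VOICE
output_format = "audio-24khz-160kbitrate-mono-mp3"  # AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT
text = "Hello from Open WebUI."

# Same locale derivation as the PR: the leading segment of the voice name.
locale = "-".join(voice.split("-")[:1])

url = f"https://{region}.tts.speech.microsoft.com/cognitiveservices/v1"
headers = {
    "Ocp-Apim-Subscription-Key": api_key,
    "Content-Type": "application/ssml+xml",
    "X-Microsoft-OutputFormat": output_format,
}
ssml = (
    f'<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="{locale}">'
    f'<voice name="{voice}">{text}</voice>'
    "</speak>"
)

response = requests.post(url, headers=headers, data=ssml.encode("utf-8"))
response.raise_for_status()

with open("speech.mp3", "wb") as f:
    f.write(response.content)
```

A non-200 response here corresponds to the "Error synthesizing speech" path added in the diff.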
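
Likewise, the voices list consumed by `get_available_voices()` can be fetched directly; the `ShortName` values it returns are what the new code uses as keys for the admin voice picker. Region and key are placeholders again.

```python
# Sketch of the voices/list call used by get_available_voices(); the region and
# key below are placeholders.
import requests

region = "eastus"  # AUDIO_TTS_AZURE_SPEECH_REGION
api_key = "<azure-speech-key>"  # AUDIO_TTS_API_KEY

url = f"https://{region}.tts.speech.microsoft.com/cognitiveservices/voices/list"
headers = {"Ocp-Apim-Subscription-Key": api_key}

response = requests.get(url, headers=headers)
response.raise_for_status()

# The PR maps ShortName -> "DisplayName (ShortName)" for the voice picker.
for voice in response.json():
    print(f"{voice['ShortName']}: {voice['DisplayName']} ({voice['ShortName']})")
```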