From 351bbdb36c21c19180ceb12e17b8f0468e8633aa Mon Sep 17 00:00:00 2001 From: Pawel Ochman <pawel.ochman@amius.com> Date: Tue, 17 Sep 2024 08:47:30 +0100 Subject: [PATCH 1/6] Added Azure speech service option (UI) --- src/lib/components/admin/Settings/Audio.svelte | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/lib/components/admin/Settings/Audio.svelte b/src/lib/components/admin/Settings/Audio.svelte index 1c114c9dd..5a9e91271 100644 --- a/src/lib/components/admin/Settings/Audio.svelte +++ b/src/lib/components/admin/Settings/Audio.svelte @@ -224,6 +224,7 @@ <option value="">{$i18n.t('Web API')}</option> <option value="openai">{$i18n.t('OpenAI')}</option> <option value="elevenlabs">{$i18n.t('ElevenLabs')}</option> + <option value="azurespeechservice">{$i18n.t('Azure Speech service')}</option> </select> </div> </div> @@ -252,6 +253,17 @@ /> </div> </div> + {:else if TTS_ENGINE === 'azurespeechservice'} + <div> + <div class="mt-1 flex gap-2 mb-1"> + <input + class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none" + placeholder={$i18n.t('API Key')} + bind:value={TTS_API_KEY} + required + /> + </div> + </div> {/if} <hr class=" dark:border-gray-850 my-2" /> From d6b68f405e4383ea95e20c2bd2bf60b415f316e8 Mon Sep 17 00:00:00 2001 From: Pawel Ochman <pawel.ochman@amius.com> Date: Tue, 17 Sep 2024 09:13:10 +0100 Subject: [PATCH 2/6] added azure speech service support --- backend/open_webui/apps/audio/main.py | 24 ++++++++++++++++++++++++ backend/requirements.txt | 2 ++ 2 files changed, 26 insertions(+) diff --git a/backend/open_webui/apps/audio/main.py b/backend/open_webui/apps/audio/main.py index 8f643ffd3..54b5e7d79 100644 --- a/backend/open_webui/apps/audio/main.py +++ b/backend/open_webui/apps/audio/main.py @@ -301,6 +301,30 @@ async def speech(request: Request, user=Depends(get_verified_user)): detail=error_detail, ) + elif app.state.config.TTS_ENGINE == "azurespeechservice": + payload = None + 
try: + payload = json.loads(body.decode("utf-8")) + except Exception as e: + log.exception(e) + raise HTTPException(status_code=400, detail="Invalid JSON payload") + + import azure.cognitiveservices.speech as speechsdk + + config = speechsdk.SpeechConfig(subscription=app.state.config.TTS_API_KEY, region="uksouth") + speaker_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=False, filename=str(file_path)) + + client = speechsdk.SpeechSynthesizer(speech_config=config, audio_config=speaker_config) + result = client.speak_text(payload["input"]) + + if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted: + return FileResponse(file_path) + else: + raise HTTPException( + status_code=500, + detail=f"Error synthesizing speech - {result.reason}") + + @app.post("/transcriptions") def transcribe( diff --git a/backend/requirements.txt b/backend/requirements.txt index ba1252f56..6fa289b0a 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -79,6 +79,8 @@ extract_msg pydub duckduckgo-search~=6.2.11 +azure-cognitiveservices-speech==1.40.0 + ## Tests docker~=7.1.0 pytest~=8.3.2 From eacb69074e1f5c2a71fa09c8f5079c7ffef7743d Mon Sep 17 00:00:00 2001 From: Pawel Ochman <pawel.ochman@amius.com> Date: Wed, 18 Sep 2024 12:24:55 +0100 Subject: [PATCH 3/6] remove dependency and migrate to raw rest calls --- backend/open_webui/apps/audio/main.py | 27 ++++++++++++++++++++------- backend/requirements.txt | 2 -- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/backend/open_webui/apps/audio/main.py b/backend/open_webui/apps/audio/main.py index 54b5e7d79..bf6ff15e6 100644 --- a/backend/open_webui/apps/audio/main.py +++ b/backend/open_webui/apps/audio/main.py @@ -309,20 +309,33 @@ async def speech(request: Request, user=Depends(get_verified_user)): log.exception(e) raise HTTPException(status_code=400, detail="Invalid JSON payload") - import azure.cognitiveservices.speech as speechsdk + region = "uksouth" + language = 
"en-GB-SoniaNeural" + locale = "en-GB" + output_format = "audio-24khz-160kbitrate-mono-mp3" + url = f"https://{region}.tts.speech.microsoft.com/cognitiveservices/v1" - config = speechsdk.SpeechConfig(subscription=app.state.config.TTS_API_KEY, region="uksouth") - speaker_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=False, filename=str(file_path)) + headers = { + 'Ocp-Apim-Subscription-Key': app.state.config.TTS_API_KEY, + 'Content-Type': 'application/ssml+xml', + 'X-Microsoft-OutputFormat': output_format + } - client = speechsdk.SpeechSynthesizer(speech_config=config, audio_config=speaker_config) - result = client.speak_text(payload["input"]) + data = f"""<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="{locale}"> + <voice name="{language}">{payload["input"]}</voice> + </speak>""" - if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted: + response = requests.post(url, headers=headers, data=data) + + if response.status_code == 200: + with open(file_path, "wb") as f: + f.write(response.content) return FileResponse(file_path) else: + log.error(f"Error synthesizing speech - {response.reason}") raise HTTPException( status_code=500, - detail=f"Error synthesizing speech - {result.reason}") + detail=f"Error synthesizing speech - {response.reason}") diff --git a/backend/requirements.txt b/backend/requirements.txt index 6fa289b0a..ba1252f56 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -79,8 +79,6 @@ extract_msg pydub duckduckgo-search~=6.2.11 -azure-cognitiveservices-speech==1.40.0 - ## Tests docker~=7.1.0 pytest~=8.3.2 From 4d9677e8082737f9b78e0103e0df409587f7cd81 Mon Sep 17 00:00:00 2001 From: Pawel Ochman <pawel.ochman@amius.com> Date: Wed, 18 Sep 2024 14:13:42 +0100 Subject: [PATCH 4/6] Update configuration page, expose all Azure settings through ENV variables --- backend/open_webui/apps/audio/main.py | 37 ++++- backend/open_webui/config.py | 12 ++ 
.../components/admin/Settings/Audio.svelte | 133 ++++++++++++------ 3 files changed, 138 insertions(+), 44 deletions(-) diff --git a/backend/open_webui/apps/audio/main.py b/backend/open_webui/apps/audio/main.py index bf6ff15e6..0d389daf2 100644 --- a/backend/open_webui/apps/audio/main.py +++ b/backend/open_webui/apps/audio/main.py @@ -19,6 +19,8 @@ from open_webui.config import ( AUDIO_TTS_OPENAI_API_KEY, AUDIO_TTS_SPLIT_ON, AUDIO_TTS_VOICE, + AUDIO_TTS_AZURE_SPEECH_REGION, + AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT, CACHE_DIR, CORS_ALLOW_ORIGIN, WHISPER_MODEL, @@ -62,6 +64,9 @@ app.state.config.TTS_VOICE = AUDIO_TTS_VOICE app.state.config.TTS_API_KEY = AUDIO_TTS_API_KEY app.state.config.TTS_SPLIT_ON = AUDIO_TTS_SPLIT_ON +app.state.config.TTS_AZURE_SPEECH_REGION = AUDIO_TTS_AZURE_SPEECH_REGION +app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT = AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT + # setting device type for whisper model whisper_device_type = DEVICE_TYPE if DEVICE_TYPE and DEVICE_TYPE == "cuda" else "cpu" log.info(f"whisper_device_type: {whisper_device_type}") @@ -78,6 +83,8 @@ class TTSConfigForm(BaseModel): MODEL: str VOICE: str SPLIT_ON: str + AZURE_SPEECH_REGION: str + AZURE_SPEECH_OUTPUT_FORMAT: str class STTConfigForm(BaseModel): @@ -130,6 +137,8 @@ async def get_audio_config(user=Depends(get_admin_user)): "MODEL": app.state.config.TTS_MODEL, "VOICE": app.state.config.TTS_VOICE, "SPLIT_ON": app.state.config.TTS_SPLIT_ON, + "AZURE_SPEECH_REGION": app.state.config.TTS_AZURE_SPEECH_REGION, + "AZURE_SPEECH_OUTPUT_FORMAT": app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT, }, "stt": { "OPENAI_API_BASE_URL": app.state.config.STT_OPENAI_API_BASE_URL, @@ -151,6 +160,8 @@ async def update_audio_config( app.state.config.TTS_MODEL = form_data.tts.MODEL app.state.config.TTS_VOICE = form_data.tts.VOICE app.state.config.TTS_SPLIT_ON = form_data.tts.SPLIT_ON + app.state.config.TTS_AZURE_SPEECH_REGION = form_data.tts.AZURE_SPEECH_REGION + 
app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT = form_data.tts.AZURE_SPEECH_OUTPUT_FORMAT app.state.config.STT_OPENAI_API_BASE_URL = form_data.stt.OPENAI_API_BASE_URL app.state.config.STT_OPENAI_API_KEY = form_data.stt.OPENAI_API_KEY @@ -166,6 +177,8 @@ async def update_audio_config( "MODEL": app.state.config.TTS_MODEL, "VOICE": app.state.config.TTS_VOICE, "SPLIT_ON": app.state.config.TTS_SPLIT_ON, + "AZURE_SPEECH_REGION": app.state.config.TTS_AZURE_SPEECH_REGION, + "AZURE_SPEECH_OUTPUT_FORMAT": app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT, }, "stt": { "OPENAI_API_BASE_URL": app.state.config.STT_OPENAI_API_BASE_URL, @@ -309,10 +322,10 @@ async def speech(request: Request, user=Depends(get_verified_user)): log.exception(e) raise HTTPException(status_code=400, detail="Invalid JSON payload") - region = "uksouth" - language = "en-GB-SoniaNeural" - locale = "en-GB" - output_format = "audio-24khz-160kbitrate-mono-mp3" + region = app.state.config.TTS_AZURE_SPEECH_REGION + language = app.state.config.TTS_VOICE + locale = "-".join(app.state.config.TTS_VOICE.split("-")[:1]) + output_format = app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT url = f"https://{region}.tts.speech.microsoft.com/cognitiveservices/v1" headers = { @@ -515,6 +528,22 @@ def get_available_voices() -> dict: except Exception: # Avoided @lru_cache with exception pass + elif app.state.config.TTS_ENGINE == "azurespeechservice": + try: + region = app.state.config.TTS_AZURE_SPEECH_REGION + url = f"https://{region}.tts.speech.microsoft.com/cognitiveservices/voices/list" + headers = { + 'Ocp-Apim-Subscription-Key': app.state.config.TTS_API_KEY + } + + response = requests.get(url, headers=headers) + response.raise_for_status() + voices = response.json() + for voice in voices: + ret[voice['ShortName']] = f"{voice['DisplayName']} ({voice['ShortName']})" + except requests.RequestException as e: + log.error(f"Error fetching voices: {str(e)}") + return ret diff --git a/backend/open_webui/config.py 
b/backend/open_webui/config.py index 439e82e43..c7c78b8e6 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -1472,3 +1472,15 @@ AUDIO_TTS_SPLIT_ON = PersistentConfig( "audio.tts.split_on", os.getenv("AUDIO_TTS_SPLIT_ON", "punctuation"), ) + +AUDIO_TTS_AZURE_SPEECH_REGION = PersistentConfig( + "AUDIO_TTS_AZURE_SPEECH_REGION", + "audio.tts.azure_speech_region", + os.getenv("AUDIO_TTS_AZURE_SPEECH_REGION", "uksouth"), +) + +AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT = PersistentConfig( + "AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT", + "audio.tts.azure_speech_output_format", + os.getenv("AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT", 'audio-24khz-160kbitrate-mono-mp3'), +) diff --git a/src/lib/components/admin/Settings/Audio.svelte b/src/lib/components/admin/Settings/Audio.svelte index 5a9e91271..15db5a62d 100644 --- a/src/lib/components/admin/Settings/Audio.svelte +++ b/src/lib/components/admin/Settings/Audio.svelte @@ -31,6 +31,8 @@ let TTS_MODEL = ''; let TTS_VOICE = ''; let TTS_SPLIT_ON: TTS_RESPONSE_SPLIT = TTS_RESPONSE_SPLIT.PUNCTUATION; + let TTS_AZURE_SPEECH_REGION = ''; + let TTS_AZURE_SPEECH_OUTPUT_FORMAT = ''; let STT_OPENAI_API_BASE_URL = ''; let STT_OPENAI_API_KEY = ''; @@ -87,7 +89,9 @@ ENGINE: TTS_ENGINE, MODEL: TTS_MODEL, VOICE: TTS_VOICE, - SPLIT_ON: TTS_SPLIT_ON + SPLIT_ON: TTS_SPLIT_ON, + AZURE_SPEECH_REGION: TTS_AZURE_SPEECH_REGION, + AZURE_SPEECH_OUTPUT_FORMAT: TTS_AZURE_SPEECH_OUTPUT_FORMAT, }, stt: { OPENAI_API_BASE_URL: STT_OPENAI_API_BASE_URL, @@ -120,6 +124,9 @@ TTS_SPLIT_ON = res.tts.SPLIT_ON || TTS_RESPONSE_SPLIT.PUNCTUATION; + TTS_AZURE_SPEECH_OUTPUT_FORMAT = res.tts.AZURE_SPEECH_OUTPUT_FORMAT; + TTS_AZURE_SPEECH_REGION = res.tts.AZURE_SPEECH_REGION; + STT_OPENAI_API_BASE_URL = res.stt.OPENAI_API_BASE_URL; STT_OPENAI_API_KEY = res.stt.OPENAI_API_KEY; @@ -262,6 +269,12 @@ bind:value={TTS_API_KEY} required /> + <input + class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none" + 
placeholder={$i18n.t('Azure Region')} + bind:value={TTS_AZURE_SPEECH_REGION} + required + /> </div> </div> {/if} @@ -330,48 +343,88 @@ </div> </div> </div> - {:else if TTS_ENGINE === 'elevenlabs'} - <div class=" flex gap-2"> - <div class="w-full"> - <div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div> - <div class="flex w-full"> - <div class="flex-1"> - <input - list="voice-list" - class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none" - bind:value={TTS_VOICE} - placeholder="Select a voice" - /> - - <datalist id="voice-list"> - {#each voices as voice} - <option value={voice.id}>{voice.name}</option> - {/each} - </datalist> + {:else if TTS_ENGINE === 'elevenlabs'} + <div class=" flex gap-2"> + <div class="w-full"> + <div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div> + <div class="flex w-full"> + <div class="flex-1"> + <input + list="voice-list" + class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none" + bind:value={TTS_VOICE} + placeholder="Select a voice" + /> + + <datalist id="voice-list"> + {#each voices as voice} + <option value={voice.id}>{voice.name}</option> + {/each} + </datalist> + </div> + </div> + </div> + <div class="w-full"> + <div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Model')}</div> + <div class="flex w-full"> + <div class="flex-1"> + <input + list="tts-model-list" + class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none" + bind:value={TTS_MODEL} + placeholder="Select a model" + /> + + <datalist id="tts-model-list"> + {#each models as model} + <option value={model.id} /> + {/each} + </datalist> + </div> + </div> + </div> + </div> + {:else if TTS_ENGINE === 'azurespeechservice'} + <div class=" flex gap-2"> + <div class="w-full"> + <div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div> + <div class="flex w-full"> + <div class="flex-1"> + <input + 
list="voice-list" + class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none" + bind:value={TTS_VOICE} + placeholder="Select a voice" + /> + + <datalist id="voice-list"> + {#each voices as voice} + <option value={voice.id}>{voice.name}</option> + {/each} + </datalist> + </div> + </div> + </div> + <div class="w-full"> + <div class=" mb-1.5 text-sm font-medium"> + {$i18n.t('Output format')} + <a href="https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech?tabs=streaming#audio-outputs" target="_blank" > + <small>{$i18n.t('Available list')}</small> + </a> + </div> + <div class="flex w-full"> + <div class="flex-1"> + <input + list="tts-model-list" + class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none" + bind:value={TTS_AZURE_SPEECH_OUTPUT_FORMAT} + placeholder="Select a output format" + /> + </div> </div> </div> </div> - <div class="w-full"> - <div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Model')}</div> - <div class="flex w-full"> - <div class="flex-1"> - <input - list="tts-model-list" - class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none" - bind:value={TTS_MODEL} - placeholder="Select a model" - /> - - <datalist id="tts-model-list"> - {#each models as model} - <option value={model.id} /> - {/each} - </datalist> - </div> - </div> - </div> - </div> - {/if} + {/if} <hr class="dark:border-gray-850 my-2" /> From afa42dd2e4266fb4c371b403a6b73839deea4366 Mon Sep 17 00:00:00 2001 From: "Timothy J. 
Baek" <timothyjrbeck@gmail.com> Date: Thu, 19 Sep 2024 02:40:54 +0200 Subject: [PATCH 5/6] refac --- backend/open_webui/apps/audio/main.py | 28 +-- .../components/admin/Settings/Audio.svelte | 169 +++++++++--------- 2 files changed, 100 insertions(+), 97 deletions(-) diff --git a/backend/open_webui/apps/audio/main.py b/backend/open_webui/apps/audio/main.py index 0d389daf2..0eee533bd 100644 --- a/backend/open_webui/apps/audio/main.py +++ b/backend/open_webui/apps/audio/main.py @@ -161,7 +161,9 @@ async def update_audio_config( app.state.config.TTS_VOICE = form_data.tts.VOICE app.state.config.TTS_SPLIT_ON = form_data.tts.SPLIT_ON app.state.config.TTS_AZURE_SPEECH_REGION = form_data.tts.AZURE_SPEECH_REGION - app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT = form_data.tts.AZURE_SPEECH_OUTPUT_FORMAT + app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT = ( + form_data.tts.AZURE_SPEECH_OUTPUT_FORMAT + ) app.state.config.STT_OPENAI_API_BASE_URL = form_data.stt.OPENAI_API_BASE_URL app.state.config.STT_OPENAI_API_KEY = form_data.stt.OPENAI_API_KEY @@ -314,7 +316,7 @@ async def speech(request: Request, user=Depends(get_verified_user)): detail=error_detail, ) - elif app.state.config.TTS_ENGINE == "azurespeechservice": + elif app.state.config.TTS_ENGINE == "azure": payload = None try: payload = json.loads(body.decode("utf-8")) @@ -329,9 +331,9 @@ async def speech(request: Request, user=Depends(get_verified_user)): url = f"https://{region}.tts.speech.microsoft.com/cognitiveservices/v1" headers = { - 'Ocp-Apim-Subscription-Key': app.state.config.TTS_API_KEY, - 'Content-Type': 'application/ssml+xml', - 'X-Microsoft-OutputFormat': output_format + "Ocp-Apim-Subscription-Key": app.state.config.TTS_API_KEY, + "Content-Type": "application/ssml+xml", + "X-Microsoft-OutputFormat": output_format, } data = f"""<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="{locale}"> @@ -347,9 +349,8 @@ async def speech(request: Request, user=Depends(get_verified_user)): else: 
log.error(f"Error synthesizing speech - {response.reason}") raise HTTPException( - status_code=500, - detail=f"Error synthesizing speech - {response.reason}") - + status_code=500, detail=f"Error synthesizing speech - {response.reason}" + ) @app.post("/transcriptions") @@ -528,23 +529,22 @@ def get_available_voices() -> dict: except Exception: # Avoided @lru_cache with exception pass - elif app.state.config.TTS_ENGINE == "azurespeechservice": + elif app.state.config.TTS_ENGINE == "azure": try: region = app.state.config.TTS_AZURE_SPEECH_REGION url = f"https://{region}.tts.speech.microsoft.com/cognitiveservices/voices/list" - headers = { - 'Ocp-Apim-Subscription-Key': app.state.config.TTS_API_KEY - } + headers = {"Ocp-Apim-Subscription-Key": app.state.config.TTS_API_KEY} response = requests.get(url, headers=headers) response.raise_for_status() voices = response.json() for voice in voices: - ret[voice['ShortName']] = f"{voice['DisplayName']} ({voice['ShortName']})" + ret[voice["ShortName"]] = ( + f"{voice['DisplayName']} ({voice['ShortName']})" + ) except requests.RequestException as e: log.error(f"Error fetching voices: {str(e)}") - return ret diff --git a/src/lib/components/admin/Settings/Audio.svelte b/src/lib/components/admin/Settings/Audio.svelte index 15db5a62d..040bc5e1a 100644 --- a/src/lib/components/admin/Settings/Audio.svelte +++ b/src/lib/components/admin/Settings/Audio.svelte @@ -91,7 +91,7 @@ VOICE: TTS_VOICE, SPLIT_ON: TTS_SPLIT_ON, AZURE_SPEECH_REGION: TTS_AZURE_SPEECH_REGION, - AZURE_SPEECH_OUTPUT_FORMAT: TTS_AZURE_SPEECH_OUTPUT_FORMAT, + AZURE_SPEECH_OUTPUT_FORMAT: TTS_AZURE_SPEECH_OUTPUT_FORMAT }, stt: { OPENAI_API_BASE_URL: STT_OPENAI_API_BASE_URL, @@ -231,7 +231,7 @@ <option value="">{$i18n.t('Web API')}</option> <option value="openai">{$i18n.t('OpenAI')}</option> <option value="elevenlabs">{$i18n.t('ElevenLabs')}</option> - <option value="azurespeechservice">{$i18n.t('Azure Speech service')}</option> + <option value="azure">{$i18n.t('Azure AI 
Speech')}</option> </select> </div> </div> @@ -260,7 +260,7 @@ /> </div> </div> - {:else if TTS_ENGINE === 'azurespeechservice'} + {:else if TTS_ENGINE === 'azure'} <div> <div class="mt-1 flex gap-2 mb-1"> <input @@ -276,7 +276,7 @@ required /> </div> - </div> + </div> {/if} <hr class=" dark:border-gray-850 my-2" /> @@ -343,88 +343,91 @@ </div> </div> </div> - {:else if TTS_ENGINE === 'elevenlabs'} - <div class=" flex gap-2"> - <div class="w-full"> - <div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div> - <div class="flex w-full"> - <div class="flex-1"> - <input - list="voice-list" - class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none" - bind:value={TTS_VOICE} - placeholder="Select a voice" - /> - - <datalist id="voice-list"> - {#each voices as voice} - <option value={voice.id}>{voice.name}</option> - {/each} - </datalist> - </div> - </div> - </div> - <div class="w-full"> - <div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Model')}</div> - <div class="flex w-full"> - <div class="flex-1"> - <input - list="tts-model-list" - class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none" - bind:value={TTS_MODEL} - placeholder="Select a model" - /> - - <datalist id="tts-model-list"> - {#each models as model} - <option value={model.id} /> - {/each} - </datalist> - </div> - </div> - </div> - </div> - {:else if TTS_ENGINE === 'azurespeechservice'} - <div class=" flex gap-2"> - <div class="w-full"> - <div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div> - <div class="flex w-full"> - <div class="flex-1"> - <input - list="voice-list" - class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none" - bind:value={TTS_VOICE} - placeholder="Select a voice" - /> - - <datalist id="voice-list"> - {#each voices as voice} - <option value={voice.id}>{voice.name}</option> - {/each} - </datalist> - </div> - </div> - 
</div> - <div class="w-full"> - <div class=" mb-1.5 text-sm font-medium"> - {$i18n.t('Output format')} - <a href="https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech?tabs=streaming#audio-outputs" target="_blank" > - <small>{$i18n.t('Available list')}</small> - </a> - </div> - <div class="flex w-full"> - <div class="flex-1"> - <input - list="tts-model-list" - class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none" - bind:value={TTS_AZURE_SPEECH_OUTPUT_FORMAT} - placeholder="Select a output format" - /> - </div> + {:else if TTS_ENGINE === 'elevenlabs'} + <div class=" flex gap-2"> + <div class="w-full"> + <div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div> + <div class="flex w-full"> + <div class="flex-1"> + <input + list="voice-list" + class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none" + bind:value={TTS_VOICE} + placeholder="Select a voice" + /> + + <datalist id="voice-list"> + {#each voices as voice} + <option value={voice.id}>{voice.name}</option> + {/each} + </datalist> </div> </div> </div> - {/if} + <div class="w-full"> + <div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Model')}</div> + <div class="flex w-full"> + <div class="flex-1"> + <input + list="tts-model-list" + class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none" + bind:value={TTS_MODEL} + placeholder="Select a model" + /> + + <datalist id="tts-model-list"> + {#each models as model} + <option value={model.id} /> + {/each} + </datalist> + </div> + </div> + </div> + </div> + {:else if TTS_ENGINE === 'azure'} + <div class=" flex gap-2"> + <div class="w-full"> + <div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div> + <div class="flex w-full"> + <div class="flex-1"> + <input + list="voice-list" + class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 
outline-none" + bind:value={TTS_VOICE} + placeholder="Select a voice" + /> + + <datalist id="voice-list"> + {#each voices as voice} + <option value={voice.id}>{voice.name}</option> + {/each} + </datalist> + </div> + </div> + </div> + <div class="w-full"> + <div class=" mb-1.5 text-sm font-medium"> + {$i18n.t('Output format')} + <a + href="https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech?tabs=streaming#audio-outputs" + target="_blank" + > + <small>{$i18n.t('Available list')}</small> + </a> + </div> + <div class="flex w-full"> + <div class="flex-1"> + <input + list="tts-model-list" + class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none" + bind:value={TTS_AZURE_SPEECH_OUTPUT_FORMAT} + placeholder="Select a output format" + /> + </div> + </div> + </div> + </div> + {/if} <hr class="dark:border-gray-850 my-2" /> From b4f1a0b5a6c58984bd8162f3497902a0f89e3a13 Mon Sep 17 00:00:00 2001 From: "Timothy J. Baek" <timothyjrbeck@gmail.com> Date: Thu, 19 Sep 2024 02:42:24 +0200 Subject: [PATCH 6/6] refac --- backend/open_webui/config.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index c7c78b8e6..7ad10ccdc 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -1475,12 +1475,14 @@ AUDIO_TTS_SPLIT_ON = PersistentConfig( AUDIO_TTS_AZURE_SPEECH_REGION = PersistentConfig( "AUDIO_TTS_AZURE_SPEECH_REGION", - "audio.tts.azure_speech_region", - os.getenv("AUDIO_TTS_AZURE_SPEECH_REGION", "uksouth"), + "audio.tts.azure.speech_region", + os.getenv("AUDIO_TTS_AZURE_SPEECH_REGION", "eastus"), ) AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT = PersistentConfig( "AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT", - "audio.tts.azure_speech_output_format", - os.getenv("AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT", 'audio-24khz-160kbitrate-mono-mp3'), + "audio.tts.azure.speech_output_format", + os.getenv( + 
"AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT", "audio-24khz-160kbitrate-mono-mp3" + ), )