diff --git a/backend/apps/audio/main.py b/backend/apps/audio/main.py index f866d867f..c565bf481 100644 --- a/backend/apps/audio/main.py +++ b/backend/apps/audio/main.py @@ -43,6 +43,7 @@ from config import ( AUDIO_STT_OPENAI_API_KEY, AUDIO_TTS_OPENAI_API_BASE_URL, AUDIO_TTS_OPENAI_API_KEY, + AUDIO_TTS_API_KEY, AUDIO_STT_ENGINE, AUDIO_STT_MODEL, AUDIO_TTS_ENGINE, @@ -75,6 +76,7 @@ app.state.config.TTS_OPENAI_API_KEY = AUDIO_TTS_OPENAI_API_KEY app.state.config.TTS_ENGINE = AUDIO_TTS_ENGINE app.state.config.TTS_MODEL = AUDIO_TTS_MODEL app.state.config.TTS_VOICE = AUDIO_TTS_VOICE +app.state.config.TTS_API_KEY = AUDIO_TTS_API_KEY # setting device type for whisper model whisper_device_type = DEVICE_TYPE if DEVICE_TYPE and DEVICE_TYPE == "cuda" else "cpu" @@ -87,6 +89,7 @@ SPEECH_CACHE_DIR.mkdir(parents=True, exist_ok=True) class TTSConfigForm(BaseModel): OPENAI_API_BASE_URL: str OPENAI_API_KEY: str + API_KEY: str ENGINE: str MODEL: str VOICE: str @@ -137,6 +140,7 @@ async def get_audio_config(user=Depends(get_admin_user)): "tts": { "OPENAI_API_BASE_URL": app.state.config.TTS_OPENAI_API_BASE_URL, "OPENAI_API_KEY": app.state.config.TTS_OPENAI_API_KEY, + "API_KEY": app.state.config.TTS_API_KEY, "ENGINE": app.state.config.TTS_ENGINE, "MODEL": app.state.config.TTS_MODEL, "VOICE": app.state.config.TTS_VOICE, @@ -156,6 +160,7 @@ async def update_audio_config( ): app.state.config.TTS_OPENAI_API_BASE_URL = form_data.tts.OPENAI_API_BASE_URL app.state.config.TTS_OPENAI_API_KEY = form_data.tts.OPENAI_API_KEY + app.state.config.TTS_API_KEY = form_data.tts.API_KEY app.state.config.TTS_ENGINE = form_data.tts.ENGINE app.state.config.TTS_MODEL = form_data.tts.MODEL app.state.config.TTS_VOICE = form_data.tts.VOICE @@ -169,6 +174,7 @@ async def update_audio_config( "tts": { "OPENAI_API_BASE_URL": app.state.config.TTS_OPENAI_API_BASE_URL, "OPENAI_API_KEY": app.state.config.TTS_OPENAI_API_KEY, + "API_KEY": app.state.config.TTS_API_KEY, "ENGINE": app.state.config.TTS_ENGINE, "MODEL": app.state.config.TTS_MODEL, "VOICE": app.state.config.TTS_VOICE, @@ -194,55 +200,111 @@ async def speech(request: Request, user=Depends(get_verified_user)): if file_path.is_file(): return FileResponse(file_path) - headers = {} - headers["Authorization"] = f"Bearer {app.state.config.TTS_OPENAI_API_KEY}" - headers["Content-Type"] = "application/json" + if app.state.config.TTS_ENGINE == "openai": + headers = {} + headers["Authorization"] = f"Bearer {app.state.config.TTS_OPENAI_API_KEY}" + headers["Content-Type"] = "application/json" - try: - body = body.decode("utf-8") - body = json.loads(body) - body["model"] = app.state.config.TTS_MODEL - body = json.dumps(body).encode("utf-8") - except Exception as e: - pass + try: + body = body.decode("utf-8") + body = json.loads(body) + body["model"] = app.state.config.TTS_MODEL + body = json.dumps(body).encode("utf-8") + except Exception as e: + pass - r = None - try: - r = requests.post( - url=f"{app.state.config.TTS_OPENAI_API_BASE_URL}/audio/speech", - data=body, - headers=headers, - stream=True, - ) + r = None + try: + r = requests.post( + url=f"{app.state.config.TTS_OPENAI_API_BASE_URL}/audio/speech", + data=body, + headers=headers, + stream=True, + ) - r.raise_for_status() + r.raise_for_status() - # Save the streaming content to a file - with open(file_path, "wb") as f: - for chunk in r.iter_content(chunk_size=8192): - f.write(chunk) + # Save the streaming content to a file + with open(file_path, "wb") as f: + for chunk in r.iter_content(chunk_size=8192): + f.write(chunk) - with open(file_body_path, "w") as f: - json.dump(json.loads(body.decode("utf-8")), f) + with open(file_body_path, "w") as f: + json.dump(json.loads(body.decode("utf-8")), f) - # Return the saved file - return FileResponse(file_path) + # Return the saved file + return FileResponse(file_path) - except Exception as e: - log.exception(e) - error_detail = "Open WebUI: Server Connection Error" - if r is not None: - try: - res = r.json() - if "error" in res: - error_detail = f"External: {res['error']['message']}" - except: - error_detail = f"External: {e}" + except Exception as e: + log.exception(e) + error_detail = "Open WebUI: Server Connection Error" + if r is not None: + try: + res = r.json() + if "error" in res: + error_detail = f"External: {res['error']['message']}" + except: + error_detail = f"External: {e}" - raise HTTPException( - status_code=r.status_code if r != None else 500, - detail=error_detail, - ) + raise HTTPException( + status_code=r.status_code if r != None else 500, + detail=error_detail, + ) + + elif app.state.config.TTS_ENGINE == "elevenlabs": + + payload = None + try: + payload = json.loads(body.decode("utf-8")) + except Exception as e: + log.exception(e) + pass + + url = f"https://api.elevenlabs.io/v1/text-to-speech/{payload['voice']}" + + headers = { + "Accept": "audio/mpeg", + "Content-Type": "application/json", + "xi-api-key": app.state.config.TTS_API_KEY, + } + + data = { + "text": payload["input"], + "model_id": app.state.config.TTS_MODEL, + "voice_settings": {"stability": 0.5, "similarity_boost": 0.5}, + } + + try: + r = requests.post(url, json=data, headers=headers) + + r.raise_for_status() + + # Save the streaming content to a file + with open(file_path, "wb") as f: + for chunk in r.iter_content(chunk_size=8192): + f.write(chunk) + + with open(file_body_path, "w") as f: + json.dump(json.loads(body.decode("utf-8")), f) + + # Return the saved file + return FileResponse(file_path) + + except Exception as e: + log.exception(e) + error_detail = "Open WebUI: Server Connection Error" + if r is not None: + try: + res = r.json() + if "error" in res: + error_detail = f"External: {res['error']['message']}" + except: + error_detail = f"External: {e}" + + raise HTTPException( + status_code=r.status_code if r != None else 500, + detail=error_detail, + ) @app.post("/transcriptions") diff --git a/backend/config.py b/backend/config.py index 0ee8f2aac..be64f6e6d 100644 --- a/backend/config.py +++ b/backend/config.py @@ -1339,6 +1339,11 @@ AUDIO_TTS_OPENAI_API_KEY = PersistentConfig( os.getenv("AUDIO_TTS_OPENAI_API_KEY", OPENAI_API_KEY), ) +AUDIO_TTS_API_KEY = PersistentConfig( + "AUDIO_TTS_API_KEY", + "audio.tts.api_key", + os.getenv("AUDIO_TTS_API_KEY", ""), +) AUDIO_TTS_ENGINE = PersistentConfig( "AUDIO_TTS_ENGINE", diff --git a/src/lib/components/admin/Settings/Audio.svelte b/src/lib/components/admin/Settings/Audio.svelte index 5ae802202..8be5ee3ef 100644 --- a/src/lib/components/admin/Settings/Audio.svelte +++ b/src/lib/components/admin/Settings/Audio.svelte @@ -16,6 +16,7 @@ let TTS_OPENAI_API_BASE_URL = ''; let TTS_OPENAI_API_KEY = ''; + let TTS_API_KEY = ''; let TTS_ENGINE = ''; let TTS_MODEL = ''; let TTS_VOICE = ''; @@ -60,6 +61,7 @@ tts: { OPENAI_API_BASE_URL: TTS_OPENAI_API_BASE_URL, OPENAI_API_KEY: TTS_OPENAI_API_KEY, + TTS_API_KEY: TTS_API_KEY, ENGINE: TTS_ENGINE, MODEL: TTS_MODEL, VOICE: TTS_VOICE @@ -86,6 +88,7 @@ console.log(res); TTS_OPENAI_API_BASE_URL = res.tts.OPENAI_API_BASE_URL; TTS_OPENAI_API_KEY = res.tts.OPENAI_API_KEY; + TTS_API_KEY = res.tts.TTS_API_KEY; TTS_ENGINE = res.tts.ENGINE; TTS_MODEL = res.tts.MODEL; @@ -190,11 +193,13 @@ } else { getWebAPIVoices(); TTS_VOICE = ''; + TTS_MODEL = ''; } }} > + @@ -212,6 +217,17 @@ + {:else if TTS_ENGINE === 'elevenlabs'} +
+
+ +
+
{/if}
@@ -278,6 +294,47 @@ + {:else if TTS_ENGINE === 'elevenlabs'} +
+
+
{$i18n.t('TTS Voice')}
+
+
+ + + + {#each voices as voice} + +
+
+
+
+
{$i18n.t('TTS Model')}
+
+
+ + + + {#each models as model} + +
+
+
+
{/if}