diff --git a/backend/open_webui/apps/audio/main.py b/backend/open_webui/apps/audio/main.py
index 54b5e7d79..bf6ff15e6 100644
--- a/backend/open_webui/apps/audio/main.py
+++ b/backend/open_webui/apps/audio/main.py
@@ -309,20 +309,47 @@ async def speech(request: Request, user=Depends(get_verified_user)):
             log.exception(e)
             raise HTTPException(status_code=400, detail="Invalid JSON payload")

-        import azure.cognitiveservices.speech as speechsdk
+        from xml.sax.saxutils import escape
+
+        # NOTE(review): region, voice, locale and output format are hard-coded;
+        # consider reading them from app.state.config like TTS_API_KEY.
+        region = "uksouth"
+        language = "en-GB-SoniaNeural"
+        locale = "en-GB"
+        output_format = "audio-24khz-160kbitrate-mono-mp3"
+        url = f"https://{region}.tts.speech.microsoft.com/cognitiveservices/v1"

-        config = speechsdk.SpeechConfig(subscription=app.state.config.TTS_API_KEY, region="uksouth")
-        speaker_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=False, filename=str(file_path))
+        headers = {
+            'Ocp-Apim-Subscription-Key': app.state.config.TTS_API_KEY,
+            'Content-Type': 'application/ssml+xml',
+            'X-Microsoft-OutputFormat': output_format
+        }

-        client = speechsdk.SpeechSynthesizer(speech_config=config, audio_config=speaker_config)
-        result = client.speak_text(payload["input"])
+        # The request is declared as application/ssml+xml, so the body must be
+        # an SSML document. The user-supplied text is XML-escaped so that
+        # characters such as "<" or "&" cannot break or alter the markup.
+        data = (
+            f'<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="{locale}">'
+            f'<voice name="{language}">{escape(str(payload["input"]))}</voice>'
+            "</speak>"
+        )

-        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
+        # Encode explicitly: a str body with non-Latin-1 characters would fail
+        # to encode inside the HTTP layer. The timeout keeps a stalled upstream
+        # request from hanging this handler indefinitely.
+        response = requests.post(
+            url, headers=headers, data=data.encode("utf-8"), timeout=60
+        )
+
+        if response.status_code == 200:
+            with open(file_path, "wb") as f:
+                f.write(response.content)
             return FileResponse(file_path)
         else:
+            log.error(f"Error synthesizing speech - {response.reason}")
             raise HTTPException(
                 status_code=500,
-                detail=f"Error synthesizing speech - {result.reason}")
+                detail=f"Error synthesizing speech - {response.reason}")



diff --git a/backend/requirements.txt b/backend/requirements.txt
index 6fa289b0a..ba1252f56 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -79,8 +79,6 @@ extract_msg
 pydub
 duckduckgo-search~=6.2.11

-azure-cognitiveservices-speech==1.40.0
-
 ## Tests
 docker~=7.1.0
 pytest~=8.3.2