mirror of
https://github.com/open-webui/open-webui
synced 2025-03-23 22:31:38 +00:00
Merge pull request #5496 from pawel-ochman/azure-tts
Integrate Azure Speech service for TTS
This commit is contained in:
commit
5d3a89dd25
@ -19,6 +19,8 @@ from open_webui.config import (
|
|||||||
AUDIO_TTS_OPENAI_API_KEY,
|
AUDIO_TTS_OPENAI_API_KEY,
|
||||||
AUDIO_TTS_SPLIT_ON,
|
AUDIO_TTS_SPLIT_ON,
|
||||||
AUDIO_TTS_VOICE,
|
AUDIO_TTS_VOICE,
|
||||||
|
AUDIO_TTS_AZURE_SPEECH_REGION,
|
||||||
|
AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT,
|
||||||
CACHE_DIR,
|
CACHE_DIR,
|
||||||
CORS_ALLOW_ORIGIN,
|
CORS_ALLOW_ORIGIN,
|
||||||
WHISPER_MODEL,
|
WHISPER_MODEL,
|
||||||
@ -62,6 +64,9 @@ app.state.config.TTS_VOICE = AUDIO_TTS_VOICE
|
|||||||
app.state.config.TTS_API_KEY = AUDIO_TTS_API_KEY
|
app.state.config.TTS_API_KEY = AUDIO_TTS_API_KEY
|
||||||
app.state.config.TTS_SPLIT_ON = AUDIO_TTS_SPLIT_ON
|
app.state.config.TTS_SPLIT_ON = AUDIO_TTS_SPLIT_ON
|
||||||
|
|
||||||
|
app.state.config.TTS_AZURE_SPEECH_REGION = AUDIO_TTS_AZURE_SPEECH_REGION
|
||||||
|
app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT = AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT
|
||||||
|
|
||||||
# setting device type for whisper model
|
# setting device type for whisper model
|
||||||
whisper_device_type = DEVICE_TYPE if DEVICE_TYPE and DEVICE_TYPE == "cuda" else "cpu"
|
whisper_device_type = DEVICE_TYPE if DEVICE_TYPE and DEVICE_TYPE == "cuda" else "cpu"
|
||||||
log.info(f"whisper_device_type: {whisper_device_type}")
|
log.info(f"whisper_device_type: {whisper_device_type}")
|
||||||
@ -78,6 +83,8 @@ class TTSConfigForm(BaseModel):
|
|||||||
MODEL: str
|
MODEL: str
|
||||||
VOICE: str
|
VOICE: str
|
||||||
SPLIT_ON: str
|
SPLIT_ON: str
|
||||||
|
AZURE_SPEECH_REGION: str
|
||||||
|
AZURE_SPEECH_OUTPUT_FORMAT: str
|
||||||
|
|
||||||
|
|
||||||
class STTConfigForm(BaseModel):
|
class STTConfigForm(BaseModel):
|
||||||
@ -130,6 +137,8 @@ async def get_audio_config(user=Depends(get_admin_user)):
|
|||||||
"MODEL": app.state.config.TTS_MODEL,
|
"MODEL": app.state.config.TTS_MODEL,
|
||||||
"VOICE": app.state.config.TTS_VOICE,
|
"VOICE": app.state.config.TTS_VOICE,
|
||||||
"SPLIT_ON": app.state.config.TTS_SPLIT_ON,
|
"SPLIT_ON": app.state.config.TTS_SPLIT_ON,
|
||||||
|
"AZURE_SPEECH_REGION": app.state.config.TTS_AZURE_SPEECH_REGION,
|
||||||
|
"AZURE_SPEECH_OUTPUT_FORMAT": app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT,
|
||||||
},
|
},
|
||||||
"stt": {
|
"stt": {
|
||||||
"OPENAI_API_BASE_URL": app.state.config.STT_OPENAI_API_BASE_URL,
|
"OPENAI_API_BASE_URL": app.state.config.STT_OPENAI_API_BASE_URL,
|
||||||
@ -151,6 +160,10 @@ async def update_audio_config(
|
|||||||
app.state.config.TTS_MODEL = form_data.tts.MODEL
|
app.state.config.TTS_MODEL = form_data.tts.MODEL
|
||||||
app.state.config.TTS_VOICE = form_data.tts.VOICE
|
app.state.config.TTS_VOICE = form_data.tts.VOICE
|
||||||
app.state.config.TTS_SPLIT_ON = form_data.tts.SPLIT_ON
|
app.state.config.TTS_SPLIT_ON = form_data.tts.SPLIT_ON
|
||||||
|
app.state.config.TTS_AZURE_SPEECH_REGION = form_data.tts.AZURE_SPEECH_REGION
|
||||||
|
app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT = (
|
||||||
|
form_data.tts.AZURE_SPEECH_OUTPUT_FORMAT
|
||||||
|
)
|
||||||
|
|
||||||
app.state.config.STT_OPENAI_API_BASE_URL = form_data.stt.OPENAI_API_BASE_URL
|
app.state.config.STT_OPENAI_API_BASE_URL = form_data.stt.OPENAI_API_BASE_URL
|
||||||
app.state.config.STT_OPENAI_API_KEY = form_data.stt.OPENAI_API_KEY
|
app.state.config.STT_OPENAI_API_KEY = form_data.stt.OPENAI_API_KEY
|
||||||
@ -166,6 +179,8 @@ async def update_audio_config(
|
|||||||
"MODEL": app.state.config.TTS_MODEL,
|
"MODEL": app.state.config.TTS_MODEL,
|
||||||
"VOICE": app.state.config.TTS_VOICE,
|
"VOICE": app.state.config.TTS_VOICE,
|
||||||
"SPLIT_ON": app.state.config.TTS_SPLIT_ON,
|
"SPLIT_ON": app.state.config.TTS_SPLIT_ON,
|
||||||
|
"AZURE_SPEECH_REGION": app.state.config.TTS_AZURE_SPEECH_REGION,
|
||||||
|
"AZURE_SPEECH_OUTPUT_FORMAT": app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT,
|
||||||
},
|
},
|
||||||
"stt": {
|
"stt": {
|
||||||
"OPENAI_API_BASE_URL": app.state.config.STT_OPENAI_API_BASE_URL,
|
"OPENAI_API_BASE_URL": app.state.config.STT_OPENAI_API_BASE_URL,
|
||||||
@ -301,6 +316,42 @@ async def speech(request: Request, user=Depends(get_verified_user)):
|
|||||||
detail=error_detail,
|
detail=error_detail,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
elif app.state.config.TTS_ENGINE == "azure":
|
||||||
|
payload = None
|
||||||
|
try:
|
||||||
|
payload = json.loads(body.decode("utf-8"))
|
||||||
|
except Exception as e:
|
||||||
|
log.exception(e)
|
||||||
|
raise HTTPException(status_code=400, detail="Invalid JSON payload")
|
||||||
|
|
||||||
|
region = app.state.config.TTS_AZURE_SPEECH_REGION
|
||||||
|
language = app.state.config.TTS_VOICE
|
||||||
|
locale = "-".join(app.state.config.TTS_VOICE.split("-")[:1])
|
||||||
|
output_format = app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT
|
||||||
|
url = f"https://{region}.tts.speech.microsoft.com/cognitiveservices/v1"
|
||||||
|
|
||||||
|
headers = {
|
||||||
|
"Ocp-Apim-Subscription-Key": app.state.config.TTS_API_KEY,
|
||||||
|
"Content-Type": "application/ssml+xml",
|
||||||
|
"X-Microsoft-OutputFormat": output_format,
|
||||||
|
}
|
||||||
|
|
||||||
|
data = f"""<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="{locale}">
|
||||||
|
<voice name="{language}">{payload["input"]}</voice>
|
||||||
|
</speak>"""
|
||||||
|
|
||||||
|
response = requests.post(url, headers=headers, data=data)
|
||||||
|
|
||||||
|
if response.status_code == 200:
|
||||||
|
with open(file_path, "wb") as f:
|
||||||
|
f.write(response.content)
|
||||||
|
return FileResponse(file_path)
|
||||||
|
else:
|
||||||
|
log.error(f"Error synthesizing speech - {response.reason}")
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=500, detail=f"Error synthesizing speech - {response.reason}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@app.post("/transcriptions")
|
@app.post("/transcriptions")
|
||||||
def transcribe(
|
def transcribe(
|
||||||
@ -478,6 +529,21 @@ def get_available_voices() -> dict:
|
|||||||
except Exception:
|
except Exception:
|
||||||
# Avoided @lru_cache with exception
|
# Avoided @lru_cache with exception
|
||||||
pass
|
pass
|
||||||
|
elif app.state.config.TTS_ENGINE == "azure":
|
||||||
|
try:
|
||||||
|
region = app.state.config.TTS_AZURE_SPEECH_REGION
|
||||||
|
url = f"https://{region}.tts.speech.microsoft.com/cognitiveservices/voices/list"
|
||||||
|
headers = {"Ocp-Apim-Subscription-Key": app.state.config.TTS_API_KEY}
|
||||||
|
|
||||||
|
response = requests.get(url, headers=headers)
|
||||||
|
response.raise_for_status()
|
||||||
|
voices = response.json()
|
||||||
|
for voice in voices:
|
||||||
|
ret[voice["ShortName"]] = (
|
||||||
|
f"{voice['DisplayName']} ({voice['ShortName']})"
|
||||||
|
)
|
||||||
|
except requests.RequestException as e:
|
||||||
|
log.error(f"Error fetching voices: {str(e)}")
|
||||||
|
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
|
@ -1472,3 +1472,17 @@ AUDIO_TTS_SPLIT_ON = PersistentConfig(
|
|||||||
"audio.tts.split_on",
|
"audio.tts.split_on",
|
||||||
os.getenv("AUDIO_TTS_SPLIT_ON", "punctuation"),
|
os.getenv("AUDIO_TTS_SPLIT_ON", "punctuation"),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
AUDIO_TTS_AZURE_SPEECH_REGION = PersistentConfig(
|
||||||
|
"AUDIO_TTS_AZURE_SPEECH_REGION",
|
||||||
|
"audio.tts.azure.speech_region",
|
||||||
|
os.getenv("AUDIO_TTS_AZURE_SPEECH_REGION", "eastus"),
|
||||||
|
)
|
||||||
|
|
||||||
|
AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT = PersistentConfig(
|
||||||
|
"AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT",
|
||||||
|
"audio.tts.azure.speech_output_format",
|
||||||
|
os.getenv(
|
||||||
|
"AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT", "audio-24khz-160kbitrate-mono-mp3"
|
||||||
|
),
|
||||||
|
)
|
||||||
|
@ -31,6 +31,8 @@
|
|||||||
let TTS_MODEL = '';
|
let TTS_MODEL = '';
|
||||||
let TTS_VOICE = '';
|
let TTS_VOICE = '';
|
||||||
let TTS_SPLIT_ON: TTS_RESPONSE_SPLIT = TTS_RESPONSE_SPLIT.PUNCTUATION;
|
let TTS_SPLIT_ON: TTS_RESPONSE_SPLIT = TTS_RESPONSE_SPLIT.PUNCTUATION;
|
||||||
|
let TTS_AZURE_SPEECH_REGION = '';
|
||||||
|
let TTS_AZURE_SPEECH_OUTPUT_FORMAT = '';
|
||||||
|
|
||||||
let STT_OPENAI_API_BASE_URL = '';
|
let STT_OPENAI_API_BASE_URL = '';
|
||||||
let STT_OPENAI_API_KEY = '';
|
let STT_OPENAI_API_KEY = '';
|
||||||
@ -87,7 +89,9 @@
|
|||||||
ENGINE: TTS_ENGINE,
|
ENGINE: TTS_ENGINE,
|
||||||
MODEL: TTS_MODEL,
|
MODEL: TTS_MODEL,
|
||||||
VOICE: TTS_VOICE,
|
VOICE: TTS_VOICE,
|
||||||
SPLIT_ON: TTS_SPLIT_ON
|
SPLIT_ON: TTS_SPLIT_ON,
|
||||||
|
AZURE_SPEECH_REGION: TTS_AZURE_SPEECH_REGION,
|
||||||
|
AZURE_SPEECH_OUTPUT_FORMAT: TTS_AZURE_SPEECH_OUTPUT_FORMAT
|
||||||
},
|
},
|
||||||
stt: {
|
stt: {
|
||||||
OPENAI_API_BASE_URL: STT_OPENAI_API_BASE_URL,
|
OPENAI_API_BASE_URL: STT_OPENAI_API_BASE_URL,
|
||||||
@ -120,6 +124,9 @@
|
|||||||
|
|
||||||
TTS_SPLIT_ON = res.tts.SPLIT_ON || TTS_RESPONSE_SPLIT.PUNCTUATION;
|
TTS_SPLIT_ON = res.tts.SPLIT_ON || TTS_RESPONSE_SPLIT.PUNCTUATION;
|
||||||
|
|
||||||
|
TTS_AZURE_SPEECH_OUTPUT_FORMAT = res.tts.AZURE_SPEECH_OUTPUT_FORMAT;
|
||||||
|
TTS_AZURE_SPEECH_REGION = res.tts.AZURE_SPEECH_REGION;
|
||||||
|
|
||||||
STT_OPENAI_API_BASE_URL = res.stt.OPENAI_API_BASE_URL;
|
STT_OPENAI_API_BASE_URL = res.stt.OPENAI_API_BASE_URL;
|
||||||
STT_OPENAI_API_KEY = res.stt.OPENAI_API_KEY;
|
STT_OPENAI_API_KEY = res.stt.OPENAI_API_KEY;
|
||||||
|
|
||||||
@ -224,6 +231,7 @@
|
|||||||
<option value="">{$i18n.t('Web API')}</option>
|
<option value="">{$i18n.t('Web API')}</option>
|
||||||
<option value="openai">{$i18n.t('OpenAI')}</option>
|
<option value="openai">{$i18n.t('OpenAI')}</option>
|
||||||
<option value="elevenlabs">{$i18n.t('ElevenLabs')}</option>
|
<option value="elevenlabs">{$i18n.t('ElevenLabs')}</option>
|
||||||
|
<option value="azure">{$i18n.t('Azure AI Speech')}</option>
|
||||||
</select>
|
</select>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@ -252,6 +260,23 @@
|
|||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
{:else if TTS_ENGINE === 'azure'}
|
||||||
|
<div>
|
||||||
|
<div class="mt-1 flex gap-2 mb-1">
|
||||||
|
<input
|
||||||
|
class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
|
||||||
|
placeholder={$i18n.t('API Key')}
|
||||||
|
bind:value={TTS_API_KEY}
|
||||||
|
required
|
||||||
|
/>
|
||||||
|
<input
|
||||||
|
class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
|
||||||
|
placeholder={$i18n.t('Azure Region')}
|
||||||
|
bind:value={TTS_AZURE_SPEECH_REGION}
|
||||||
|
required
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
{/if}
|
{/if}
|
||||||
|
|
||||||
<hr class=" dark:border-gray-850 my-2" />
|
<hr class=" dark:border-gray-850 my-2" />
|
||||||
@ -359,6 +384,49 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
{:else if TTS_ENGINE === 'azure'}
|
||||||
|
<div class=" flex gap-2">
|
||||||
|
<div class="w-full">
|
||||||
|
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
|
||||||
|
<div class="flex w-full">
|
||||||
|
<div class="flex-1">
|
||||||
|
<input
|
||||||
|
list="voice-list"
|
||||||
|
class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
|
||||||
|
bind:value={TTS_VOICE}
|
||||||
|
placeholder="Select a voice"
|
||||||
|
/>
|
||||||
|
|
||||||
|
<datalist id="voice-list">
|
||||||
|
{#each voices as voice}
|
||||||
|
<option value={voice.id}>{voice.name}</option>
|
||||||
|
{/each}
|
||||||
|
</datalist>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="w-full">
|
||||||
|
<div class=" mb-1.5 text-sm font-medium">
|
||||||
|
{$i18n.t('Output format')}
|
||||||
|
<a
|
||||||
|
href="https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech?tabs=streaming#audio-outputs"
|
||||||
|
target="_blank"
|
||||||
|
>
|
||||||
|
<small>{$i18n.t('Available list')}</small>
|
||||||
|
</a>
|
||||||
|
</div>
|
||||||
|
<div class="flex w-full">
|
||||||
|
<div class="flex-1">
|
||||||
|
<input
|
||||||
|
list="tts-model-list"
|
||||||
|
class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
|
||||||
|
bind:value={TTS_AZURE_SPEECH_OUTPUT_FORMAT}
|
||||||
|
placeholder="Select a output format"
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
{/if}
|
{/if}
|
||||||
|
|
||||||
<hr class="dark:border-gray-850 my-2" />
|
<hr class="dark:border-gray-850 my-2" />
|
||||||
|
Loading…
Reference in New Issue
Block a user