mirror of
				https://github.com/open-webui/open-webui
				synced 2025-06-26 18:26:48 +00:00 
			
		
		
		
	Merge pull request #13540 from NoMoreFood/dev
feat: Azure TTS Allow Base URL
This commit is contained in:
		
						commit
						2a4dfc02a2
					
				@ -2689,7 +2689,7 @@ AUDIO_STT_AZURE_BASE_URL = PersistentConfig(
 | 
			
		||||
AUDIO_STT_AZURE_MAX_SPEAKERS = PersistentConfig(
 | 
			
		||||
    "AUDIO_STT_AZURE_MAX_SPEAKERS",
 | 
			
		||||
    "audio.stt.azure.max_speakers",
 | 
			
		||||
    os.getenv("AUDIO_STT_AZURE_MAX_SPEAKERS", "3"),
 | 
			
		||||
    os.getenv("AUDIO_STT_AZURE_MAX_SPEAKERS", ""),
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
AUDIO_TTS_OPENAI_API_BASE_URL = PersistentConfig(
 | 
			
		||||
@ -2737,7 +2737,13 @@ AUDIO_TTS_SPLIT_ON = PersistentConfig(
 | 
			
		||||
AUDIO_TTS_AZURE_SPEECH_REGION = PersistentConfig(
 | 
			
		||||
    "AUDIO_TTS_AZURE_SPEECH_REGION",
 | 
			
		||||
    "audio.tts.azure.speech_region",
 | 
			
		||||
    os.getenv("AUDIO_TTS_AZURE_SPEECH_REGION", "eastus"),
 | 
			
		||||
    os.getenv("AUDIO_TTS_AZURE_SPEECH_REGION", ""),
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
AUDIO_TTS_AZURE_SPEECH_BASE_URL = PersistentConfig(
 | 
			
		||||
    "AUDIO_TTS_AZURE_SPEECH_BASE_URL",
 | 
			
		||||
    "audio.tts.azure.speech_base_url",
 | 
			
		||||
    os.getenv("AUDIO_TTS_AZURE_SPEECH_BASE_URL", ""),
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT = PersistentConfig(
 | 
			
		||||
 | 
			
		||||
@ -166,6 +166,7 @@ from open_webui.config import (
 | 
			
		||||
    AUDIO_TTS_SPLIT_ON,
 | 
			
		||||
    AUDIO_TTS_VOICE,
 | 
			
		||||
    AUDIO_TTS_AZURE_SPEECH_REGION,
 | 
			
		||||
    AUDIO_TTS_AZURE_SPEECH_BASE_URL,
 | 
			
		||||
    AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT,
 | 
			
		||||
    PLAYWRIGHT_WS_URL,
 | 
			
		||||
    PLAYWRIGHT_TIMEOUT,
 | 
			
		||||
@ -852,6 +853,7 @@ app.state.config.TTS_SPLIT_ON = AUDIO_TTS_SPLIT_ON
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
app.state.config.TTS_AZURE_SPEECH_REGION = AUDIO_TTS_AZURE_SPEECH_REGION
 | 
			
		||||
app.state.config.TTS_AZURE_SPEECH_BASE_URL = AUDIO_TTS_AZURE_SPEECH_BASE_URL
 | 
			
		||||
app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT = AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -138,6 +138,7 @@ class TTSConfigForm(BaseModel):
 | 
			
		||||
    VOICE: str
 | 
			
		||||
    SPLIT_ON: str
 | 
			
		||||
    AZURE_SPEECH_REGION: str
 | 
			
		||||
    AZURE_SPEECH_BASE_URL: str
 | 
			
		||||
    AZURE_SPEECH_OUTPUT_FORMAT: str
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -172,6 +173,7 @@ async def get_audio_config(request: Request, user=Depends(get_admin_user)):
 | 
			
		||||
            "VOICE": request.app.state.config.TTS_VOICE,
 | 
			
		||||
            "SPLIT_ON": request.app.state.config.TTS_SPLIT_ON,
 | 
			
		||||
            "AZURE_SPEECH_REGION": request.app.state.config.TTS_AZURE_SPEECH_REGION,
 | 
			
		||||
            "AZURE_SPEECH_BASE_URL": request.app.state.config.TTS_AZURE_SPEECH_BASE_URL,
 | 
			
		||||
            "AZURE_SPEECH_OUTPUT_FORMAT": request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT,
 | 
			
		||||
        },
 | 
			
		||||
        "stt": {
 | 
			
		||||
@ -202,6 +204,9 @@ async def update_audio_config(
 | 
			
		||||
    request.app.state.config.TTS_VOICE = form_data.tts.VOICE
 | 
			
		||||
    request.app.state.config.TTS_SPLIT_ON = form_data.tts.SPLIT_ON
 | 
			
		||||
    request.app.state.config.TTS_AZURE_SPEECH_REGION = form_data.tts.AZURE_SPEECH_REGION
 | 
			
		||||
    request.app.state.config.TTS_AZURE_SPEECH_BASE_URL = (
 | 
			
		||||
        form_data.tts.AZURE_SPEECH_BASE_URL
 | 
			
		||||
    )
 | 
			
		||||
    request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT = (
 | 
			
		||||
        form_data.tts.AZURE_SPEECH_OUTPUT_FORMAT
 | 
			
		||||
    )
 | 
			
		||||
@ -235,6 +240,7 @@ async def update_audio_config(
 | 
			
		||||
            "VOICE": request.app.state.config.TTS_VOICE,
 | 
			
		||||
            "SPLIT_ON": request.app.state.config.TTS_SPLIT_ON,
 | 
			
		||||
            "AZURE_SPEECH_REGION": request.app.state.config.TTS_AZURE_SPEECH_REGION,
 | 
			
		||||
            "AZURE_SPEECH_BASE_URL": request.app.state.config.TTS_AZURE_SPEECH_BASE_URL,
 | 
			
		||||
            "AZURE_SPEECH_OUTPUT_FORMAT": request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT,
 | 
			
		||||
        },
 | 
			
		||||
        "stt": {
 | 
			
		||||
@ -406,7 +412,8 @@ async def speech(request: Request, user=Depends(get_verified_user)):
 | 
			
		||||
            log.exception(e)
 | 
			
		||||
            raise HTTPException(status_code=400, detail="Invalid JSON payload")
 | 
			
		||||
 | 
			
		||||
        region = request.app.state.config.TTS_AZURE_SPEECH_REGION
 | 
			
		||||
        region = request.app.state.config.TTS_AZURE_SPEECH_REGION or "eastus"
 | 
			
		||||
        base_url = request.app.state.config.TTS_AZURE_SPEECH_BASE_URL
 | 
			
		||||
        language = request.app.state.config.TTS_VOICE
 | 
			
		||||
        locale = "-".join(request.app.state.config.TTS_VOICE.split("-")[:1])
 | 
			
		||||
        output_format = request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT
 | 
			
		||||
@ -420,7 +427,8 @@ async def speech(request: Request, user=Depends(get_verified_user)):
 | 
			
		||||
                timeout=timeout, trust_env=True
 | 
			
		||||
            ) as session:
 | 
			
		||||
                async with session.post(
 | 
			
		||||
                    f"https://{region}.tts.speech.microsoft.com/cognitiveservices/v1",
 | 
			
		||||
                    (base_url or f"https://{region}.tts.speech.microsoft.com")
 | 
			
		||||
                    + "/cognitiveservices/v1",
 | 
			
		||||
                    headers={
 | 
			
		||||
                        "Ocp-Apim-Subscription-Key": request.app.state.config.TTS_API_KEY,
 | 
			
		||||
                        "Content-Type": "application/ssml+xml",
 | 
			
		||||
@ -651,10 +659,10 @@ def transcribe(request: Request, file_path):
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
        api_key = request.app.state.config.AUDIO_STT_AZURE_API_KEY
 | 
			
		||||
        region = request.app.state.config.AUDIO_STT_AZURE_REGION
 | 
			
		||||
        region = request.app.state.config.AUDIO_STT_AZURE_REGION or "eastus"
 | 
			
		||||
        locales = request.app.state.config.AUDIO_STT_AZURE_LOCALES
 | 
			
		||||
        base_url = request.app.state.config.AUDIO_STT_AZURE_BASE_URL
 | 
			
		||||
        max_speakers = request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS
 | 
			
		||||
        max_speakers = request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS or 3
 | 
			
		||||
 | 
			
		||||
        # IF NO LOCALES, USE DEFAULTS
 | 
			
		||||
        if len(locales) < 2:
 | 
			
		||||
@ -681,12 +689,6 @@ def transcribe(request: Request, file_path):
 | 
			
		||||
                detail="Azure API key is required for Azure STT",
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
        if not base_url and not region:
 | 
			
		||||
            raise HTTPException(
 | 
			
		||||
                status_code=400,
 | 
			
		||||
                detail="Azure region or base url is required for Azure STT",
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
        r = None
 | 
			
		||||
        try:
 | 
			
		||||
            # Prepare the request
 | 
			
		||||
@ -702,9 +704,8 @@ def transcribe(request: Request, file_path):
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            url = (
 | 
			
		||||
                base_url
 | 
			
		||||
                or f"https://{region}.api.cognitive.microsoft.com/speechtotext/transcriptions:transcribe?api-version=2024-11-15"
 | 
			
		||||
            )
 | 
			
		||||
                base_url or f"https://{region}.api.cognitive.microsoft.com"
 | 
			
		||||
            ) + "/speechtotext/transcriptions:transcribe?api-version=2024-11-15"
 | 
			
		||||
 | 
			
		||||
            # Use context manager to ensure file is properly closed
 | 
			
		||||
            with open(file_path, "rb") as audio_file:
 | 
			
		||||
@ -939,7 +940,10 @@ def get_available_voices(request) -> dict:
 | 
			
		||||
    elif request.app.state.config.TTS_ENGINE == "azure":
 | 
			
		||||
        try:
 | 
			
		||||
            region = request.app.state.config.TTS_AZURE_SPEECH_REGION
 | 
			
		||||
            url = f"https://{region}.tts.speech.microsoft.com/cognitiveservices/voices/list"
 | 
			
		||||
            base_url = request.app.state.config.TTS_AZURE_SPEECH_BASE_URL
 | 
			
		||||
            url = (
 | 
			
		||||
                base_url or f"https://{region}.tts.speech.microsoft.com"
 | 
			
		||||
            ) + "/cognitiveservices/voices/list"
 | 
			
		||||
            headers = {
 | 
			
		||||
                "Ocp-Apim-Subscription-Key": request.app.state.config.TTS_API_KEY
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
@ -32,6 +32,7 @@
 | 
			
		||||
	let TTS_VOICE = '';
 | 
			
		||||
	let TTS_SPLIT_ON: TTS_RESPONSE_SPLIT = TTS_RESPONSE_SPLIT.PUNCTUATION;
 | 
			
		||||
	let TTS_AZURE_SPEECH_REGION = '';
 | 
			
		||||
	let TTS_AZURE_SPEECH_BASE_URL = '';
 | 
			
		||||
	let TTS_AZURE_SPEECH_OUTPUT_FORMAT = '';
 | 
			
		||||
 | 
			
		||||
	let STT_OPENAI_API_BASE_URL = '';
 | 
			
		||||
@ -105,6 +106,7 @@
 | 
			
		||||
				VOICE: TTS_VOICE,
 | 
			
		||||
				SPLIT_ON: TTS_SPLIT_ON,
 | 
			
		||||
				AZURE_SPEECH_REGION: TTS_AZURE_SPEECH_REGION,
 | 
			
		||||
				AZURE_SPEECH_BASE_URL: TTS_AZURE_SPEECH_BASE_URL,
 | 
			
		||||
				AZURE_SPEECH_OUTPUT_FORMAT: TTS_AZURE_SPEECH_OUTPUT_FORMAT
 | 
			
		||||
			},
 | 
			
		||||
			stt: {
 | 
			
		||||
@ -149,8 +151,9 @@
 | 
			
		||||
 | 
			
		||||
			TTS_SPLIT_ON = res.tts.SPLIT_ON || TTS_RESPONSE_SPLIT.PUNCTUATION;
 | 
			
		||||
 | 
			
		||||
			TTS_AZURE_SPEECH_OUTPUT_FORMAT = res.tts.AZURE_SPEECH_OUTPUT_FORMAT;
 | 
			
		||||
			TTS_AZURE_SPEECH_REGION = res.tts.AZURE_SPEECH_REGION;
 | 
			
		||||
			TTS_AZURE_SPEECH_BASE_URL = res.tts.AZURE_SPEECH_BASE_URL;
 | 
			
		||||
			TTS_AZURE_SPEECH_OUTPUT_FORMAT = res.tts.AZURE_SPEECH_OUTPUT_FORMAT;
 | 
			
		||||
 | 
			
		||||
			STT_OPENAI_API_BASE_URL = res.stt.OPENAI_API_BASE_URL;
 | 
			
		||||
			STT_OPENAI_API_KEY = res.stt.OPENAI_API_KEY;
 | 
			
		||||
@ -272,16 +275,23 @@
 | 
			
		||||
								bind:value={STT_AZURE_API_KEY}
 | 
			
		||||
								required
 | 
			
		||||
							/>
 | 
			
		||||
							<input
 | 
			
		||||
								class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
 | 
			
		||||
								placeholder={$i18n.t('Azure Region')}
 | 
			
		||||
								bind:value={STT_AZURE_REGION}
 | 
			
		||||
								required
 | 
			
		||||
							/>
 | 
			
		||||
						</div>
 | 
			
		||||
 | 
			
		||||
						<hr class="border-gray-100 dark:border-gray-850 my-2" />
 | 
			
		||||
 | 
			
		||||
						<div>
 | 
			
		||||
							<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Azure Region')}</div>
 | 
			
		||||
							<div class="flex w-full">
 | 
			
		||||
								<div class="flex-1">
 | 
			
		||||
									<input
 | 
			
		||||
										class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
 | 
			
		||||
										bind:value={STT_AZURE_REGION}
 | 
			
		||||
										placeholder={$i18n.t('e.g., westus (leave blank for eastus)')}
 | 
			
		||||
									/>
 | 
			
		||||
								</div>
 | 
			
		||||
							</div>
 | 
			
		||||
						</div>
 | 
			
		||||
 | 
			
		||||
						<div>
 | 
			
		||||
							<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Language Locales')}</div>
 | 
			
		||||
							<div class="flex w-full">
 | 
			
		||||
@ -296,13 +306,13 @@
 | 
			
		||||
						</div>
 | 
			
		||||
 | 
			
		||||
						<div>
 | 
			
		||||
							<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Base URL')}</div>
 | 
			
		||||
							<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Endpoint URL')}</div>
 | 
			
		||||
							<div class="flex w-full">
 | 
			
		||||
								<div class="flex-1">
 | 
			
		||||
									<input
 | 
			
		||||
										class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
 | 
			
		||||
										bind:value={STT_AZURE_BASE_URL}
 | 
			
		||||
										placeholder={$i18n.t('(leave blank for Azure Commercial URL auto-generation)')}
 | 
			
		||||
										placeholder={$i18n.t('(leave blank for to use commercial endpoint)')}
 | 
			
		||||
									/>
 | 
			
		||||
								</div>
 | 
			
		||||
							</div>
 | 
			
		||||
@ -468,18 +478,35 @@
 | 
			
		||||
				{:else if TTS_ENGINE === 'azure'}
 | 
			
		||||
					<div>
 | 
			
		||||
						<div class="mt-1 flex gap-2 mb-1">
 | 
			
		||||
							<input
 | 
			
		||||
								class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
 | 
			
		||||
								placeholder={$i18n.t('API Key')}
 | 
			
		||||
								bind:value={TTS_API_KEY}
 | 
			
		||||
								required
 | 
			
		||||
							/>
 | 
			
		||||
							<input
 | 
			
		||||
								class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
 | 
			
		||||
								placeholder={$i18n.t('Azure Region')}
 | 
			
		||||
								bind:value={TTS_AZURE_SPEECH_REGION}
 | 
			
		||||
								required
 | 
			
		||||
							/>
 | 
			
		||||
							<SensitiveInput placeholder={$i18n.t('API Key')} bind:value={TTS_API_KEY} required />
 | 
			
		||||
						</div>
 | 
			
		||||
 | 
			
		||||
						<hr class="border-gray-100 dark:border-gray-850 my-2" />
 | 
			
		||||
 | 
			
		||||
						<div>
 | 
			
		||||
							<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Azure Region')}</div>
 | 
			
		||||
							<div class="flex w-full">
 | 
			
		||||
								<div class="flex-1">
 | 
			
		||||
									<input
 | 
			
		||||
										class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
 | 
			
		||||
										bind:value={TTS_AZURE_SPEECH_REGION}
 | 
			
		||||
										placeholder={$i18n.t('e.g., westus (leave blank for eastus)')}
 | 
			
		||||
									/>
 | 
			
		||||
								</div>
 | 
			
		||||
							</div>
 | 
			
		||||
						</div>
 | 
			
		||||
 | 
			
		||||
						<div>
 | 
			
		||||
							<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Endpoint URL')}</div>
 | 
			
		||||
							<div class="flex w-full">
 | 
			
		||||
								<div class="flex-1">
 | 
			
		||||
									<input
 | 
			
		||||
										class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
 | 
			
		||||
										bind:value={TTS_AZURE_SPEECH_BASE_URL}
 | 
			
		||||
										placeholder={$i18n.t('(leave blank for to use commercial endpoint)')}
 | 
			
		||||
									/>
 | 
			
		||||
								</div>
 | 
			
		||||
							</div>
 | 
			
		||||
						</div>
 | 
			
		||||
					</div>
 | 
			
		||||
				{/if}
 | 
			
		||||
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user