feat: add FishSpeech TTS engine support

2025-04-02 12:09:06 +00:00 · 2025-03-05 23:20:08 +08:00 · 2025-03-05 23:20:08 +08:00 · 7e94acd36c
commit 7e94acd36c
parent 46a5fa1514
3 changed files with 122 additions and 2 deletions
--- a/backend/open_webui/routers/audio.py
+++ b/backend/open_webui/routers/audio.py
@ -464,7 +464,72 @@ async def speech(request: Request, user=Depends(get_verified_user)):
            await f.write(json.dumps(payload))

        return FileResponse(file_path)
+    
+    elif request.app.state.config.TTS_ENGINE == "fishspeech":
+        try:
+            timeout = aiohttp.ClientTimeout(total=AIOHTTP_CLIENT_TIMEOUT)
+            async with aiohttp.ClientSession(
+                timeout=timeout, trust_env=True
+            ) as session:
+                async with session.post(
+                    url=f"{request.app.state.config.TTS_OPENAI_API_BASE_URL}/tts",
+                    json={
+                        "text": payload["input"],
+                        "chunk_length": 200,
+                        "format": "mp3",
+                        "references": [],
+                        "reference_id": request.app.state.config.TTS_VOICE,
+                        "seed": None,
+                        "use_memory_cache": "on",
+                        "normalize": True,
+                        "streaming": False,
+                        "max_new_tokens": 1024,
+                        "top_p": 0.7,
+                        "repetition_penalty": 1.2,
+                        "temperature": 0.7
+                    },
+                    headers={
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {request.app.state.config.TTS_API_KEY}",
+                        **(
+                            {
+                                "X-OpenWebUI-User-Name": user.name,
+                                "X-OpenWebUI-User-Id": user.id,
+                                "X-OpenWebUI-User-Email": user.email,
+                                "X-OpenWebUI-User-Role": user.role,
+                            }
+                            if ENABLE_FORWARD_USER_INFO_HEADERS
+                            else {}
+                        ),
+                    },
+                ) as r:
+                    r.raise_for_status()

+                    async with aiofiles.open(file_path, "wb") as f:
+                        await f.write(await r.read())
+
+                    async with aiofiles.open(file_body_path, "w") as f:
+                        await f.write(json.dumps(payload))
+            
+            return FileResponse(file_path)
+        
+        except Exception as e:
+            log.exception(e)
+            detail = None
+
+            try:
+                if r.status != 200:
+                    res = await r.json()
+
+                    if "error" in res:
+                        detail = f"External: {res['error'].get('message', '')}"
+            except Exception:
+                detail = f"External: {e}"
+
+            raise HTTPException(
+                status_code=getattr(r, "status", 500),
+                detail=detail if detail else "Open WebUI: Server Connection Error",
+            )

 def transcribe(request: Request, file_path):
    log.info(f"transcribe: {file_path}")
--- a/src/lib/components/admin/Settings/Audio.svelte
+++ b/src/lib/components/admin/Settings/Audio.svelte
@ -364,6 +364,7 @@
 							<option value="openai">{$i18n.t('OpenAI')}</option>
 							<option value="elevenlabs">{$i18n.t('ElevenLabs')}</option>
 							<option value="azure">{$i18n.t('Azure AI Speech')}</option>
+							<option value="fishspeech">{$i18n.t('FishSpeech')}</option>
 						</select>
 					</div>
 				</div>
@ -409,6 +410,20 @@
 							/>
 						</div>
 					</div>
+					{:else if TTS_ENGINE === 'fishspeech'}
+					<div>
+						<div class="mt-1 flex gap-2 mb-1">
+							<input
+								class="flex-1 w-full bg-transparent outline-hidden"
+								placeholder={$i18n.t('API Base URL')}
+								bind:value={TTS_OPENAI_API_BASE_URL}
+								required
+								title="fishspeech api base url like https://api.fish.audio/v1"
+							/>
+
+							<SensitiveInput placeholder={$i18n.t('API Key')} bind:value={TTS_API_KEY} />
+						</div>
+					</div>
 				{/if}

 				<hr class="border-gray-100 dark:border-gray-850 my-2" />
@ -601,6 +616,45 @@
 							</div>
 						</div>
 					</div>
+					{:else if TTS_ENGINE === 'fishspeech'}
+					<div class=" flex gap-2">
+						<div class="w-full">
+							<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
+							<div class="flex w-full">
+								<div class="flex-1">
+									<input
+										list="voice-list"
+										class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
+										bind:value={TTS_VOICE}
+										placeholder="Input fishspeech reference voice id"
+									/>
+								</div>
+							</div>
+							<div class="mt-2 mb-1 text-xs text-gray-400 dark:text-gray-500">
+								The usage of reference voice id can be found in the fishspeech documentation,
+								<a
+									class=" hover:underline dark:text-gray-200 text-gray-800"
+									href="https://docs.fish.audio/api-reference/endpoint/openapi-v1/text-to-speech#body-reference-id"
+									target="_blank"
+								>
+									{$i18n.t(`click here`)}.
+								</a>
+
+								To learn more about fishspeech,
+	
+								<a
+									class=" hover:underline dark:text-gray-200 text-gray-800"
+									href="https://fish.audio/"
+									target="_blank"
+								>
+									{$i18n.t(`click here`, {
+										name: 'fishspeech'
+									})}.
+								</a>
+
+							</div>
+						</div>
+					</div>
 				{/if}

 				<hr class="border-gray-100 dark:border-gray-850 my-2" />
--- a/src/lib/utils/index.ts
+++ b/src/lib/utils/index.ts
@ -738,11 +738,12 @@ export const extractSentencesForAudio = (text: string) => {
 		if (lastIndex >= 0) {
 			const previousText = mergedTexts[lastIndex];
 			let wordCount = previousText.split(/\s+/).length;
-			const charCount = previousText.length;
+			let charCount = previousText.length;

 			const isCJK = /[\u4e00-\u9fa5\u3040-\u30ff\u31f0-\u31ff\u3400-\u4dbf\u4e00-\u9fff\uF900-\uFAFF]/.test(previousText);
 			if (isCJK) {
-				wordCount = charCount * 3;
+				wordCount = charCount;
+				charCount = charCount * 10;
 			}
 			if (wordCount < 4 || charCount < 50) {
 				mergedTexts[lastIndex] = previousText + ' ' + currentText;