feat: add FishSpeech TTS engine support

This commit is contained in:
georgechen 2025-03-05 23:20:08 +08:00
parent 46a5fa1514
commit 7e94acd36c
3 changed files with 122 additions and 2 deletions

View File

@ -464,7 +464,72 @@ async def speech(request: Request, user=Depends(get_verified_user)):
await f.write(json.dumps(payload))
return FileResponse(file_path)
elif request.app.state.config.TTS_ENGINE == "fishspeech":
try:
timeout = aiohttp.ClientTimeout(total=AIOHTTP_CLIENT_TIMEOUT)
async with aiohttp.ClientSession(
timeout=timeout, trust_env=True
) as session:
async with session.post(
url=f"{request.app.state.config.TTS_OPENAI_API_BASE_URL}/tts",
json={
"text": payload["input"],
"chunk_length": 200,
"format": "mp3",
"references": [],
"reference_id": request.app.state.config.TTS_VOICE,
"seed": None,
"use_memory_cache": "on",
"normalize": True,
"streaming": False,
"max_new_tokens": 1024,
"top_p": 0.7,
"repetition_penalty": 1.2,
"temperature": 0.7
},
headers={
"Content-Type": "application/json",
"Authorization": f"Bearer {request.app.state.config.TTS_API_KEY}",
**(
{
"X-OpenWebUI-User-Name": user.name,
"X-OpenWebUI-User-Id": user.id,
"X-OpenWebUI-User-Email": user.email,
"X-OpenWebUI-User-Role": user.role,
}
if ENABLE_FORWARD_USER_INFO_HEADERS
else {}
),
},
) as r:
r.raise_for_status()
async with aiofiles.open(file_path, "wb") as f:
await f.write(await r.read())
async with aiofiles.open(file_body_path, "w") as f:
await f.write(json.dumps(payload))
return FileResponse(file_path)
except Exception as e:
log.exception(e)
detail = None
try:
if r.status != 200:
res = await r.json()
if "error" in res:
detail = f"External: {res['error'].get('message', '')}"
except Exception:
detail = f"External: {e}"
raise HTTPException(
status_code=getattr(r, "status", 500),
detail=detail if detail else "Open WebUI: Server Connection Error",
)
def transcribe(request: Request, file_path):
log.info(f"transcribe: {file_path}")

View File

@ -364,6 +364,7 @@
<option value="openai">{$i18n.t('OpenAI')}</option>
<option value="elevenlabs">{$i18n.t('ElevenLabs')}</option>
<option value="azure">{$i18n.t('Azure AI Speech')}</option>
<option value="fishspeech">{$i18n.t('FishSpeech')}</option>
</select>
</div>
</div>
@ -409,6 +410,20 @@
/>
</div>
</div>
{:else if TTS_ENGINE === 'fishspeech'}
<div>
<div class="mt-1 flex gap-2 mb-1">
<input
class="flex-1 w-full bg-transparent outline-hidden"
placeholder={$i18n.t('API Base URL')}
bind:value={TTS_OPENAI_API_BASE_URL}
required
title="fishspeech api base url like https://api.fish.audio/v1"
/>
<SensitiveInput placeholder={$i18n.t('API Key')} bind:value={TTS_API_KEY} />
</div>
</div>
{/if}
<hr class="border-gray-100 dark:border-gray-850 my-2" />
@ -601,6 +616,45 @@
</div>
</div>
</div>
{:else if TTS_ENGINE === 'fishspeech'}
<div class=" flex gap-2">
<div class="w-full">
<div class=" mb-1.5 text-sm font-medium">{$i18n.t('TTS Voice')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
list="voice-list"
class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
bind:value={TTS_VOICE}
placeholder="Input fishspeech reference voice id"
/>
</div>
</div>
<div class="mt-2 mb-1 text-xs text-gray-400 dark:text-gray-500">
The usage of reference voice id can be found in the fishspeech documentation,
<a
class=" hover:underline dark:text-gray-200 text-gray-800"
href="https://docs.fish.audio/api-reference/endpoint/openapi-v1/text-to-speech#body-reference-id"
target="_blank"
>
{$i18n.t(`click here`)}.
</a>
To learn more about fishspeech,
<a
class=" hover:underline dark:text-gray-200 text-gray-800"
href="https://fish.audio/"
target="_blank"
>
{$i18n.t(`click here`, {
name: 'fishspeech'
})}.
</a>
</div>
</div>
</div>
{/if}
<hr class="border-gray-100 dark:border-gray-850 my-2" />

View File

@ -738,11 +738,12 @@ export const extractSentencesForAudio = (text: string) => {
if (lastIndex >= 0) {
const previousText = mergedTexts[lastIndex];
let wordCount = previousText.split(/\s+/).length;
const charCount = previousText.length;
let charCount = previousText.length;
const isCJK = /[\u4e00-\u9fa5\u3040-\u30ff\u31f0-\u31ff\u3400-\u4dbf\u4e00-\u9fff\uF900-\uFAFF]/.test(previousText);
if (isCJK) {
wordCount = charCount * 3;
wordCount = charCount;
charCount = charCount * 10;
}
if (wordCount < 4 || charCount < 50) {
mergedTexts[lastIndex] = previousText + ' ' + currentText;