feat: Kokoro-js TTS support

This commit is contained in:
Timothy Jaeryang Baek
2025-02-09 23:42:27 -08:00
parent a22d1d5410
commit 205ce635f6
5 changed files with 388 additions and 78 deletions

View File

@@ -4,12 +4,18 @@
import { createEventDispatcher } from 'svelte';
import { onMount, tick, getContext } from 'svelte';
import type { Writable } from 'svelte/store';
import type { i18n as i18nType } from 'i18next';
const i18n = getContext<Writable<i18nType>>('i18n');
const dispatch = createEventDispatcher();
import { config, models, settings, user } from '$lib/stores';
import { createNewFeedback, getFeedbackById, updateFeedbackById } from '$lib/apis/evaluations';
import { getChatById } from '$lib/apis/chats';
import { generateTags } from '$lib/apis';
import { config, models, settings, TTSWorker, user } from '$lib/stores';
import { synthesizeOpenAISpeech } from '$lib/apis/audio';
import { imageGenerations } from '$lib/apis/images';
import {
@@ -34,13 +40,8 @@
import Error from './Error.svelte';
import Citations from './Citations.svelte';
import CodeExecutions from './CodeExecutions.svelte';
import type { Writable } from 'svelte/store';
import type { i18n as i18nType } from 'i18next';
import ContentRenderer from './ContentRenderer.svelte';
import { createNewFeedback, getFeedbackById, updateFeedbackById } from '$lib/apis/evaluations';
import { getChatById } from '$lib/apis/chats';
import { generateTags } from '$lib/apis';
import { KokoroWorker } from '$lib/workers/KokoroWorker';
interface MessageType {
id: string;
@@ -193,62 +194,7 @@
speaking = true;
if ($config.audio.tts.engine !== '') {
loadingSpeech = true;
const messageContentParts: string[] = getMessageContentParts(
message.content,
$config?.audio?.tts?.split_on ?? 'punctuation'
);
if (!messageContentParts.length) {
console.log('No content to speak');
toast.info($i18n.t('No content to speak'));
speaking = false;
loadingSpeech = false;
return;
}
console.debug('Prepared message content for TTS', messageContentParts);
audioParts = messageContentParts.reduce(
(acc, _sentence, idx) => {
acc[idx] = null;
return acc;
},
{} as typeof audioParts
);
let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately
for (const [idx, sentence] of messageContentParts.entries()) {
const res = await synthesizeOpenAISpeech(
localStorage.token,
$settings?.audio?.tts?.defaultVoice === $config.audio.tts.voice
? ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
: $config?.audio?.tts?.voice,
sentence
).catch((error) => {
console.error(error);
toast.error(`${error}`);
speaking = false;
loadingSpeech = false;
});
if (res) {
const blob = await res.blob();
const blobUrl = URL.createObjectURL(blob);
const audio = new Audio(blobUrl);
audio.playbackRate = $settings.audio?.tts?.playbackRate ?? 1;
audioParts[idx] = audio;
loadingSpeech = false;
lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx));
}
}
} else {
if ($config.audio.tts.engine === '') {
let voices = [];
const getVoicesLoop = setInterval(() => {
voices = speechSynthesis.getVoices();
@@ -283,6 +229,99 @@
speechSynthesis.speak(speak);
}
}, 100);
} else {
loadingSpeech = true;
const messageContentParts: string[] = getMessageContentParts(
message.content,
$config?.audio?.tts?.split_on ?? 'punctuation'
);
if (!messageContentParts.length) {
console.log('No content to speak');
toast.info($i18n.t('No content to speak'));
speaking = false;
loadingSpeech = false;
return;
}
console.debug('Prepared message content for TTS', messageContentParts);
audioParts = messageContentParts.reduce(
(acc, _sentence, idx) => {
acc[idx] = null;
return acc;
},
{} as typeof audioParts
);
let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately
if ($settings.audio?.tts?.engine === 'browser-kokoro') {
if (!$TTSWorker) {
await TTSWorker.set(
new KokoroWorker({
dtype: $settings.audio?.tts?.engineConfig?.dtype ?? 'fp32'
})
);
await $TTSWorker.init();
}
console.log($TTSWorker);
for (const [idx, sentence] of messageContentParts.entries()) {
const blob = await $TTSWorker
.generate({
text: sentence,
voice: $settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice
})
.catch((error) => {
console.error(error);
toast.error(`${error}`);
speaking = false;
loadingSpeech = false;
});
if (blob) {
const audio = new Audio(blob);
audio.playbackRate = $settings.audio?.tts?.playbackRate ?? 1;
audioParts[idx] = audio;
loadingSpeech = false;
lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx));
}
}
} else {
for (const [idx, sentence] of messageContentParts.entries()) {
const res = await synthesizeOpenAISpeech(
localStorage.token,
$settings?.audio?.tts?.defaultVoice === $config.audio.tts.voice
? ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
: $config?.audio?.tts?.voice,
sentence
).catch((error) => {
console.error(error);
toast.error(`${error}`);
speaking = false;
loadingSpeech = false;
});
if (res) {
const blob = await res.blob();
const blobUrl = URL.createObjectURL(blob);
const audio = new Audio(blobUrl);
audio.playbackRate = $settings.audio?.tts?.playbackRate ?? 1;
audioParts[idx] = audio;
loadingSpeech = false;
lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx));
}
}
}
}
};

View File

@@ -1,11 +1,14 @@
<script lang="ts">
import { toast } from 'svelte-sonner';
import { createEventDispatcher, onMount, getContext } from 'svelte';
import { KokoroTTS } from 'kokoro-js';
import { user, settings, config } from '$lib/stores';
import { getVoices as _getVoices } from '$lib/apis/audio';
import Switch from '$lib/components/common/Switch.svelte';
import { round } from '@huggingface/transformers';
import Spinner from '$lib/components/common/Spinner.svelte';
const dispatch = createEventDispatcher();
const i18n = getContext('i18n');
@@ -20,6 +23,13 @@
let STTEngine = '';
let TTSEngine = '';
let TTSEngineConfig = {};
let TTSModel = null;
let TTSModelProgress = null;
let TTSModelLoading = false;
let voices = [];
let voice = '';
@@ -28,23 +38,37 @@
const speedOptions = [2, 1.75, 1.5, 1.25, 1, 0.75, 0.5];
/**
 * Refreshes the `voices` list for the TTS engine that is currently selected.
 *
 * The visible lines cover three sources:
 *  - `TTSEngine === 'browser-kokoro'`: calls `loadKokoro()` when `TTSModel` is
 *    not set, then maps every entry of `TTSModel.voices` to
 *    `{ id, name, localService: false }`.
 *  - `$config.audio.tts.engine === ''`: polls `speechSynthesis.getVoices()`
 *    every 100 ms until it returns a non-empty list.
 *  - otherwise: requests the list through `_getVoices(localStorage.token)`;
 *    a failed request is shown as an error toast and `voices` is left as is.
 *
 * NOTE(review): this span looks like a rendered diff in which removed and
 * added lines are interleaved (several statements appear twice and the braces
 * do not balance as written) - confirm against the real file before editing.
 * NOTE(review): `TTSModel.voices` is read without a null check; `loadKokoro`
 * appears to leave `TTSModel` unset when no dtype has been chosen - confirm.
 */
const getVoices = async () => {
if ($config.audio.tts.engine === '') {
const getVoicesLoop = setInterval(async () => {
voices = await speechSynthesis.getVoices();
if (TTSEngine === 'browser-kokoro') {
if (!TTSModel) {
await loadKokoro();
}
// stop polling once a non-empty voice list has been returned
if (voices.length > 0) {
clearInterval(getVoicesLoop);
}
}, 100);
} else {
const res = await _getVoices(localStorage.token).catch((e) => {
toast.error(`${e}`);
voices = Object.entries(TTSModel.voices).map(([key, value]) => {
return {
id: key,
name: value.name,
localService: false
};
});
} else {
if ($config.audio.tts.engine === '') {
const getVoicesLoop = setInterval(async () => {
voices = await speechSynthesis.getVoices();
if (res) {
console.log(res);
voices = res.voices;
// stop polling once a non-empty voice list has been returned
if (voices.length > 0) {
clearInterval(getVoicesLoop);
}
}, 100);
} else {
const res = await _getVoices(localStorage.token).catch((e) => {
toast.error(`${e}`);
});
if (res) {
console.log(res);
voices = res.voices;
}
}
}
};
@@ -67,6 +91,9 @@
STTEngine = $settings?.audio?.stt?.engine ?? '';
TTSEngine = $settings?.audio?.tts?.engine ?? '';
TTSEngineConfig = $settings?.audio?.tts?.engineConfig ?? {};
if ($settings?.audio?.tts?.defaultVoice === $config.audio.tts.voice) {
voice = $settings?.audio?.tts?.voice ?? $config.audio.tts.voice ?? '';
} else {
@@ -77,6 +104,51 @@
await getVoices();
});
// Svelte reactive statement: runs again whenever `TTSEngine` or
// `TTSEngineConfig` is reassigned, and calls the handler while both are truthy.
// The promise returned by `onTTSEngineChange()` is not awaited here, so a
// rejection from it is not handled at this call site.
$: if (TTSEngine && TTSEngineConfig) {
onTTSEngineChange();
}
/**
 * Reacts to a change of the selected TTS engine or its configuration.
 * Only the in-browser Kokoro engine needs work here: its model is (re)loaded.
 */
const onTTSEngineChange = async () => {
	if (TTSEngine !== 'browser-kokoro') {
		return;
	}

	await loadKokoro();
};
/**
 * Loads the in-browser Kokoro.js model for the dtype selected in
 * `TTSEngineConfig` and then refreshes the voice list.
 *
 * Does nothing unless `TTSEngine` is 'browser-kokoro'. The voice list is
 * cleared up front; if no dtype has been chosen yet the function stops there
 * and `TTSModel` is left untouched.
 *
 * A failed load is reported with an error toast and then rethrown, so callers
 * still see the rejection. `TTSModelLoading` is reset once the attempt has
 * finished, whether it succeeded or not.
 *
 * Calls are not de-duplicated: overlapping calls each start their own load and
 * whichever resolves last assigns `TTSModel`.
 */
const loadKokoro = async () => {
	if (TTSEngine !== 'browser-kokoro') {
		return;
	}

	voices = [];

	const dtype = TTSEngineConfig?.dtype;
	if (!dtype) {
		return;
	}

	// Drop the previous model so the template falls back to its loading state.
	TTSModel = null;
	TTSModelProgress = null;
	TTSModelLoading = true;

	try {
		TTSModel = await KokoroTTS.from_pretrained('onnx-community/Kokoro-82M-v1.0-ONNX', {
			dtype, // Options: "fp32", "fp16", "q8", "q4", "q4f16"
			device: navigator?.gpu ? 'webgpu' : 'wasm', // Prefer WebGPU when the browser exposes it
			progress_callback: (e) => {
				TTSModelProgress = e;
				console.log(e);
			}
		});

		await getVoices();
	} catch (error) {
		// Surface the failure to the user, then keep the original rejection.
		console.error(error);
		toast.error(`${error}`);
		throw error;
	} finally {
		// Previously the flag stayed `true` forever, even after a successful load.
		TTSModelLoading = false;
	}
};
</script>
<form
@@ -88,6 +160,8 @@
engine: STTEngine !== '' ? STTEngine : undefined
},
tts: {
engine: TTSEngine !== '' ? TTSEngine : undefined,
engineConfig: TTSEngineConfig,
playbackRate: playbackRate,
voice: voice !== '' ? voice : undefined,
defaultVoice: $config?.audio?.tts?.voice ?? '',
@@ -142,6 +216,39 @@
<div>
<div class=" mb-1 text-sm font-medium">{$i18n.t('TTS Settings')}</div>
<div class=" py-0.5 flex w-full justify-between">
<div class=" self-center text-xs font-medium">{$i18n.t('Text-to-Speech Engine')}</div>
<div class="flex items-center relative">
<select
class="dark:bg-gray-900 w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
bind:value={TTSEngine}
placeholder="Select an engine"
>
<option value="">{$i18n.t('Default')}</option>
<option value="browser-kokoro">{$i18n.t('Kokoro.js (Browser)')}</option>
</select>
</div>
</div>
{#if TTSEngine === 'browser-kokoro'}
<div class=" py-0.5 flex w-full justify-between">
<div class=" self-center text-xs font-medium">{$i18n.t('Kokoro.js Dtype')}</div>
<div class="flex items-center relative">
<select
class="dark:bg-gray-900 w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
bind:value={TTSEngineConfig.dtype}
placeholder="Select dtype"
>
<option value="" disabled selected>Select dtype</option>
<option value="fp32">fp32</option>
<option value="fp16">fp16</option>
<option value="q8">q8</option>
<option value="q4">q4</option>
</select>
</div>
</div>
{/if}
<div class=" py-0.5 flex w-full justify-between">
<div class=" self-center text-xs font-medium">{$i18n.t('Auto-playback response')}</div>
@@ -178,7 +285,46 @@
<hr class=" dark:border-gray-850" />
{#if $config.audio.tts.engine === ''}
{#if TTSEngine === 'browser-kokoro'}
{#if TTSModel}
<div>
<div class=" mb-2.5 text-sm font-medium">{$i18n.t('Set Voice')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
list="voice-list"
class="w-full rounded-lg py-2 px-4 text-sm dark:text-gray-300 dark:bg-gray-850 outline-none"
bind:value={voice}
placeholder="Select a voice"
/>
<datalist id="voice-list">
{#each voices as voice}
<option value={voice.id}>{voice.name}</option>
{/each}
</datalist>
</div>
</div>
</div>
{:else}
<div>
<div class=" mb-2.5 text-sm font-medium flex gap-2 items-center">
<Spinner className="size-4" />
<div class=" text-sm font-medium shimmer">
{$i18n.t('Loading Kokoro.js...')}
{TTSModelProgress && TTSModelProgress.status === 'progress'
? `(${Math.round(TTSModelProgress.progress * 10) / 10}%)`
: ''}
</div>
</div>
<div class="text-xs text-gray-500">
{$i18n.t('Please do not close the settings page while loading the model.')}
</div>
</div>
{/if}
{:else if $config.audio.tts.engine === ''}
<div>
<div class=" mb-2.5 text-sm font-medium">{$i18n.t('Set Voice')}</div>
<div class="flex w-full">