feat: Kokoro-js TTS support

This commit is contained in:
Timothy Jaeryang Baek
2025-02-09 23:42:27 -08:00
parent a22d1d5410
commit 205ce635f6
5 changed files with 388 additions and 78 deletions

View File

@@ -1,11 +1,14 @@
<script lang="ts">
import { toast } from 'svelte-sonner';
import { createEventDispatcher, onMount, getContext } from 'svelte';
import { KokoroTTS } from 'kokoro-js';
import { user, settings, config } from '$lib/stores';
import { getVoices as _getVoices } from '$lib/apis/audio';
import Switch from '$lib/components/common/Switch.svelte';
import { round } from '@huggingface/transformers';
import Spinner from '$lib/components/common/Spinner.svelte';
const dispatch = createEventDispatcher();
const i18n = getContext('i18n');
@@ -20,6 +23,13 @@
// Speech-to-text / text-to-speech engine ids; '' is the "Default" choice in the selects.
let STTEngine = '';
let TTSEngine = '';
// Engine-specific options; loadKokoro reads `dtype` from it.
let TTSEngineConfig = {};
// KokoroTTS instance assigned by loadKokoro; null until a model has been loaded.
let TTSModel = null;
// Most recent event passed to the model loader's progress_callback.
let TTSModelProgress = null;
// Set to true by loadKokoro when a model load starts.
let TTSModelLoading = false;
// Voice entries filled in by getVoices, and the currently chosen voice id.
let voices = [];
let voice = '';
@@ -28,23 +38,37 @@
// Speed multipliers, listed from fastest to slowest — presumably the choices offered for playbackRate; confirm against the markup.
const speedOptions = [2, 1.75, 1.5, 1.25, 1, 0.75, 0.5];
// Fills `voices` with the entries the voice picker should offer.
//
// NOTE(review): this span is rendered from a diff without +/- markers, so lines
// of the previous and of the new implementation appear interleaved and the text
// is not valid as written. Read as the new version, it appears to do this:
//   - TTSEngine === 'browser-kokoro': call loadKokoro() when no model is loaded,
//     then map TTSModel.voices to { id, name, localService: false } entries;
//   - otherwise, when $config.audio.tts.engine === '': poll
//     speechSynthesis.getVoices() every 100 ms until it returns a non-empty list;
//   - otherwise: fetch the list with _getVoices(localStorage.token), showing a
//     toast on failure.
// Confirm against the actual file before relying on this description.
const getVoices = async () => {
if ($config.audio.tts.engine === '') {
const getVoicesLoop = setInterval(async () => {
voices = await speechSynthesis.getVoices();
if (TTSEngine === 'browser-kokoro') {
if (!TTSModel) {
await loadKokoro();
}
// do your loop
if (voices.length > 0) {
clearInterval(getVoicesLoop);
}
}, 100);
} else {
const res = await _getVoices(localStorage.token).catch((e) => {
toast.error(`${e}`);
// NOTE(review): loadKokoro only assigns TTSModel when TTSEngineConfig.dtype is
// set, so TTSModel may still be null here and `.voices` would throw — confirm
// whether a guard is needed.
voices = Object.entries(TTSModel.voices).map(([key, value]) => {
return {
id: key,
name: value.name,
localService: false
};
});
} else {
if ($config.audio.tts.engine === '') {
const getVoicesLoop = setInterval(async () => {
voices = await speechSynthesis.getVoices();
if (res) {
console.log(res);
voices = res.voices;
// do your loop
if (voices.length > 0) {
clearInterval(getVoicesLoop);
}
}, 100);
} else {
const res = await _getVoices(localStorage.token).catch((e) => {
toast.error(`${e}`);
});
if (res) {
console.log(res);
voices = res.voices;
}
}
}
};
@@ -67,6 +91,9 @@
STTEngine = $settings?.audio?.stt?.engine ?? '';
TTSEngine = $settings?.audio?.tts?.engine ?? '';
TTSEngineConfig = $settings?.audio?.tts?.engineConfig ?? {};
if ($settings?.audio?.tts?.defaultVoice === $config.audio.tts.voice) {
voice = $settings?.audio?.tts?.voice ?? $config.audio.tts.voice ?? '';
} else {
@@ -77,6 +104,51 @@
await getVoices();
});
// Svelte reactive statement: calls onTTSEngineChange() when TTSEngine or
// TTSEngineConfig changes and both are truthy. The returned promise is not
// awaited here.
$: if (TTSEngine && TTSEngineConfig) {
onTTSEngineChange();
}
// Handler for a change of the TTS engine or its config: the in-browser Kokoro
// engine needs its model (re)loaded, every other engine needs nothing here.
const onTTSEngineChange = async () => {
	const usesKokoro = TTSEngine === 'browser-kokoro';
	if (!usesKokoro) {
		return;
	}

	await loadKokoro();
};
/**
 * Loads the Kokoro.js model for in-browser TTS and refreshes the voice list.
 *
 * Does nothing unless TTSEngine is 'browser-kokoro'. For that engine the voice
 * list is cleared first; the model is then only loaded once a dtype has been
 * chosen in TTSEngineConfig.
 *
 * Load failures are reported with a toast and logged instead of being thrown,
 * because this function is reached from a reactive statement that does not
 * await or catch the returned promise. TTSModelLoading is always reset when the
 * attempt finishes, whether it succeeded or failed.
 *
 * @returns {Promise<void>}
 */
const loadKokoro = async () => {
	if (TTSEngine !== 'browser-kokoro') {
		return;
	}

	// Voices belong to a loaded model, so drop the old list right away.
	voices = [];

	if (!TTSEngineConfig?.dtype) {
		return;
	}

	TTSModel = null;
	TTSModelProgress = null;
	TTSModelLoading = true;

	const model_id = 'onnx-community/Kokoro-82M-v1.0-ONNX';

	try {
		TTSModel = await KokoroTTS.from_pretrained(model_id, {
			dtype: TTSEngineConfig.dtype, // one of the dtypes offered in the settings select
			device: navigator?.gpu ? 'webgpu' : 'wasm', // use WebGPU when the browser exposes it
			progress_callback: (e) => {
				TTSModelProgress = e;
				console.log(e);
			}
		});

		await getVoices();
	} catch (e) {
		// Surface the failure the same way getVoices reports its own errors.
		console.error(e);
		toast.error(`${e}`);
	} finally {
		TTSModelLoading = false;
	}
};
</script>
<form
@@ -88,6 +160,8 @@
engine: STTEngine !== '' ? STTEngine : undefined
},
tts: {
engine: TTSEngine !== '' ? TTSEngine : undefined,
engineConfig: TTSEngineConfig,
playbackRate: playbackRate,
voice: voice !== '' ? voice : undefined,
defaultVoice: $config?.audio?.tts?.voice ?? '',
@@ -142,6 +216,39 @@
<div>
<div class=" mb-1 text-sm font-medium">{$i18n.t('TTS Settings')}</div>
<div class=" py-0.5 flex w-full justify-between">
<div class=" self-center text-xs font-medium">{$i18n.t('Text-to-Speech Engine')}</div>
<div class="flex items-center relative">
<select
class="dark:bg-gray-900 w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
bind:value={TTSEngine}
placeholder="Select an engine"
>
<option value="">{$i18n.t('Default')}</option>
<option value="browser-kokoro">{$i18n.t('Kokoro.js (Browser)')}</option>
</select>
</div>
</div>
{#if TTSEngine === 'browser-kokoro'}
<div class=" py-0.5 flex w-full justify-between">
<div class=" self-center text-xs font-medium">{$i18n.t('Kokoro.js Dtype')}</div>
<div class="flex items-center relative">
<select
class="dark:bg-gray-900 w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
bind:value={TTSEngineConfig.dtype}
placeholder="Select dtype"
>
<option value="" disabled selected>Select dtype</option>
<option value="fp32">fp32</option>
<option value="fp16">fp16</option>
<option value="q8">q8</option>
<option value="q4">q4</option>
</select>
</div>
</div>
{/if}
<div class=" py-0.5 flex w-full justify-between">
<div class=" self-center text-xs font-medium">{$i18n.t('Auto-playback response')}</div>
@@ -178,7 +285,46 @@
<hr class=" dark:border-gray-850" />
{#if $config.audio.tts.engine === ''}
{#if TTSEngine === 'browser-kokoro'}
{#if TTSModel}
<div>
<div class=" mb-2.5 text-sm font-medium">{$i18n.t('Set Voice')}</div>
<div class="flex w-full">
<div class="flex-1">
<input
list="voice-list"
class="w-full rounded-lg py-2 px-4 text-sm dark:text-gray-300 dark:bg-gray-850 outline-none"
bind:value={voice}
placeholder="Select a voice"
/>
<datalist id="voice-list">
{#each voices as voice}
<option value={voice.id}>{voice.name}</option>
{/each}
</datalist>
</div>
</div>
</div>
{:else}
<div>
<div class=" mb-2.5 text-sm font-medium flex gap-2 items-center">
<Spinner className="size-4" />
<div class=" text-sm font-medium shimmer">
{$i18n.t('Loading Kokoro.js...')}
{TTSModelProgress && TTSModelProgress.status === 'progress'
? `(${Math.round(TTSModelProgress.progress * 10) / 10}%)`
: ''}
</div>
</div>
<div class="text-xs text-gray-500">
{$i18n.t('Please do not close the settings page while loading the model.')}
</div>
</div>
{/if}
{:else if $config.audio.tts.engine === ''}
<div>
<div class=" mb-2.5 text-sm font-medium">{$i18n.t('Set Voice')}</div>
<div class="flex w-full">