feat: Kokoro-js TTS support

2025-06-26 18:26:48 +00:00 · 2025-02-09 23:42:27 -08:00
parent a22d1d5410
commit 205ce635f6
5 changed files with 388 additions and 78 deletions
--- a/src/lib/components/chat/Settings/Audio.svelte
+++ b/src/lib/components/chat/Settings/Audio.svelte
@@ -1,11 +1,14 @@
 <script lang="ts">
 	import { toast } from 'svelte-sonner';
 	import { createEventDispatcher, onMount, getContext } from 'svelte';
+	import { KokoroTTS } from 'kokoro-js';

 	import { user, settings, config } from '$lib/stores';
 	import { getVoices as _getVoices } from '$lib/apis/audio';

 	import Switch from '$lib/components/common/Switch.svelte';
+	import { round } from '@huggingface/transformers';
+	import Spinner from '$lib/components/common/Spinner.svelte';
 	const dispatch = createEventDispatcher();

 	const i18n = getContext('i18n');
@@ -20,6 +23,13 @@

 	let STTEngine = '';

+	let TTSEngine = '';
+	let TTSEngineConfig = {};
+
+	let TTSModel = null;
+	let TTSModelProgress = null;
+	let TTSModelLoading = false;
+
 	let voices = [];
 	let voice = '';

@@ -28,23 +38,37 @@
 	const speedOptions = [2, 1.75, 1.5, 1.25, 1, 0.75, 0.5];

 	const getVoices = async () => {
-		if ($config.audio.tts.engine === '') {
-			const getVoicesLoop = setInterval(async () => {
-				voices = await speechSynthesis.getVoices();
+		if (TTSEngine === 'browser-kokoro') {
+			if (!TTSModel) {
+				await loadKokoro();
+			}

-				// do your loop
-				if (voices.length > 0) {
-					clearInterval(getVoicesLoop);
-				}
-			}, 100);
-		} else {
-			const res = await _getVoices(localStorage.token).catch((e) => {
-				toast.error(`${e}`);
+			voices = Object.entries(TTSModel.voices).map(([key, value]) => {
+				return {
+					id: key,
+					name: value.name,
+					localService: false
+				};
 			});
+		} else {
+			if ($config.audio.tts.engine === '') {
+				const getVoicesLoop = setInterval(async () => {
+					voices = await speechSynthesis.getVoices();

-			if (res) {
-				console.log(res);
-				voices = res.voices;
+					// do your loop
+					if (voices.length > 0) {
+						clearInterval(getVoicesLoop);
+					}
+				}, 100);
+			} else {
+				const res = await _getVoices(localStorage.token).catch((e) => {
+					toast.error(`${e}`);
+				});
+
+				if (res) {
+					console.log(res);
+					voices = res.voices;
+				}
 			}
 		}
 	};
@@ -67,6 +91,9 @@

 		STTEngine = $settings?.audio?.stt?.engine ?? '';

+		TTSEngine = $settings?.audio?.tts?.engine ?? '';
+		TTSEngineConfig = $settings?.audio?.tts?.engineConfig ?? {};
+
 		if ($settings?.audio?.tts?.defaultVoice === $config.audio.tts.voice) {
 			voice = $settings?.audio?.tts?.voice ?? $config.audio.tts.voice ?? '';
 		} else {
@@ -77,6 +104,51 @@

 		await getVoices();
 	});
+
+	$: if (TTSEngine && TTSEngineConfig) {
+		onTTSEngineChange();
+	}
+
+	const onTTSEngineChange = async () => {
+		if (TTSEngine === 'browser-kokoro') {
+			await loadKokoro();
+		}
+	};
+
+	const loadKokoro = async () => {
+		if (TTSEngine === 'browser-kokoro') {
+			voices = [];
+
+			if (TTSEngineConfig?.dtype) {
+				TTSModel = null;
+				TTSModelProgress = null;
+				TTSModelLoading = true;
+
+				const model_id = 'onnx-community/Kokoro-82M-v1.0-ONNX';
+
+				TTSModel = await KokoroTTS.from_pretrained(model_id, {
+					dtype: TTSEngineConfig.dtype, // Options: "fp32", "fp16", "q8", "q4", "q4f16"
+					device: !!navigator?.gpu ? 'webgpu' : 'wasm', // Detect WebGPU
+					progress_callback: (e) => {
+						TTSModelProgress = e;
+						console.log(e);
+					}
+				});
+
+				await getVoices();
+
+				// const rawAudio = await tts.generate(inputText, {
+				// 	// Use `tts.list_voices()` to list all available voices
+				// 	voice: voice
+				// });
+
+				// const blobUrl = URL.createObjectURL(await rawAudio.toBlob());
+				// const audio = new Audio(blobUrl);
+
+				// audio.play();
+			}
+		}
+	};
 </script>

 <form
@@ -88,6 +160,8 @@
 					engine: STTEngine !== '' ? STTEngine : undefined
 				},
 				tts: {
+					engine: TTSEngine !== '' ? TTSEngine : undefined,
+					engineConfig: TTSEngineConfig,
 					playbackRate: playbackRate,
 					voice: voice !== '' ? voice : undefined,
 					defaultVoice: $config?.audio?.tts?.voice ?? '',
@@ -142,6 +216,39 @@
 		<div>
 			<div class=" mb-1 text-sm font-medium">{$i18n.t('TTS Settings')}</div>

+			<div class=" py-0.5 flex w-full justify-between">
+				<div class=" self-center text-xs font-medium">{$i18n.t('Text-to-Speech Engine')}</div>
+				<div class="flex items-center relative">
+					<select
+						class="dark:bg-gray-900 w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
+						bind:value={TTSEngine}
+						placeholder="Select an engine"
+					>
+						<option value="">{$i18n.t('Default')}</option>
+						<option value="browser-kokoro">{$i18n.t('Kokoro.js (Browser)')}</option>
+					</select>
+				</div>
+			</div>
+
+			{#if TTSEngine === 'browser-kokoro'}
+				<div class=" py-0.5 flex w-full justify-between">
+					<div class=" self-center text-xs font-medium">{$i18n.t('Kokoro.js Dtype')}</div>
+					<div class="flex items-center relative">
+						<select
+							class="dark:bg-gray-900 w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
+							bind:value={TTSEngineConfig.dtype}
+							placeholder="Select dtype"
+						>
+							<option value="" disabled selected>Select dtype</option>
+							<option value="fp32">fp32</option>
+							<option value="fp16">fp16</option>
+							<option value="q8">q8</option>
+							<option value="q4">q4</option>
+						</select>
+					</div>
+				</div>
+			{/if}
+
 			<div class=" py-0.5 flex w-full justify-between">
 				<div class=" self-center text-xs font-medium">{$i18n.t('Auto-playback response')}</div>

@@ -178,7 +285,46 @@

 		<hr class=" dark:border-gray-850" />

-		{#if $config.audio.tts.engine === ''}
+		{#if TTSEngine === 'browser-kokoro'}
+			{#if TTSModel}
+				<div>
+					<div class=" mb-2.5 text-sm font-medium">{$i18n.t('Set Voice')}</div>
+					<div class="flex w-full">
+						<div class="flex-1">
+							<input
+								list="voice-list"
+								class="w-full rounded-lg py-2 px-4 text-sm dark:text-gray-300 dark:bg-gray-850 outline-none"
+								bind:value={voice}
+								placeholder="Select a voice"
+							/>
+
+							<datalist id="voice-list">
+								{#each voices as voice}
+									<option value={voice.id}>{voice.name}</option>
+								{/each}
+							</datalist>
+						</div>
+					</div>
+				</div>
+			{:else}
+				<div>
+					<div class=" mb-2.5 text-sm font-medium flex gap-2 items-center">
+						<Spinner className="size-4" />
+
+						<div class=" text-sm font-medium shimmer">
+							{$i18n.t('Loading Kokoro.js...')}
+							{TTSModelProgress && TTSModelProgress.status === 'progress'
+								? `(${Math.round(TTSModelProgress.progress * 10) / 10}%)`
+								: ''}
+						</div>
+					</div>
+
+					<div class="text-xs text-gray-500">
+						{$i18n.t('Please do not close the settings page while loading the model.')}
+					</div>
+				</div>
+			{/if}
+		{:else if $config.audio.tts.engine === ''}
 			<div>
 				<div class=" mb-2.5 text-sm font-medium">{$i18n.t('Set Voice')}</div>
 				<div class="flex w-full">