diff --git a/src/lib/components/chat/Messages/ResponseMessage.svelte b/src/lib/components/chat/Messages/ResponseMessage.svelte index f6a4b0bc0..c83980482 100644 --- a/src/lib/components/chat/Messages/ResponseMessage.svelte +++ b/src/lib/components/chat/Messages/ResponseMessage.svelte @@ -4,12 +4,18 @@ import { createEventDispatcher } from 'svelte'; import { onMount, tick, getContext } from 'svelte'; + import type { Writable } from 'svelte/store'; + import type { i18n as i18nType } from 'i18next'; const i18n = getContext>('i18n'); const dispatch = createEventDispatcher(); - import { config, models, settings, user } from '$lib/stores'; + import { createNewFeedback, getFeedbackById, updateFeedbackById } from '$lib/apis/evaluations'; + import { getChatById } from '$lib/apis/chats'; + import { generateTags } from '$lib/apis'; + + import { config, models, settings, TTSWorker, user } from '$lib/stores'; import { synthesizeOpenAISpeech } from '$lib/apis/audio'; import { imageGenerations } from '$lib/apis/images'; import { @@ -34,13 +40,8 @@ import Error from './Error.svelte'; import Citations from './Citations.svelte'; import CodeExecutions from './CodeExecutions.svelte'; - - import type { Writable } from 'svelte/store'; - import type { i18n as i18nType } from 'i18next'; import ContentRenderer from './ContentRenderer.svelte'; - import { createNewFeedback, getFeedbackById, updateFeedbackById } from '$lib/apis/evaluations'; - import { getChatById } from '$lib/apis/chats'; - import { generateTags } from '$lib/apis'; + import { KokoroWorker } from '$lib/workers/KokoroWorker'; interface MessageType { id: string; @@ -193,62 +194,7 @@ speaking = true; - if ($config.audio.tts.engine !== '') { - loadingSpeech = true; - - const messageContentParts: string[] = getMessageContentParts( - message.content, - $config?.audio?.tts?.split_on ?? 
'punctuation' - ); - - if (!messageContentParts.length) { - console.log('No content to speak'); - toast.info($i18n.t('No content to speak')); - - speaking = false; - loadingSpeech = false; - return; - } - - console.debug('Prepared message content for TTS', messageContentParts); - - audioParts = messageContentParts.reduce( - (acc, _sentence, idx) => { - acc[idx] = null; - return acc; - }, - {} as typeof audioParts - ); - - let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately - - for (const [idx, sentence] of messageContentParts.entries()) { - const res = await synthesizeOpenAISpeech( - localStorage.token, - $settings?.audio?.tts?.defaultVoice === $config.audio.tts.voice - ? ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice) - : $config?.audio?.tts?.voice, - sentence - ).catch((error) => { - console.error(error); - toast.error(`${error}`); - - speaking = false; - loadingSpeech = false; - }); - - if (res) { - const blob = await res.blob(); - const blobUrl = URL.createObjectURL(blob); - const audio = new Audio(blobUrl); - audio.playbackRate = $settings.audio?.tts?.playbackRate ?? 1; - - audioParts[idx] = audio; - loadingSpeech = false; - lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx)); - } - } - } else { + if ($config.audio.tts.engine === '') { let voices = []; const getVoicesLoop = setInterval(() => { voices = speechSynthesis.getVoices(); @@ -283,6 +229,99 @@ speechSynthesis.speak(speak); } }, 100); + } else { + loadingSpeech = true; + + const messageContentParts: string[] = getMessageContentParts( + message.content, + $config?.audio?.tts?.split_on ?? 
'punctuation' + ); + + if (!messageContentParts.length) { + console.log('No content to speak'); + toast.info($i18n.t('No content to speak')); + + speaking = false; + loadingSpeech = false; + return; + } + + console.debug('Prepared message content for TTS', messageContentParts); + + audioParts = messageContentParts.reduce( + (acc, _sentence, idx) => { + acc[idx] = null; + return acc; + }, + {} as typeof audioParts + ); + + let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately + + if ($settings.audio?.tts?.engine === 'browser-kokoro') { + if (!$TTSWorker) { + await TTSWorker.set( + new KokoroWorker({ + dtype: $settings.audio?.tts?.engineConfig?.dtype ?? 'fp32' + }) + ); + + await $TTSWorker.init(); + } + + console.log($TTSWorker); + + for (const [idx, sentence] of messageContentParts.entries()) { + const blob = await $TTSWorker + .generate({ + text: sentence, + voice: $settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice + }) + .catch((error) => { + console.error(error); + toast.error(`${error}`); + + speaking = false; + loadingSpeech = false; + }); + + if (blob) { + const audio = new Audio(blob); + audio.playbackRate = $settings.audio?.tts?.playbackRate ?? 1; + + audioParts[idx] = audio; + loadingSpeech = false; + lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx)); + } + } + } else { + for (const [idx, sentence] of messageContentParts.entries()) { + const res = await synthesizeOpenAISpeech( + localStorage.token, + $settings?.audio?.tts?.defaultVoice === $config.audio.tts.voice + ? ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice) + : $config?.audio?.tts?.voice, + sentence + ).catch((error) => { + console.error(error); + toast.error(`${error}`); + + speaking = false; + loadingSpeech = false; + }); + + if (res) { + const blob = await res.blob(); + const blobUrl = URL.createObjectURL(blob); + const audio = new Audio(blobUrl); + audio.playbackRate = $settings.audio?.tts?.playbackRate ?? 
1; + + audioParts[idx] = audio; + loadingSpeech = false; + lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx)); + } + } + } } }; diff --git a/src/lib/components/chat/Settings/Audio.svelte b/src/lib/components/chat/Settings/Audio.svelte index 3f9fa9335..ce9cae771 100644 --- a/src/lib/components/chat/Settings/Audio.svelte +++ b/src/lib/components/chat/Settings/Audio.svelte @@ -1,11 +1,14 @@
{$i18n.t('TTS Settings')}
+
+
{$i18n.t('Text-to-Speech Engine')}
+
+ +
+
+ + {#if TTSEngine === 'browser-kokoro'} +
+
{$i18n.t('Kokoro.js Dtype')}
+
+ +
+
+ {/if} +
{$i18n.t('Auto-playback response')}
@@ -178,7 +285,46 @@
- {#if $config.audio.tts.engine === ''} + {#if TTSEngine === 'browser-kokoro'} + {#if TTSModel} +
+
{$i18n.t('Set Voice')}
+
+
+ + + + {#each voices as voice} + + {/each} + +
+
+
+ {:else} +
+
+ + +
+ {$i18n.t('Loading Kokoro.js...')} + {TTSModelProgress && TTSModelProgress.status === 'progress' + ? `(${Math.round(TTSModelProgress.progress * 10) / 10}%)` + : ''} +
+
+ +
+ {$i18n.t('Please do not close the settings page while loading the model.')} +
+
+ {/if} + {:else if $config.audio.tts.engine === ''}
{$i18n.t('Set Voice')}
diff --git a/src/lib/stores/index.ts b/src/lib/stores/index.ts
index 1b8839556..f96670cb6 100644
--- a/src/lib/stores/index.ts
+++ b/src/lib/stores/index.ts
@@ -41,6 +41,8 @@ export const shortCodesToEmojis = writable(
 	}, {})
 );
 
+export const TTSWorker = writable(null);
+
 export const chatId = writable('');
 export const chatTitle = writable('');
 
diff --git a/src/lib/workers/KokoroWorker.ts b/src/lib/workers/KokoroWorker.ts
new file mode 100644
index 000000000..e5cc4b930
--- /dev/null
+++ b/src/lib/workers/KokoroWorker.ts
@@ -0,0 +1,116 @@
+import WorkerInstance from '$lib/workers/kokoro.worker?worker';
+
+/** Options object accepted by the `KokoroWorker` constructor. */
+export interface KokoroWorkerOptions {
+	/** Value forwarded to the worker as `dtype` in the `init` message. */
+	dtype?: string;
+}
+
+/**
+ * Main-thread wrapper around the Kokoro.js web worker.
+ *
+ * Call `init()` once and wait for it to resolve, then call `generate()` for
+ * each piece of text. Do not overlap `generate()` calls: replies carry no
+ * request id, so every pending call would settle on the first reply.
+ */
+export class KokoroWorker {
+	private worker: Worker | null = null;
+	private initialized = false;
+	private readonly dtype: string;
+
+	/**
+	 * @param dtype Either the dtype string itself or an options object such as
+	 *   `{ dtype: 'fp32' }`. Defaults to `'fp32'`.
+	 */
+	constructor(dtype: string | KokoroWorkerOptions = 'fp32') {
+		// Unwrap the options form so the worker always receives a plain string.
+		this.dtype = typeof dtype === 'string' ? dtype : (dtype.dtype ?? 'fp32');
+	}
+
+	/**
+	 * Spawns the worker and asks it to load the model.
+	 *
+	 * Resolves once the worker reports `init:complete`. Rejects when it reports
+	 * `init:error` or raises an uncaught error before initialization finishes;
+	 * in both cases the worker is terminated so `init()` can be called again.
+	 */
+	public async init(): Promise<void> {
+		if (this.worker) {
+			console.warn('KokoroWorker is already initialized.');
+			return;
+		}
+
+		const worker = new WorkerInstance();
+		this.worker = worker;
+
+		return new Promise<void>((resolve, reject) => {
+			worker.onmessage = (event: MessageEvent) => {
+				const { status, error } = event.data;
+
+				if (status === 'init:complete') {
+					this.initialized = true;
+					resolve();
+				} else if (status === 'init:error') {
+					console.error(error);
+					// Drop the failed worker; otherwise a retry would hit the
+					// "already initialized" guard above and never load the model.
+					this.terminate();
+					reject(new Error(error));
+				}
+			};
+
+			worker.onerror = (event: ErrorEvent) => {
+				// Only acts while loading: without it, a worker that fails to
+				// start would leave this promise pending forever.
+				if (this.initialized) return;
+				console.error(event);
+				this.terminate();
+				reject(new Error(event.message || 'KokoroTTS Worker failed to start.'));
+			};
+
+			worker.postMessage({
+				type: 'init',
+				payload: { dtype: this.dtype }
+			});
+		});
+	}
+
+	/**
+	 * Synthesizes `text` with the given `voice`.
+	 *
+	 * @returns The `audioUrl` string from the worker's `generate:complete` reply.
+	 * @throws Error if `init()` has not completed; the returned promise also
+	 *   rejects with the worker's message when it replies `generate:error`.
+	 */
+	public async generate({ text, voice }: { text: string; voice: string }): Promise<string> {
+		const worker = this.worker;
+		if (!this.initialized || !worker) {
+			throw new Error('KokoroTTS Worker is not initialized yet.');
+		}
+
+		return new Promise<string>((resolve, reject) => {
+			const handleMessage = (event: MessageEvent) => {
+				if (event.data.status === 'generate:complete') {
+					worker.removeEventListener('message', handleMessage);
+					resolve(event.data.audioUrl);
+				} else if (event.data.status === 'generate:error') {
+					worker.removeEventListener('message', handleMessage);
+					reject(new Error(event.data.error));
+				}
+			};
+
+			// Subscribe before posting so the listener is in place for the reply.
+			worker.addEventListener('message', handleMessage);
+			worker.postMessage({ type: 'generate', payload: { text, voice } });
+		});
+	}
+
+	/** Stops the worker, if any, and resets the instance to its initial state. */
+	public terminate(): void {
+		if (this.worker) {
+			this.worker.terminate();
+			this.worker = null;
+			this.initialized = false;
+		}
+	}
+}
diff --git a/src/lib/workers/kokoro.worker.ts b/src/lib/workers/kokoro.worker.ts
new file mode 100644
index 000000000..39277330f
--- /dev/null
+++ b/src/lib/workers/kokoro.worker.ts
@@ -0,0 +1,64 @@
+import { KokoroTTS } from 'kokoro-js';
+
+let tts: KokoroTTS | undefined;
+let isInitialized = false; // Flag to track initialization status
+const DEFAULT_MODEL_ID = 'onnx-community/Kokoro-82M-v1.0-ONNX'; // Default model
+
+/** Turns anything caught by `catch` into a message string safe to post back. */
+const toErrorMessage = (error: unknown): string =>
+	error instanceof Error ? error.message : String(error);
+
+/**
+ * Message protocol (requests are `{ type, payload }`, replies are `{ status, ... }`):
+ * - `init`: replies `init:start`, then `init:complete` or `init:error`.
+ * - `generate`: replies `generate:error` if the model is not loaded; otherwise
+ *   `generate:start`, then `generate:complete` (with `audioUrl`) or `generate:error`.
+ * - `status`: replies `status:check` with the current `initialized` flag.
+ */
+self.onmessage = async (event) => {
+	const { type, payload } = event.data;
+
+	if (type === 'init') {
+		const { dtype } = payload;
+		const model_id = payload.model_id || DEFAULT_MODEL_ID; // Use default model if none provided
+
+		self.postMessage({ status: 'init:start' });
+
+		try {
+			tts = await KokoroTTS.from_pretrained(model_id, {
+				dtype,
+				device: !!navigator?.gpu ? 'webgpu' : 'wasm' // Detect WebGPU
+			});
+			isInitialized = true; // Mark as initialized after successful loading
+			self.postMessage({ status: 'init:complete' });
+		} catch (error) {
+			isInitialized = false; // Ensure it's marked as false on failure
+			self.postMessage({ status: 'init:error', error: toErrorMessage(error) });
+		}
+	}
+
+	if (type === 'generate') {
+		if (!isInitialized || !tts) {
+			// Ensure model is initialized
+			self.postMessage({ status: 'generate:error', error: 'TTS model not initialized' });
+			return;
+		}
+
+		const { text, voice } = payload;
+		self.postMessage({ status: 'generate:start' });
+
+		try {
+			const rawAudio = await tts.generate(text, { voice });
+			const blob = await rawAudio.toBlob();
+			const blobUrl = URL.createObjectURL(blob);
+			self.postMessage({ status: 'generate:complete', audioUrl: blobUrl });
+		} catch (error) {
+			self.postMessage({ status: 'generate:error', error: toErrorMessage(error) });
+		}
+	}
+
+	if (type === 'status') {
+		// Respond with the current initialization status
+		self.postMessage({ status: 'status:check', initialized: isInitialized });
+	}
+};