refac: audio

2025-06-26 18:26:48 +00:00 · 2024-06-07 20:18:48 -07:00
parent da47c2dfa3
commit 55dc6c1b3b
20 changed files with 769 additions and 464 deletions
--- a/src/lib/components/chat/MessageInput/CallOverlay.svelte
+++ b/src/lib/components/chat/MessageInput/CallOverlay.svelte
@@ -1,5 +1,5 @@
 <script lang="ts">
-	import { settings, showCallOverlay } from '$lib/stores';
+	import { config, settings, showCallOverlay } from '$lib/stores';
 	import { onMount, tick, getContext } from 'svelte';

 	import { blobToFile, calculateSHA256, extractSentences, findWordIndices } from '$lib/utils';
@@ -159,9 +159,9 @@
 	const getOpenAISpeech = async (text) => {
 		const res = await synthesizeOpenAISpeech(
 			localStorage.token,
-			$settings?.audio?.speaker ?? 'alloy',
+			$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
 			text,
-			$settings?.audio?.model ?? 'tts-1'
+			$settings?.audio?.tts?.model ?? $config?.audio?.tts?.model
 		).catch((error) => {
 			toast.error(error);
 			assistantSpeaking = false;
@@ -207,10 +207,29 @@
 	const assistantSpeakingHandler = async (content) => {
 		assistantSpeaking = true;

-		if (($settings?.audio?.TTSEngine ?? '') == '') {
-			currentUtterance = new SpeechSynthesisUtterance(content);
-			speechSynthesis.speak(currentUtterance);
-		} else if ($settings?.audio?.TTSEngine === 'openai') {
+		if (($config.audio.tts.engine ?? '') == '') {
+			let voices = [];
+			const getVoicesLoop = setInterval(async () => {
+				voices = await speechSynthesis.getVoices();
+				if (voices.length > 0) {
+					clearInterval(getVoicesLoop);
+
+					const voice =
+						voices
+							?.filter(
+								(v) => v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
+							)
+							?.at(0) ?? undefined;
+
+					console.log($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice);
+					console.log(voices);
+
+					currentUtterance = new SpeechSynthesisUtterance(content);
+					currentUtterance.voice = voice;
+					speechSynthesis.speak(currentUtterance);
+				}
+			}, 100);
+		} else if ($config.audio.tts.engine === 'openai') {
 			console.log('openai');

 			const sentences = extractSentences(content).reduce((mergedTexts, currentText) => {
@@ -236,9 +255,9 @@
 			for (const [idx, sentence] of sentences.entries()) {
 				const res = await synthesizeOpenAISpeech(
 					localStorage.token,
-					$settings?.audio?.speaker,
+					$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
 					sentence,
-					$settings?.audio?.model
+					$settings?.audio?.tts?.model ?? $config?.audio?.tts?.model
 				).catch((error) => {
 					toast.error(error);

--- a/src/lib/components/chat/MessageInput/VoiceRecording.svelte
+++ b/src/lib/components/chat/MessageInput/VoiceRecording.svelte
@@ -169,7 +169,7 @@
 		mediaRecorder.ondataavailable = (event) => audioChunks.push(event.data);
 		mediaRecorder.onstop = async () => {
 			console.log('Recording stopped');
-			if (($settings?.audio?.STTEngine ?? '') === 'web') {
+			if (($settings?.audio?.stt?.engine ?? '') === 'web') {
 				audioChunks = [];
 			} else {
 				if (confirmed) {
@@ -186,7 +186,7 @@
 		};
 		mediaRecorder.start();

-		if (($settings?.audio?.STTEngine ?? '') === 'web') {
+		if (($settings?.audio?.stt?.engine ?? '') === 'web') {
 			if ('SpeechRecognition' in window || 'webkitSpeechRecognition' in window) {
 				// Create a SpeechRecognition object
 				speechRecognition = new (window.SpeechRecognition || window.webkitSpeechRecognition)();
--- a/src/lib/components/chat/Messages/ResponseMessage.svelte
+++ b/src/lib/components/chat/Messages/ResponseMessage.svelte
@@ -213,7 +213,7 @@
 		} else {
 			speaking = true;

-			if ($settings?.audio?.TTSEngine === 'openai') {
+			if ($config.audio.tts.engine === 'openai') {
 				loadingSpeech = true;

 				const sentences = extractSentences(message.content).reduce((mergedTexts, currentText) => {
@@ -244,9 +244,9 @@
 				for (const [idx, sentence] of sentences.entries()) {
 					const res = await synthesizeOpenAISpeech(
 						localStorage.token,
-						$settings?.audio?.speaker,
+						$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
 						sentence,
-						$settings?.audio?.model
+						$settings?.audio?.tts?.model ?? $config?.audio?.tts?.model
 					).catch((error) => {
 						toast.error(error);

@@ -273,7 +273,11 @@
 						clearInterval(getVoicesLoop);

 						const voice =
-							voices?.filter((v) => v.name === $settings?.audio?.speaker)?.at(0) ?? undefined;
+							voices
+								?.filter(
+									(v) => v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
+								)
+								?.at(0) ?? undefined;

 						const speak = new SpeechSynthesisUtterance(message.content);

--- a/src/lib/components/chat/Settings/Audio.svelte
+++ b/src/lib/components/chat/Settings/Audio.svelte
@@ -1,6 +1,5 @@
 <script lang="ts">
-	import { getAudioConfig, updateAudioConfig } from '$lib/apis/audio';
-	import { user, settings } from '$lib/stores';
+	import { user, settings, config } from '$lib/stores';
 	import { createEventDispatcher, onMount, getContext } from 'svelte';
 	import { toast } from 'svelte-sonner';
 	import Switch from '$lib/components/common/Switch.svelte';
@@ -11,26 +10,15 @@
 	export let saveSettings: Function;

 	// Audio
-
-	let OpenAIUrl = '';
-	let OpenAIKey = '';
-	let OpenAISpeaker = '';
-
-	let STTEngines = ['', 'openai'];
-	let STTEngine = '';
-
 	let conversationMode = false;
 	let speechAutoSend = false;
 	let responseAutoPlayback = false;
 	let nonLocalVoices = false;

-	let TTSEngines = ['', 'openai'];
-	let TTSEngine = '';
+	let STTEngine = '';

 	let voices = [];
-	let speaker = '';
-	let models = [];
-	let model = '';
+	let voice = '';

 	const getOpenAIVoices = () => {
 		voices = [
@@ -43,10 +31,6 @@
 		];
 	};

-	const getOpenAIVoicesModel = () => {
-		models = [{ name: 'tts-1' }, { name: 'tts-1-hd' }];
-	};
-
 	const getWebAPIVoices = () => {
 		const getVoicesLoop = setInterval(async () => {
 			voices = await speechSynthesis.getVoices();
@@ -58,21 +42,6 @@
 		}, 100);
 	};

-	const toggleConversationMode = async () => {
-		conversationMode = !conversationMode;
-
-		if (conversationMode) {
-			responseAutoPlayback = true;
-			speechAutoSend = true;
-		}
-
-		saveSettings({
-			conversationMode: conversationMode,
-			responseAutoPlayback: responseAutoPlayback,
-			speechAutoSend: speechAutoSend
-		});
-	};
-
 	const toggleResponseAutoPlayback = async () => {
 		responseAutoPlayback = !responseAutoPlayback;
 		saveSettings({ responseAutoPlayback: responseAutoPlayback });
@@ -83,76 +52,35 @@
 		saveSettings({ speechAutoSend: speechAutoSend });
 	};

-	const updateConfigHandler = async () => {
-		if (TTSEngine === 'openai') {
-			const res = await updateAudioConfig(localStorage.token, {
-				url: OpenAIUrl,
-				key: OpenAIKey,
-				model: model,
-				speaker: OpenAISpeaker
-			});
-
-			if (res) {
-				OpenAIUrl = res.OPENAI_API_BASE_URL;
-				OpenAIKey = res.OPENAI_API_KEY;
-				model = res.OPENAI_API_MODEL;
-				OpenAISpeaker = res.OPENAI_API_VOICE;
-			}
-		}
-	};
-
 	onMount(async () => {
 		conversationMode = $settings.conversationMode ?? false;
 		speechAutoSend = $settings.speechAutoSend ?? false;
 		responseAutoPlayback = $settings.responseAutoPlayback ?? false;

-		STTEngine = $settings?.audio?.STTEngine ?? '';
-		TTSEngine = $settings?.audio?.TTSEngine ?? '';
-		nonLocalVoices = $settings.audio?.nonLocalVoices ?? false;
-		speaker = $settings?.audio?.speaker ?? '';
-		model = $settings?.audio?.model ?? '';
+		STTEngine = $settings?.audio?.stt?.engine ?? '';
+		voice = $settings?.audio?.tts?.voice ?? $config.audio.tts.voice ?? '';
+		nonLocalVoices = $settings.audio?.tts?.nonLocalVoices ?? false;

-		if (TTSEngine === 'openai') {
+		if ($config.audio.tts.engine === 'openai') {
 			getOpenAIVoices();
-			getOpenAIVoicesModel();
 		} else {
 			getWebAPIVoices();
 		}
-
-		if ($user.role === 'admin') {
-			const res = await getAudioConfig(localStorage.token);
-
-			if (res) {
-				OpenAIUrl = res.OPENAI_API_BASE_URL;
-				OpenAIKey = res.OPENAI_API_KEY;
-				model = res.OPENAI_API_MODEL;
-				OpenAISpeaker = res.OPENAI_API_VOICE;
-				if (TTSEngine === 'openai') {
-					speaker = OpenAISpeaker;
-				}
-			}
-		}
 	});
 </script>

 <form
 	class="flex flex-col h-full justify-between space-y-3 text-sm"
 	on:submit|preventDefault={async () => {
-		if ($user.role === 'admin') {
-			await updateConfigHandler();
-		}
 		saveSettings({
 			audio: {
-				STTEngine: STTEngine !== '' ? STTEngine : undefined,
-				TTSEngine: TTSEngine !== '' ? TTSEngine : undefined,
-				speaker:
-					(TTSEngine === 'openai' ? OpenAISpeaker : speaker) !== ''
-						? TTSEngine === 'openai'
-							? OpenAISpeaker
-							: speaker
-						: undefined,
-				model: model !== '' ? model : undefined,
-				nonLocalVoices: nonLocalVoices
+				stt: {
+					engine: STTEngine !== '' ? STTEngine : undefined
+				},
+				tts: {
+					voice: $config.audio.tts.engine === 'openai' ? voice : voice !== '' ? voice : undefined,
+					nonLocalVoices: $config.audio.tts.engine === '' ? nonLocalVoices : undefined
+				}
 			}
 		});
 		dispatch('save');
@@ -162,31 +90,21 @@
 		<div>
 			<div class=" mb-1 text-sm font-medium">{$i18n.t('STT Settings')}</div>

-			<div class=" py-0.5 flex w-full justify-between">
-				<div class=" self-center text-xs font-medium">{$i18n.t('Speech-to-Text Engine')}</div>
-				<div class="flex items-center relative">
-					<select
-						class="dark:bg-gray-900 w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
-						bind:value={STTEngine}
-						placeholder="Select an engine"
-						on:change={(e) => {
-							if (e.target.value !== '') {
-								navigator.mediaDevices.getUserMedia({ audio: true }).catch(function (err) {
-									toast.error(
-										$i18n.t(`Permission denied when accessing microphone: {{error}}`, {
-											error: err
-										})
-									);
-									STTEngine = '';
-								});
-							}
-						}}
-					>
-						<option value="">{$i18n.t('Default (Whisper)')}</option>
-						<option value="web">{$i18n.t('Web API')}</option>
-					</select>
+			{#if $config.audio.stt.engine !== 'web'}
+				<div class=" py-0.5 flex w-full justify-between">
+					<div class=" self-center text-xs font-medium">{$i18n.t('Speech-to-Text Engine')}</div>
+					<div class="flex items-center relative">
+						<select
+							class="dark:bg-gray-900 w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
+							bind:value={STTEngine}
+							placeholder="Select an engine"
+						>
+							<option value="">{$i18n.t('Default')}</option>
+							<option value="web">{$i18n.t('Web API')}</option>
+						</select>
+					</div>
 				</div>
-			</div>
+			{/if}

 			<div class=" py-0.5 flex w-full justify-between">
 				<div class=" self-center text-xs font-medium">
@@ -212,50 +130,6 @@
 		<div>
 			<div class=" mb-1 text-sm font-medium">{$i18n.t('TTS Settings')}</div>

-			<div class=" py-0.5 flex w-full justify-between">
-				<div class=" self-center text-xs font-medium">{$i18n.t('Text-to-Speech Engine')}</div>
-				<div class="flex items-center relative">
-					<select
-						class=" dark:bg-gray-900 w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
-						bind:value={TTSEngine}
-						placeholder="Select a mode"
-						on:change={(e) => {
-							if (e.target.value === 'openai') {
-								getOpenAIVoices();
-								OpenAISpeaker = 'alloy';
-								model = 'tts-1';
-							} else {
-								getWebAPIVoices();
-								speaker = '';
-							}
-						}}
-					>
-						<option value="">{$i18n.t('Default (Web API)')}</option>
-						<option value="openai">{$i18n.t('Open AI')}</option>
-					</select>
-				</div>
-			</div>
-
-			{#if $user.role === 'admin'}
-				{#if TTSEngine === 'openai'}
-					<div class="mt-1 flex gap-2 mb-1">
-						<input
-							class="w-full rounded-lg py-2 px-4 text-sm dark:text-gray-300 dark:bg-gray-850 outline-none"
-							placeholder={$i18n.t('API Base URL')}
-							bind:value={OpenAIUrl}
-							required
-						/>
-
-						<input
-							class="w-full rounded-lg py-2 px-4 text-sm dark:text-gray-300 dark:bg-gray-850 outline-none"
-							placeholder={$i18n.t('API Key')}
-							bind:value={OpenAIKey}
-							required
-						/>
-					</div>
-				{/if}
-			{/if}
-
 			<div class=" py-0.5 flex w-full justify-between">
 				<div class=" self-center text-xs font-medium">{$i18n.t('Auto-playback response')}</div>

@@ -277,21 +151,21 @@

 		<hr class=" dark:border-gray-700" />

-		{#if TTSEngine === ''}
+		{#if $config.audio.tts.engine === ''}
 			<div>
 				<div class=" mb-2.5 text-sm font-medium">{$i18n.t('Set Voice')}</div>
 				<div class="flex w-full">
 					<div class="flex-1">
 						<select
 							class="w-full rounded-lg py-2 px-4 text-sm dark:text-gray-300 dark:bg-gray-850 outline-none"
-							bind:value={speaker}
+							bind:value={voice}
 						>
-							<option value="" selected={speaker !== ''}>{$i18n.t('Default')}</option>
-							{#each voices.filter((v) => nonLocalVoices || v.localService === true) as voice}
+							<option value="" selected={voice !== ''}>{$i18n.t('Default')}</option>
+							{#each voices.filter((v) => nonLocalVoices || v.localService === true) as _voice}
 								<option
-									value={voice.name}
+									value={_voice.name}
 									class="bg-gray-100 dark:bg-gray-700"
-									selected={speaker === voice.name}>{voice.name}</option
+									selected={voice === _voice.name}>{_voice.name}</option
 								>
 							{/each}
 						</select>
@@ -307,7 +181,7 @@
 					</div>
 				</div>
 			</div>
-		{:else if TTSEngine === 'openai'}
+		{:else if $config.audio.tts.engine === 'openai'}
 			<div>
 				<div class=" mb-2.5 text-sm font-medium">{$i18n.t('Set Voice')}</div>
 				<div class="flex w-full">
@@ -315,7 +189,7 @@
 						<input
 							list="voice-list"
 							class="w-full rounded-lg py-2 px-4 text-sm dark:text-gray-300 dark:bg-gray-850 outline-none"
-							bind:value={OpenAISpeaker}
+							bind:value={voice}
 							placeholder="Select a voice"
 						/>

@@ -327,25 +201,6 @@
 					</div>
 				</div>
 			</div>
-			<div>
-				<div class=" mb-2.5 text-sm font-medium">{$i18n.t('Set Model')}</div>
-				<div class="flex w-full">
-					<div class="flex-1">
-						<input
-							list="model-list"
-							class="w-full rounded-lg py-2 px-4 text-sm dark:text-gray-300 dark:bg-gray-850 outline-none"
-							bind:value={model}
-							placeholder="Select a model"
-						/>
-
-						<datalist id="model-list">
-							{#each models as model}
-								<option value={model.name} />
-							{/each}
-						</datalist>
-					</div>
-				</div>
-			</div>
 		{/if}
 	</div>