diff --git a/src/lib/components/chat/MessageInput/CallOverlay.svelte b/src/lib/components/chat/MessageInput/CallOverlay.svelte
index 89283b84b..45fa47619 100644
--- a/src/lib/components/chat/MessageInput/CallOverlay.svelte
+++ b/src/lib/components/chat/MessageInput/CallOverlay.svelte
@@ -2,8 +2,8 @@
 	import { settings, showCallOverlay } from '$lib/stores';
 	import { onMount, tick, getContext } from 'svelte';
 
-	import { blobToFile, calculateSHA256, findWordIndices } from '$lib/utils';
-	import { transcribeAudio } from '$lib/apis/audio';
+	import { blobToFile, calculateSHA256, extractSentences, findWordIndices } from '$lib/utils';
+	import { synthesizeOpenAISpeech, transcribeAudio } from '$lib/apis/audio';
 	import { toast } from 'svelte-sonner';
 
 	const i18n = getContext('i18n');
@@ -14,7 +14,8 @@
 	let confirmed = false;
 
 	let assistantSpeaking = false;
-	let assistantAudio = null;
+	let assistantAudio = {};
+	let assistantAudioIdx = null;
 
 	let rmsLevel = 0;
 	let hasStartedSpeaking = false;
@@ -26,6 +27,7 @@
 	let animationFrameId;
 
 	let speechRecognition;
+	let currentUtterance = null;
 
 	let mediaRecorder;
 	let audioChunks = [];
@@ -108,14 +110,7 @@
 			// Check if initial speech/noise has started
 			const hasSound = domainData.some((value) => value > 0);
 			if (hasSound) {
-				if (assistantSpeaking) {
-					speechSynthesis.cancel();
-
-					if (assistantAudio) {
-						assistantAudio.pause();
-						assistantAudio.currentTime = 0;
-					}
-				}
+				stopAllAudio();
 				hasStartedSpeaking = true;
 				lastSoundTime = Date.now();
 			}
@@ -140,6 +135,55 @@
 			detectSound();
 		};
 
+	const stopAllAudio = () => {
+		if (currentUtterance) {
+			speechSynthesis.cancel();
+			currentUtterance = null;
+		}
+		if (assistantAudio[assistantAudioIdx]) {
+			assistantAudio[assistantAudioIdx].pause();
+			assistantAudio[assistantAudioIdx].currentTime = 0;
+		}
+		assistantSpeaking = false;
+	};
+
+	const playAudio = (idx) => {
+		return new Promise((res) => {
+			assistantAudioIdx = idx;
+			const audio = assistantAudio[idx];
+			audio.play();
+			audio.onended = async (e) => {
+				await new Promise((r) => setTimeout(r, 300));
+
+				if (Object.keys(assistantAudio).length - 1 === idx) {
+					assistantSpeaking = false;
+				}
+
+				res(e);
+			};
+		});
+	};
+
+	const getOpenAISpeech = async (text) => {
+		const res = await synthesizeOpenAISpeech(
+			localStorage.token,
+			$settings?.audio?.speaker ?? 'alloy',
+			text,
+			$settings?.audio?.model ?? 'tts-1'
+		).catch((error) => {
+			toast.error(error);
+			assistantSpeaking = false;
+			return null;
+		});
+
+		if (res) {
+			const blob = await res.blob();
+			const blobUrl = URL.createObjectURL(blob);
+			const audio = new Audio(blobUrl);
+			assistantAudio[0] = audio; // keyed by index so stopAllAudio/playAudio can reach it
+		}
+	};
+
 	const transcribeHandler = async (audioBlob) => {
 		// Create a blob from the audio chunks
 
@@ -152,21 +196,68 @@
 		});
 
 		if (res) {
-			toast.success(res.text);
+			console.log(res.text);
 
 			const _responses = await submitPrompt(res.text);
 			console.log(_responses);
 
 			if (_responses.at(0)) {
-				const response = _responses[0];
-				if (response) {
-					assistantSpeaking = true;
+				const content = _responses[0];
+				if (content) {
+					assistantSpeakingHandler(content);
+				}
+			}
+		}
+	};
 
-					if ($settings?.audio?.TTSEngine ?? '') {
-						speechSynthesis.speak(new SpeechSynthesisUtterance(response));
+	const assistantSpeakingHandler = async (content) => {
+		assistantSpeaking = true;
+
+		if (($settings?.audio?.TTSEngine ?? '') == '') {
+			currentUtterance = new SpeechSynthesisUtterance(content);
+			speechSynthesis.speak(currentUtterance);
+		} else if ($settings?.audio?.TTSEngine === 'openai') {
+			console.log('openai');
+
+			const sentences = extractSentences(content).reduce((mergedTexts, currentText) => {
+				const lastIndex = mergedTexts.length - 1;
+				if (lastIndex >= 0) {
+					const previousText = mergedTexts[lastIndex];
+					const wordCount = previousText.split(/\s+/).length;
+					if (wordCount < 2) {
+						mergedTexts[lastIndex] = previousText + ' ' + currentText;
 					} else {
-						console.log('openai');
+						mergedTexts.push(currentText);
 					}
+				} else {
+					mergedTexts.push(currentText);
+				}
+				return mergedTexts;
+			}, []);
+
+			console.log(sentences);
+
+			let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately
+
+			for (const [idx, sentence] of sentences.entries()) {
+				const res = await synthesizeOpenAISpeech(
+					localStorage.token,
+					$settings?.audio?.speaker,
+					sentence,
+					$settings?.audio?.model
+				).catch((error) => {
+					toast.error(error);
+
+					assistantSpeaking = false;
+					return null;
+				});
+
+				if (res) {
+					const blob = await res.blob();
+					const blobUrl = URL.createObjectURL(blob);
+					const audio = new Audio(blobUrl);
+					assistantAudio[idx] = audio;
+					lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx));
+				}
 			}
 		}
 		}
@@ -311,7 +402,7 @@
 							{#if loading}
 								Thinking...
 							{:else}
-								Listening... {Math.round(rmsLevel * 100)}
+								Listening...
 							{/if}
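Two techniques in assistantSpeakingHandler are worth calling out: the reducer that folds very short fragments into the preceding sentence (so the TTS endpoint is never asked to voice a stray "Hi." or "1." on its own), and the promise chain that keeps playback strictly sequential while synthesis requests run ahead of it. The sketch below is a minimal, self-contained illustration of both patterns in plain browser JavaScript; `synthesize` and `splitSentences` are hypothetical stand-ins for synthesizeOpenAISpeech and extractSentences, not the component's real API.

	// Fold any fragment whose predecessor is shorter than two words into it,
	// mirroring the reducer in the diff above.
	const mergeShortSentences = (sentences) =>
		sentences.reduce((merged, current) => {
			const last = merged.length - 1;
			if (last >= 0 && merged[last].split(/\s+/).length < 2) {
				merged[last] = merged[last] + ' ' + current;
			} else {
				merged.push(current);
			}
			return merged;
		}, []);

	// Resolve once a clip has finished playing.
	const playClip = (audio) =>
		new Promise((resolve) => {
			audio.onended = resolve;
			audio.play();
		});

	// `synthesize(sentence)` is assumed to return an audio Blob or null.
	const speakSequentially = async (text, synthesize, splitSentences) => {
		let chain = Promise.resolve(); // an already-resolved promise seeds the chain

		for (const sentence of mergeShortSentences(splitSentences(text))) {
			const blob = await synthesize(sentence); // synthesis runs ahead of playback
			if (blob) {
				const audio = new Audio(URL.createObjectURL(blob));
				// Each clip queues behind the previous one, so audio order
				// always matches sentence order.
				chain = chain.then(() => playClip(audio));
			}
		}

		await chain; // settles after the final clip ends
	};

The chaining is what hides latency: awaiting playAudio(idx) inside the loop would serialize each synthesis request behind the previous clip's playback, whereas chaining lets the next request start as soon as the previous one returns.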