refac: voice call

Timothy J. Baek 2024-06-13 20:15:23 -07:00
parent 7ea572fdca
commit 7f70de99d3
3 changed files with 407 additions and 358 deletions

View File

@@ -30,6 +30,7 @@
 	import {
 		convertMessagesToHistory,
 		copyToClipboard,
+		extractSentencesForAudio,
 		promptTemplate,
 		splitStream
 	} from '$lib/utils';
@@ -593,7 +594,15 @@
 				array.findIndex((i) => JSON.stringify(i) === JSON.stringify(item)) === index
 		);

-		eventTarget.dispatchEvent(new CustomEvent('chat:start'));
+		eventTarget.dispatchEvent(
+			new CustomEvent('chat:start', {
+				detail: {
+					id: responseMessageId
+				}
+			})
+		);
+		await tick();

 		const [res, controller] = await generateChatCompletion(localStorage.token, {
 			model: model.id,
@@ -664,9 +673,23 @@
 						continue;
 					} else {
 						responseMessage.content += data.message.content;
-						eventTarget.dispatchEvent(
-							new CustomEvent('chat', { detail: { content: data.message.content } })
-						);
+
+						const sentences = extractSentencesForAudio(responseMessage.content);
+						sentences.pop();
+
+						// dispatch only last sentence and make sure it hasn't been dispatched before
+						if (
+							sentences.length > 0 &&
+							sentences[sentences.length - 1] !== responseMessage.lastSentence
+						) {
+							responseMessage.lastSentence = sentences[sentences.length - 1];
+							eventTarget.dispatchEvent(
+								new CustomEvent('chat', {
+									detail: { id: responseMessageId, content: sentences[sentences.length - 1] }
+								})
+							);
+						}
+
 						messages = messages;
 					}
 				} else {
@@ -760,7 +783,23 @@
 			stopResponseFlag = false;
 			await tick();

-			eventTarget.dispatchEvent(new CustomEvent('chat:finish'));
+			let lastSentence = extractSentencesForAudio(responseMessage.content)?.at(-1) ?? '';
+			if (lastSentence) {
+				eventTarget.dispatchEvent(
+					new CustomEvent('chat', {
+						detail: { id: responseMessageId, content: lastSentence }
+					})
+				);
+			}
+
+			eventTarget.dispatchEvent(
+				new CustomEvent('chat:finish', {
+					detail: {
+						id: responseMessageId,
+						content: responseMessage.content
+					}
+				})
+			);

 			if (autoScroll) {
 				scrollToBottom();
@@ -802,7 +841,14 @@
 			scrollToBottom();

-		eventTarget.dispatchEvent(new CustomEvent('chat:start'));
+		eventTarget.dispatchEvent(
+			new CustomEvent('chat:start', {
+				detail: {
+					id: responseMessageId
+				}
+			})
+		);
+		await tick();

 		try {
 			const [res, controller] = await generateOpenAIChatCompletion(
@@ -924,7 +970,23 @@
 						continue;
 					} else {
 						responseMessage.content += value;
-						eventTarget.dispatchEvent(new CustomEvent('chat', { detail: { content: value } }));
+
+						const sentences = extractSentencesForAudio(responseMessage.content);
+						sentences.pop();
+
+						// dispatch only last sentence and make sure it hasn't been dispatched before
+						if (
+							sentences.length > 0 &&
+							sentences[sentences.length - 1] !== responseMessage.lastSentence
+						) {
+							responseMessage.lastSentence = sentences[sentences.length - 1];
+							eventTarget.dispatchEvent(
+								new CustomEvent('chat', {
+									detail: { id: responseMessageId, content: sentences[sentences.length - 1] }
+								})
+							);
+						}
+
 						messages = messages;
 					}
@@ -975,7 +1037,23 @@
 			stopResponseFlag = false;
 			await tick();

-			eventTarget.dispatchEvent(new CustomEvent('chat:finish'));
+			let lastSentence = extractSentencesForAudio(responseMessage.content)?.at(-1) ?? '';
+			if (lastSentence) {
+				eventTarget.dispatchEvent(
+					new CustomEvent('chat', {
+						detail: { id: responseMessageId, content: lastSentence }
+					})
+				);
+			}
+
+			eventTarget.dispatchEvent(
+				new CustomEvent('chat:finish', {
+					detail: {
+						id: responseMessageId,
+						content: responseMessage.content
+					}
+				})
+			);

 			if (autoScroll) {
 				scrollToBottom();
@@ -1207,14 +1285,18 @@
 	</title>
 </svelte:head>

-<CallOverlay
-	{submitPrompt}
-	{stopResponse}
-	bind:files
-	modelId={selectedModelIds?.at(0) ?? null}
-	chatId={$chatId}
-	{eventTarget}
-/>
+<audio id="audioElement" src="" style="display: none;" />
+
+{#if $showCallOverlay}
+	<CallOverlay
+		{submitPrompt}
+		{stopResponse}
+		bind:files
+		modelId={selectedModelIds?.at(0) ?? null}
+		chatId={$chatId}
+		{eventTarget}
+	/>
+{/if}

 {#if !chatIdProp || (loaded && chatIdProp)}
 	<div
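
The hunks above replace the single per-token 'chat' event with a small per-message contract on eventTarget: 'chat:start' carries { id }, 'chat' carries { id, content } once per completed sentence, and 'chat:finish' carries { id, content } with the full message. A minimal listener sketch, illustrative only and not part of this commit (the EventTarget instance below stands in for the eventTarget prop passed to CallOverlay):

// Hypothetical consumer of the events dispatched in the diff above.
const eventTarget = new EventTarget(); // stand-in for the shared eventTarget prop

eventTarget.addEventListener('chat:start', (e) => {
	const { id } = (e as CustomEvent).detail; // a new assistant message started streaming
	console.log('start', id);
});

eventTarget.addEventListener('chat', (e) => {
	const { id, content } = (e as CustomEvent).detail; // one newly completed sentence
	console.log('sentence', id, content);
});

eventTarget.addEventListener('chat:finish', (e) => {
	const { id, content } = (e as CustomEvent).detail; // full message content at the end
	console.log('finish', id, content);
});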

View File

@@ -2,7 +2,12 @@
 	import { config, settings, showCallOverlay } from '$lib/stores';
 	import { onMount, tick, getContext } from 'svelte';

-	import { blobToFile, calculateSHA256, extractSentences, findWordIndices } from '$lib/utils';
+	import {
+		blobToFile,
+		calculateSHA256,
+		extractSentencesForAudio,
+		findWordIndices
+	} from '$lib/utils';
 	import { generateEmoji } from '$lib/apis';
 	import { synthesizeOpenAISpeech, transcribeAudio } from '$lib/apis/audio';
@@ -32,34 +37,7 @@
 	let camera = false;
 	let cameraStream = null;

-	let assistantSpeaking = false;
 	let chatStreaming = false;

-	let assistantMessage = '';
-	let assistantSentences = [];
-	let assistantSentenceAudios = {};
-	let assistantSentenceIdx = -1;
-
-	let audioQueue = [];
-	let emojiQueue = [];
-
-	$: assistantSentences = extractSentences(assistantMessage).reduce((mergedTexts, currentText) => {
-		const lastIndex = mergedTexts.length - 1;
-		if (lastIndex >= 0) {
-			const previousText = mergedTexts[lastIndex];
-			const wordCount = previousText.split(/\s+/).length;
-			if (wordCount < 2) {
-				mergedTexts[lastIndex] = previousText + ' ' + currentText;
-			} else {
-				mergedTexts.push(currentText);
-			}
-		} else {
-			mergedTexts.push(currentText);
-		}
-		return mergedTexts;
-	}, []);
-
-	let currentUtterance = null;
-
 	let rmsLevel = 0;
 	let hasStartedSpeaking = false;
@@ -170,75 +148,6 @@
 	const MIN_DECIBELS = -45;
 	const VISUALIZER_BUFFER_LENGTH = 300;

-	// Function to calculate the RMS level from time domain data
-	const calculateRMS = (data: Uint8Array) => {
-		let sumSquares = 0;
-		for (let i = 0; i < data.length; i++) {
-			const normalizedValue = (data[i] - 128) / 128; // Normalize the data
-			sumSquares += normalizedValue * normalizedValue;
-		}
-		return Math.sqrt(sumSquares / data.length);
-	};
-
-	const analyseAudio = (stream) => {
-		const audioContext = new AudioContext();
-		const audioStreamSource = audioContext.createMediaStreamSource(stream);
-
-		const analyser = audioContext.createAnalyser();
-		analyser.minDecibels = MIN_DECIBELS;
-		audioStreamSource.connect(analyser);
-
-		const bufferLength = analyser.frequencyBinCount;
-
-		const domainData = new Uint8Array(bufferLength);
-		const timeDomainData = new Uint8Array(analyser.fftSize);
-
-		let lastSoundTime = Date.now();
-		hasStartedSpeaking = false;
-
-		const detectSound = () => {
-			const processFrame = () => {
-				if (!mediaRecorder || !$showCallOverlay) {
-					return;
-				}
-
-				analyser.getByteTimeDomainData(timeDomainData);
-				analyser.getByteFrequencyData(domainData);
-
-				// Calculate RMS level from time domain data
-				rmsLevel = calculateRMS(timeDomainData);
-
-				// Check if initial speech/noise has started
-				const hasSound = domainData.some((value) => value > 0);
-				if (hasSound) {
-					hasStartedSpeaking = true;
-					lastSoundTime = Date.now();
-
-					// BIG RED TEXT
-					console.log('%c%s', 'color: red; font-size: 20px;', '🔊 Sound detected');
-
-					stopAllAudio();
-				}
-
-				// Start silence detection only after initial speech/noise has been detected
-				if (hasStartedSpeaking) {
-					if (Date.now() - lastSoundTime > 2000) {
-						confirmed = true;
-
-						if (mediaRecorder) {
-							mediaRecorder.stop();
-						}
-					}
-				}
-
-				window.requestAnimationFrame(processFrame);
-			};
-
-			window.requestAnimationFrame(processFrame);
-		};
-
-		detectSound();
-	};
-
 	const transcribeHandler = async (audioBlob) => {
 		// Create a blob from the audio chunks
@@ -260,174 +169,6 @@
 		}
 	};

-	const stopAllAudio = async () => {
-		interrupted = true;
-
-		if (chatStreaming) {
-			stopResponse();
-		}
-
-		if (currentUtterance) {
-			speechSynthesis.cancel();
-			currentUtterance = null;
-		}
-
-		await tick();
-		emojiQueue = [];
-		audioQueue = [];
-		await tick();
-
-		const audioElement = document.getElementById('audioElement');
-		if (audioElement) {
-			audioElement.pause();
-			audioElement.currentTime = 0;
-		}
-
-		assistantSpeaking = false;
-	};
-
-	const speakSpeechSynthesisHandler = (content) => {
-		if ($showCallOverlay) {
-			return new Promise((resolve) => {
-				let voices = [];
-				const getVoicesLoop = setInterval(async () => {
-					voices = await speechSynthesis.getVoices();
-					if (voices.length > 0) {
-						clearInterval(getVoicesLoop);
-
-						const voice =
-							voices
-								?.filter(
-									(v) => v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
-								)
-								?.at(0) ?? undefined;
-
-						currentUtterance = new SpeechSynthesisUtterance(content);
-
-						if (voice) {
-							currentUtterance.voice = voice;
-						}
-
-						speechSynthesis.speak(currentUtterance);
-
-						currentUtterance.onend = async (e) => {
-							await new Promise((r) => setTimeout(r, 100));
-							resolve(e);
-						};
-					}
-				}, 100);
-			});
-		} else {
-			return Promise.resolve();
-		}
-	};
-
-	const playAudio = (audio) => {
-		if ($showCallOverlay) {
-			return new Promise((resolve) => {
-				const audioElement = document.getElementById('audioElement');
-
-				if (audioElement) {
-					audioElement.src = audio.src;
-					audioElement.muted = true;
-
-					audioElement
-						.play()
-						.then(() => {
-							audioElement.muted = false;
-						})
-						.catch((error) => {
-							console.error(error);
-						});
-
-					audioElement.onended = async (e) => {
-						await new Promise((r) => setTimeout(r, 100));
-						resolve(e);
-					};
-				}
-			});
-		} else {
-			return Promise.resolve();
-		}
-	};
-
-	const playAudioHandler = async () => {
-		console.log('playAudioHandler', audioQueue, assistantSpeaking, audioQueue.length > 0);
-		if (!assistantSpeaking && !interrupted && audioQueue.length > 0) {
-			assistantSpeaking = true;
-
-			if ($settings?.showEmojiInCall ?? false) {
-				if (emojiQueue.length > 0) {
-					emoji = emojiQueue.shift();
-					emojiQueue = emojiQueue;
-				}
-			}
-
-			const audioToPlay = audioQueue.shift(); // Shift the audio out from queue before playing.
-			audioQueue = audioQueue;
-			await playAudio(audioToPlay);
-			assistantSpeaking = false;
-		}
-	};
-
-	const setContentAudio = async (content, idx) => {
-		if (assistantSentenceAudios[idx] === undefined) {
-			// Wait for the previous audio to be loaded
-			if (idx > 0) {
-				await new Promise((resolve) => {
-					const check = setInterval(() => {
-						if (
-							assistantSentenceAudios[idx - 1] !== undefined &&
-							assistantSentenceAudios[idx - 1] !== null
-						) {
-							clearInterval(check);
-							resolve();
-						}
-					}, 100);
-				});
-			}
-
-			assistantSentenceAudios[idx] = null;
-
-			if ($settings?.showEmojiInCall ?? false) {
-				const sentenceEmoji = await generateEmoji(localStorage.token, modelId, content);
-
-				if (sentenceEmoji) {
-					// Big red text with content and emoji
-					console.log('%c%s', 'color: blue; font-size: 10px;', `${sentenceEmoji}: ${content}`);
-
-					if (/\p{Extended_Pictographic}/u.test(sentenceEmoji)) {
-						emojiQueue.push(sentenceEmoji.match(/\p{Extended_Pictographic}/gu)[0]);
-						emojiQueue = emojiQueue;
-					}
-				}
-
-				await tick();
-			}
-
-			const res = await synthesizeOpenAISpeech(
-				localStorage.token,
-				$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
-				content
-			).catch((error) => {
-				toast.error(error);
-
-				assistantSpeaking = false;
-				return null;
-			});
-
-			if (res) {
-				const blob = await res.blob();
-				const blobUrl = URL.createObjectURL(blob);
-				const audio = new Audio(blobUrl);
-				assistantSentenceAudios[idx] = audio;
-				console.log('%c%s', 'color: red; font-size: 20px;', content);
-				audioQueue.push(audio);
-				audioQueue = audioQueue;
-			}
-		}
-	};
-
 	const stopRecordingCallback = async (_continue = true) => {
 		if ($showCallOverlay) {
 			console.log('%c%s', 'color: red; font-size: 20px;', '🚨 stopRecordingCallback 🚨');
@@ -489,107 +230,315 @@
 		mediaRecorder.start();
 	};

-	const resetAssistantMessage = async () => {
-		interrupted = false;
-
-		assistantMessage = '';
-		assistantSentenceIdx = -1;
-		assistantSentenceAudios = {}; // Reset audio tracking
-		audioQueue = []; // Clear the audio queue
-		audioQueue = audioQueue;
-
-		emoji = null;
-		emojiQueue = [];
-		emojiQueue = emojiQueue;
-	};
-
-	$: (async () => {
-		if ($showCallOverlay) {
-			await resetAssistantMessage();
-			await tick();
-			startRecording();
-		} else {
-			stopCamera();
-			stopAllAudio();
-			stopRecordingCallback(false);
-		}
-	})();
-
-	$: {
-		if (audioQueue.length > 0 && !assistantSpeaking) {
-			playAudioHandler();
-		}
-	}
-
-	onMount(() => {
-		eventTarget.addEventListener('chat:start', async (e) => {
-			if ($showCallOverlay) {
-				console.log('Chat start event:', e);
-				await resetAssistantMessage();
-				await tick();
-				chatStreaming = true;
-			}
-		});
-
-		eventTarget.addEventListener('chat', async (e) => {
-			if ($showCallOverlay) {
-				const { content } = e.detail;
-				assistantMessage += content;
-				await tick();
-
-				if (!interrupted) {
-					if ($config.audio.tts.engine !== '') {
-						assistantSentenceIdx = assistantSentences.length - 2;
-
-						if (assistantSentenceIdx >= 0 && !assistantSentenceAudios[assistantSentenceIdx]) {
-							await tick();
-							setContentAudio(assistantSentences[assistantSentenceIdx], assistantSentenceIdx);
-						}
-					}
-				}
-
-				chatStreaming = true;
-			}
-		});
-
-		eventTarget.addEventListener('chat:finish', async (e) => {
-			if ($showCallOverlay) {
-				chatStreaming = false;
-				loading = false;
-
-				console.log('Chat finish event:', e);
-				await tick();
-
-				if (!interrupted) {
-					if ($config.audio.tts.engine !== '') {
-						for (const [idx, sentence] of assistantSentences.entries()) {
-							if (!assistantSentenceAudios[idx]) {
-								await tick();
-								setContentAudio(sentence, idx);
-							}
-						}
-					} else {
-						if ($settings?.showEmojiInCall ?? false) {
-							const res = await generateEmoji(localStorage.token, modelId, assistantMessage);
-
-							if (res) {
-								console.log(res);
-
-								if (/\p{Extended_Pictographic}/u.test(res)) {
-									emoji = res.match(/\p{Extended_Pictographic}/gu)[0];
-								}
-							}
-						}
-
-						speakSpeechSynthesisHandler(assistantMessage);
-					}
-				}
-			}
-		});
-	});
+	// Function to calculate the RMS level from time domain data
+	const calculateRMS = (data: Uint8Array) => {
+		let sumSquares = 0;
+
+		for (let i = 0; i < data.length; i++) {
+			const normalizedValue = (data[i] - 128) / 128; // Normalize the data
+			sumSquares += normalizedValue * normalizedValue;
+		}
+		return Math.sqrt(sumSquares / data.length);
+	};
+
+	const analyseAudio = (stream) => {
+		const audioContext = new AudioContext();
+		const audioStreamSource = audioContext.createMediaStreamSource(stream);
+
+		const analyser = audioContext.createAnalyser();
+		analyser.minDecibels = MIN_DECIBELS;
+		audioStreamSource.connect(analyser);
+
+		const bufferLength = analyser.frequencyBinCount;
+
+		const domainData = new Uint8Array(bufferLength);
+		const timeDomainData = new Uint8Array(analyser.fftSize);
+
+		let lastSoundTime = Date.now();
+
+		hasStartedSpeaking = false;
+
+		const detectSound = () => {
+			const processFrame = () => {
+				if (!mediaRecorder || !$showCallOverlay) {
+					return;
+				}
+
+				analyser.getByteTimeDomainData(timeDomainData);
+				analyser.getByteFrequencyData(domainData);
+
+				// Calculate RMS level from time domain data
+				rmsLevel = calculateRMS(timeDomainData);
+
+				// Check if initial speech/noise has started
+				const hasSound = domainData.some((value) => value > 0);
+				if (hasSound) {
+					// BIG RED TEXT
+					console.log('%c%s', 'color: red; font-size: 20px;', '🔊 Sound detected');
+
+					if (!hasStartedSpeaking) {
+						hasStartedSpeaking = true;
+						stopAllAudio();
+					}
+
+					lastSoundTime = Date.now();
+				}
+
+				// Start silence detection only after initial speech/noise has been detected
+				if (hasStartedSpeaking) {
+					if (Date.now() - lastSoundTime > 2000) {
+						confirmed = true;
+
+						if (mediaRecorder) {
+							mediaRecorder.stop();
+						}
+					}
+				}
+
+				window.requestAnimationFrame(processFrame);
+			};
+
+			window.requestAnimationFrame(processFrame);
+		};
+
+		detectSound();
+	};
+
+	let finishedMessages = {};
+	let currentMessageId = null;
+	let currentUtterance = null;
+
+	const speakSpeechSynthesisHandler = (content) => {
+		if ($showCallOverlay) {
+			return new Promise((resolve) => {
+				let voices = [];
+				const getVoicesLoop = setInterval(async () => {
+					voices = await speechSynthesis.getVoices();
+					if (voices.length > 0) {
+						clearInterval(getVoicesLoop);
+
+						const voice =
+							voices
+								?.filter(
+									(v) => v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
+								)
+								?.at(0) ?? undefined;
+
+						currentUtterance = new SpeechSynthesisUtterance(content);
+
+						if (voice) {
+							currentUtterance.voice = voice;
+						}
+
+						speechSynthesis.speak(currentUtterance);
+
+						currentUtterance.onend = async (e) => {
+							await new Promise((r) => setTimeout(r, 100));
+							resolve(e);
+						};
+					}
+				}, 100);
+			});
+		} else {
+			return Promise.resolve();
+		}
+	};
+
+	const playAudio = (audio) => {
+		if ($showCallOverlay) {
+			return new Promise((resolve) => {
+				const audioElement = document.getElementById('audioElement');
+
+				if (audioElement) {
+					audioElement.src = audio.src;
+					audioElement.muted = true;
+
+					audioElement
+						.play()
+						.then(() => {
+							audioElement.muted = false;
+						})
+						.catch((error) => {
+							console.error(error);
+						});
+
+					audioElement.onended = async (e) => {
+						await new Promise((r) => setTimeout(r, 100));
+						resolve(e);
+					};
+				}
+			});
+		} else {
+			return Promise.resolve();
+		}
+	};
+
+	const stopAllAudio = async () => {
+		interrupted = true;
+
+		if (chatStreaming) {
+			stopResponse();
+		}
+
+		if (currentUtterance) {
+			speechSynthesis.cancel();
+			currentUtterance = null;
+		}
+
+		const audioElement = document.getElementById('audioElement');
+		if (audioElement) {
+			audioElement.pause();
+			audioElement.currentTime = 0;
+		}
+	};
+
+	let audioAbortController = new AbortController();
+
+	// Audio cache map where key is the content and value is the Audio object.
+	const audioCache = new Map();
+	const fetchAudio = async (content) => {
+		if (!audioCache.has(content)) {
+			try {
+				const res = await synthesizeOpenAISpeech(
+					localStorage.token,
+					$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
+					content
+				).catch((error) => {
+					console.error(error);
+					return null;
+				});
+
+				if (res) {
+					const blob = await res.blob();
+					const blobUrl = URL.createObjectURL(blob);
+					audioCache.set(content, new Audio(blobUrl));
+				}
+			} catch (error) {
+				console.error('Error synthesizing speech:', error);
+			}
+		}
+
+		return audioCache.get(content);
+	};
+
+	let messages = {};
+
+	const monitorAndPlayAudio = async (id, signal) => {
+		while (!signal.aborted) {
+			if (messages[id] && messages[id].length > 0) {
+				// Retrieve the next content string from the queue
+				const content = messages[id].shift(); // Dequeues the content for playing
+				if (audioCache.has(content)) {
+					// If content is available in the cache, play it
+					try {
+						console.log(
+							'%c%s',
+							'color: red; font-size: 20px;',
+							`Playing audio for content: ${content}`
+						);
+
+						const audio = audioCache.get(content);
+						await playAudio(audio); // Here ensure that playAudio is indeed correct method to execute
+						console.log(`Played audio for content: ${content}`);
+						await new Promise((resolve) => setTimeout(resolve, 200)); // Wait before retrying to reduce tight loop
+					} catch (error) {
+						console.error('Error playing audio:', error);
+					}
+				} else {
+					// If not available in the cache, push it back to the queue and delay
+					messages[id].unshift(content); // Re-queue the content at the start
+					console.log(`Audio for "${content}" not yet available in the cache, re-queued...`);
+					await new Promise((resolve) => setTimeout(resolve, 200)); // Wait before retrying to reduce tight loop
+				}
+			} else if (finishedMessages[id] && messages[id] && messages[id].length === 0) {
+				// If the message is finished and there are no more messages to process, break the loop
+				break;
+			} else {
+				// No messages to process, sleep for a bit
+				await new Promise((resolve) => setTimeout(resolve, 200));
+			}
+		}
+		console.log(`Audio monitoring and playing stopped for message ID ${id}`);
+	};
+
+	onMount(async () => {
+		startRecording();
+
+		const chatStartHandler = async (e) => {
+			const { id } = e.detail;
+
+			chatStreaming = true;
+
+			if ($config.audio.tts.engine !== '') {
+				// set currentMessageId to id
+				if (currentMessageId !== id) {
+					console.log(`Received chat start event for message ID ${id}`);
+
+					currentMessageId = id;
+					if (audioAbortController) {
+						audioAbortController.abort();
+					}
+					audioAbortController = new AbortController();
+
+					// Start monitoring and playing audio for the message ID
+					monitorAndPlayAudio(id, audioAbortController.signal);
+				}
+			}
+		};
+
+		const chatEventHandler = async (e) => {
+			const { id, content } = e.detail;
+			// "id" here is message id
+			// if "id" is not the same as "currentMessageId" then do not process
+			// "content" here is a sentence from the assistant,
+			// there will be many sentences for the same "id"
+
+			if ($config.audio.tts.engine !== '') {
+				if (currentMessageId === id) {
+					console.log(`Received chat event for message ID ${id}: ${content}`);
+
+					try {
+						if (messages[id] === undefined) {
+							messages[id] = [content];
+						} else {
+							messages[id].push(content);
+						}
+
+						console.log(content);
+						fetchAudio(content);
+					} catch (error) {
+						console.error('Failed to fetch or play audio:', error);
+					}
+				}
+			}
+		};
+
+		const chatFinishHandler = async (e) => {
+			const { id, content } = e.detail;
+			// "content" here is the entire message from the assistant
+
+			chatStreaming = false;
+
+			if ($config.audio.tts.engine !== '') {
+				finishedMessages[id] = true;
+			} else {
+				speakSpeechSynthesisHandler(content);
+			}
+		};
+
+		eventTarget.addEventListener('chat:start', chatStartHandler);
+		eventTarget.addEventListener('chat', chatEventHandler);
+		eventTarget.addEventListener('chat:finish', chatFinishHandler);
+
+		return async () => {
+			eventTarget.removeEventListener('chat:start', chatStartHandler);
+			eventTarget.removeEventListener('chat', chatEventHandler);
+			eventTarget.removeEventListener('chat:finish', chatFinishHandler);
+
+			await stopRecordingCallback(false);
+			await stopCamera();
+		};
+	});
 </script>

-<audio id="audioElement" src="" style="display: none;" />
-
 {#if $showCallOverlay}
 	<div class=" absolute w-full h-screen max-h-[100dvh] flex z-[999] overflow-hidden">
 		<div

View File

@@ -443,6 +443,24 @@ export const extractSentences = (text) => {
 		.filter((sentence) => sentence !== '');
 };

+export const extractSentencesForAudio = (text) => {
+	return extractSentences(text).reduce((mergedTexts, currentText) => {
+		const lastIndex = mergedTexts.length - 1;
+		if (lastIndex >= 0) {
+			const previousText = mergedTexts[lastIndex];
+			const wordCount = previousText.split(/\s+/).length;
+			if (wordCount < 2) {
+				mergedTexts[lastIndex] = previousText + ' ' + currentText;
+			} else {
+				mergedTexts.push(currentText);
+			}
+		} else {
+			mergedTexts.push(currentText);
+		}
+		return mergedTexts;
+	}, []);
+};
+
 export const blobToFile = (blob, fileName) => {
 	// Create a new File object from the Blob
 	const file = new File([blob], fileName, { type: blob.type });
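
The extractSentencesForAudio helper added above merges sentences so that no fragment shorter than two words is synthesized on its own: when the previous chunk has fewer than two words, the next sentence is appended to it. A rough usage sketch, assuming extractSentences splits on sentence-ending punctuation (its full body is not shown in this diff):

import { extractSentencesForAudio } from '$lib/utils';

// Partially streamed assistant text:
const streamed = 'Sure. Here is a quick summary of the results. More to come';

// Likely yields something like:
// ['Sure. Here is a quick summary of the results.', 'More to come']
// because the one-word sentence 'Sure.' absorbs the sentence that follows it.
const sentences = extractSentencesForAudio(streamed);

// The chat handlers above pop the trailing (possibly unfinished) sentence
// before dispatching, so only completed sentences reach the TTS queue.
sentences.pop();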