refac: voice call

This commit is contained in:
Timothy J. Baek 2024-06-13 20:15:23 -07:00
parent 7ea572fdca
commit 7f70de99d3
3 changed files with 407 additions and 358 deletions

View File

@ -30,6 +30,7 @@
import {
convertMessagesToHistory,
copyToClipboard,
extractSentencesForAudio,
promptTemplate,
splitStream
} from '$lib/utils';
@ -593,7 +594,15 @@
array.findIndex((i) => JSON.stringify(i) === JSON.stringify(item)) === index
);
eventTarget.dispatchEvent(new CustomEvent('chat:start'));
eventTarget.dispatchEvent(
new CustomEvent('chat:start', {
detail: {
id: responseMessageId
}
})
);
await tick();
const [res, controller] = await generateChatCompletion(localStorage.token, {
model: model.id,
@ -664,9 +673,23 @@
continue;
} else {
responseMessage.content += data.message.content;
eventTarget.dispatchEvent(
new CustomEvent('chat', { detail: { content: data.message.content } })
);
const sentences = extractSentencesForAudio(responseMessage.content);
sentences.pop();
// dispatch only last sentence and make sure it hasn't been dispatched before
if (
sentences.length > 0 &&
sentences[sentences.length - 1] !== responseMessage.lastSentence
) {
responseMessage.lastSentence = sentences[sentences.length - 1];
eventTarget.dispatchEvent(
new CustomEvent('chat', {
detail: { id: responseMessageId, content: sentences[sentences.length - 1] }
})
);
}
messages = messages;
}
} else {
@ -760,7 +783,23 @@
stopResponseFlag = false;
await tick();
eventTarget.dispatchEvent(new CustomEvent('chat:finish'));
let lastSentence = extractSentencesForAudio(responseMessage.content)?.at(-1) ?? '';
if (lastSentence) {
eventTarget.dispatchEvent(
new CustomEvent('chat', {
detail: { id: responseMessageId, content: lastSentence }
})
);
}
eventTarget.dispatchEvent(
new CustomEvent('chat:finish', {
detail: {
id: responseMessageId,
content: responseMessage.content
}
})
);
if (autoScroll) {
scrollToBottom();
@ -802,7 +841,14 @@
scrollToBottom();
eventTarget.dispatchEvent(new CustomEvent('chat:start'));
eventTarget.dispatchEvent(
new CustomEvent('chat:start', {
detail: {
id: responseMessageId
}
})
);
await tick();
try {
const [res, controller] = await generateOpenAIChatCompletion(
@ -924,7 +970,23 @@
continue;
} else {
responseMessage.content += value;
eventTarget.dispatchEvent(new CustomEvent('chat', { detail: { content: value } }));
const sentences = extractSentencesForAudio(responseMessage.content);
sentences.pop();
// dispatch only last sentence and make sure it hasn't been dispatched before
if (
sentences.length > 0 &&
sentences[sentences.length - 1] !== responseMessage.lastSentence
) {
responseMessage.lastSentence = sentences[sentences.length - 1];
eventTarget.dispatchEvent(
new CustomEvent('chat', {
detail: { id: responseMessageId, content: sentences[sentences.length - 1] }
})
);
}
messages = messages;
}
@ -975,7 +1037,23 @@
stopResponseFlag = false;
await tick();
eventTarget.dispatchEvent(new CustomEvent('chat:finish'));
let lastSentence = extractSentencesForAudio(responseMessage.content)?.at(-1) ?? '';
if (lastSentence) {
eventTarget.dispatchEvent(
new CustomEvent('chat', {
detail: { id: responseMessageId, content: lastSentence }
})
);
}
eventTarget.dispatchEvent(
new CustomEvent('chat:finish', {
detail: {
id: responseMessageId,
content: responseMessage.content
}
})
);
if (autoScroll) {
scrollToBottom();
@ -1207,14 +1285,18 @@
</title>
</svelte:head>
<CallOverlay
{submitPrompt}
{stopResponse}
bind:files
modelId={selectedModelIds?.at(0) ?? null}
chatId={$chatId}
{eventTarget}
/>
<audio id="audioElement" src="" style="display: none;" />
{#if $showCallOverlay}
<CallOverlay
{submitPrompt}
{stopResponse}
bind:files
modelId={selectedModelIds?.at(0) ?? null}
chatId={$chatId}
{eventTarget}
/>
{/if}
{#if !chatIdProp || (loaded && chatIdProp)}
<div

View File

@ -2,7 +2,12 @@
import { config, settings, showCallOverlay } from '$lib/stores';
import { onMount, tick, getContext } from 'svelte';
import { blobToFile, calculateSHA256, extractSentences, findWordIndices } from '$lib/utils';
import {
blobToFile,
calculateSHA256,
extractSentencesForAudio,
findWordIndices
} from '$lib/utils';
import { generateEmoji } from '$lib/apis';
import { synthesizeOpenAISpeech, transcribeAudio } from '$lib/apis/audio';
@ -32,34 +37,7 @@
let camera = false;
let cameraStream = null;
let assistantSpeaking = false;
let chatStreaming = false;
let assistantMessage = '';
let assistantSentences = [];
let assistantSentenceAudios = {};
let assistantSentenceIdx = -1;
let audioQueue = [];
let emojiQueue = [];
$: assistantSentences = extractSentences(assistantMessage).reduce((mergedTexts, currentText) => {
const lastIndex = mergedTexts.length - 1;
if (lastIndex >= 0) {
const previousText = mergedTexts[lastIndex];
const wordCount = previousText.split(/\s+/).length;
if (wordCount < 2) {
mergedTexts[lastIndex] = previousText + ' ' + currentText;
} else {
mergedTexts.push(currentText);
}
} else {
mergedTexts.push(currentText);
}
return mergedTexts;
}, []);
let currentUtterance = null;
let rmsLevel = 0;
let hasStartedSpeaking = false;
@ -170,75 +148,6 @@
const MIN_DECIBELS = -45;
const VISUALIZER_BUFFER_LENGTH = 300;
// Function to calculate the RMS level from time domain data
const calculateRMS = (data: Uint8Array) => {
let sumSquares = 0;
for (let i = 0; i < data.length; i++) {
const normalizedValue = (data[i] - 128) / 128; // Normalize the data
sumSquares += normalizedValue * normalizedValue;
}
return Math.sqrt(sumSquares / data.length);
};
const analyseAudio = (stream) => {
const audioContext = new AudioContext();
const audioStreamSource = audioContext.createMediaStreamSource(stream);
const analyser = audioContext.createAnalyser();
analyser.minDecibels = MIN_DECIBELS;
audioStreamSource.connect(analyser);
const bufferLength = analyser.frequencyBinCount;
const domainData = new Uint8Array(bufferLength);
const timeDomainData = new Uint8Array(analyser.fftSize);
let lastSoundTime = Date.now();
hasStartedSpeaking = false;
const detectSound = () => {
const processFrame = () => {
if (!mediaRecorder || !$showCallOverlay) {
return;
}
analyser.getByteTimeDomainData(timeDomainData);
analyser.getByteFrequencyData(domainData);
// Calculate RMS level from time domain data
rmsLevel = calculateRMS(timeDomainData);
// Check if initial speech/noise has started
const hasSound = domainData.some((value) => value > 0);
if (hasSound) {
hasStartedSpeaking = true;
lastSoundTime = Date.now();
// BIG RED TEXT
console.log('%c%s', 'color: red; font-size: 20px;', '🔊 Sound detected');
stopAllAudio();
}
// Start silence detection only after initial speech/noise has been detected
if (hasStartedSpeaking) {
if (Date.now() - lastSoundTime > 2000) {
confirmed = true;
if (mediaRecorder) {
mediaRecorder.stop();
}
}
}
window.requestAnimationFrame(processFrame);
};
window.requestAnimationFrame(processFrame);
};
detectSound();
};
const transcribeHandler = async (audioBlob) => {
// Create a blob from the audio chunks
@ -260,174 +169,6 @@
}
};
const stopAllAudio = async () => {
interrupted = true;
if (chatStreaming) {
stopResponse();
}
if (currentUtterance) {
speechSynthesis.cancel();
currentUtterance = null;
}
await tick();
emojiQueue = [];
audioQueue = [];
await tick();
const audioElement = document.getElementById('audioElement');
if (audioElement) {
audioElement.pause();
audioElement.currentTime = 0;
}
assistantSpeaking = false;
};
const speakSpeechSynthesisHandler = (content) => {
if ($showCallOverlay) {
return new Promise((resolve) => {
let voices = [];
const getVoicesLoop = setInterval(async () => {
voices = await speechSynthesis.getVoices();
if (voices.length > 0) {
clearInterval(getVoicesLoop);
const voice =
voices
?.filter(
(v) => v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
)
?.at(0) ?? undefined;
currentUtterance = new SpeechSynthesisUtterance(content);
if (voice) {
currentUtterance.voice = voice;
}
speechSynthesis.speak(currentUtterance);
currentUtterance.onend = async (e) => {
await new Promise((r) => setTimeout(r, 100));
resolve(e);
};
}
}, 100);
});
} else {
return Promise.resolve();
}
};
const playAudio = (audio) => {
if ($showCallOverlay) {
return new Promise((resolve) => {
const audioElement = document.getElementById('audioElement');
if (audioElement) {
audioElement.src = audio.src;
audioElement.muted = true;
audioElement
.play()
.then(() => {
audioElement.muted = false;
})
.catch((error) => {
console.error(error);
});
audioElement.onended = async (e) => {
await new Promise((r) => setTimeout(r, 100));
resolve(e);
};
}
});
} else {
return Promise.resolve();
}
};
const playAudioHandler = async () => {
console.log('playAudioHandler', audioQueue, assistantSpeaking, audioQueue.length > 0);
if (!assistantSpeaking && !interrupted && audioQueue.length > 0) {
assistantSpeaking = true;
if ($settings?.showEmojiInCall ?? false) {
if (emojiQueue.length > 0) {
emoji = emojiQueue.shift();
emojiQueue = emojiQueue;
}
}
const audioToPlay = audioQueue.shift(); // Shift the audio out from queue before playing.
audioQueue = audioQueue;
await playAudio(audioToPlay);
assistantSpeaking = false;
}
};
const setContentAudio = async (content, idx) => {
if (assistantSentenceAudios[idx] === undefined) {
// Wait for the previous audio to be loaded
if (idx > 0) {
await new Promise((resolve) => {
const check = setInterval(() => {
if (
assistantSentenceAudios[idx - 1] !== undefined &&
assistantSentenceAudios[idx - 1] !== null
) {
clearInterval(check);
resolve();
}
}, 100);
});
}
assistantSentenceAudios[idx] = null;
if ($settings?.showEmojiInCall ?? false) {
const sentenceEmoji = await generateEmoji(localStorage.token, modelId, content);
if (sentenceEmoji) {
// Big red text with content and emoji
console.log('%c%s', 'color: blue; font-size: 10px;', `${sentenceEmoji}: ${content}`);
if (/\p{Extended_Pictographic}/u.test(sentenceEmoji)) {
emojiQueue.push(sentenceEmoji.match(/\p{Extended_Pictographic}/gu)[0]);
emojiQueue = emojiQueue;
}
}
await tick();
}
const res = await synthesizeOpenAISpeech(
localStorage.token,
$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
content
).catch((error) => {
toast.error(error);
assistantSpeaking = false;
return null;
});
if (res) {
const blob = await res.blob();
const blobUrl = URL.createObjectURL(blob);
const audio = new Audio(blobUrl);
assistantSentenceAudios[idx] = audio;
console.log('%c%s', 'color: red; font-size: 20px;', content);
audioQueue.push(audio);
audioQueue = audioQueue;
}
}
};
const stopRecordingCallback = async (_continue = true) => {
if ($showCallOverlay) {
console.log('%c%s', 'color: red; font-size: 20px;', '🚨 stopRecordingCallback 🚨');
@ -489,107 +230,315 @@
mediaRecorder.start();
};
const resetAssistantMessage = async () => {
interrupted = false;
assistantMessage = '';
assistantSentenceIdx = -1;
assistantSentenceAudios = {}; // Reset audio tracking
audioQueue = []; // Clear the audio queue
audioQueue = audioQueue;
emoji = null;
emojiQueue = [];
emojiQueue = emojiQueue;
// Function to calculate the RMS level from time domain data
const calculateRMS = (data: Uint8Array) => {
let sumSquares = 0;
for (let i = 0; i < data.length; i++) {
const normalizedValue = (data[i] - 128) / 128; // Normalize the data
sumSquares += normalizedValue * normalizedValue;
}
return Math.sqrt(sumSquares / data.length);
};
$: (async () => {
const analyseAudio = (stream) => {
const audioContext = new AudioContext();
const audioStreamSource = audioContext.createMediaStreamSource(stream);
const analyser = audioContext.createAnalyser();
analyser.minDecibels = MIN_DECIBELS;
audioStreamSource.connect(analyser);
const bufferLength = analyser.frequencyBinCount;
const domainData = new Uint8Array(bufferLength);
const timeDomainData = new Uint8Array(analyser.fftSize);
let lastSoundTime = Date.now();
hasStartedSpeaking = false;
const detectSound = () => {
const processFrame = () => {
if (!mediaRecorder || !$showCallOverlay) {
return;
}
analyser.getByteTimeDomainData(timeDomainData);
analyser.getByteFrequencyData(domainData);
// Calculate RMS level from time domain data
rmsLevel = calculateRMS(timeDomainData);
// Check if initial speech/noise has started
const hasSound = domainData.some((value) => value > 0);
if (hasSound) {
// BIG RED TEXT
console.log('%c%s', 'color: red; font-size: 20px;', '🔊 Sound detected');
if (!hasStartedSpeaking) {
hasStartedSpeaking = true;
stopAllAudio();
}
lastSoundTime = Date.now();
}
// Start silence detection only after initial speech/noise has been detected
if (hasStartedSpeaking) {
if (Date.now() - lastSoundTime > 2000) {
confirmed = true;
if (mediaRecorder) {
mediaRecorder.stop();
}
}
}
window.requestAnimationFrame(processFrame);
};
window.requestAnimationFrame(processFrame);
};
detectSound();
};
let finishedMessages = {};
let currentMessageId = null;
let currentUtterance = null;
const speakSpeechSynthesisHandler = (content) => {
if ($showCallOverlay) {
await resetAssistantMessage();
await tick();
startRecording();
return new Promise((resolve) => {
let voices = [];
const getVoicesLoop = setInterval(async () => {
voices = await speechSynthesis.getVoices();
if (voices.length > 0) {
clearInterval(getVoicesLoop);
const voice =
voices
?.filter(
(v) => v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
)
?.at(0) ?? undefined;
currentUtterance = new SpeechSynthesisUtterance(content);
if (voice) {
currentUtterance.voice = voice;
}
speechSynthesis.speak(currentUtterance);
currentUtterance.onend = async (e) => {
await new Promise((r) => setTimeout(r, 100));
resolve(e);
};
}
}, 100);
});
} else {
stopCamera();
stopAllAudio();
stopRecordingCallback(false);
return Promise.resolve();
}
})();
};
$: {
if (audioQueue.length > 0 && !assistantSpeaking) {
playAudioHandler();
}
}
const playAudio = (audio) => {
if ($showCallOverlay) {
return new Promise((resolve) => {
const audioElement = document.getElementById('audioElement');
onMount(() => {
eventTarget.addEventListener('chat:start', async (e) => {
if ($showCallOverlay) {
console.log('Chat start event:', e);
await resetAssistantMessage();
await tick();
chatStreaming = true;
}
});
if (audioElement) {
audioElement.src = audio.src;
audioElement.muted = true;
eventTarget.addEventListener('chat', async (e) => {
if ($showCallOverlay) {
const { content } = e.detail;
assistantMessage += content;
await tick();
audioElement
.play()
.then(() => {
audioElement.muted = false;
})
.catch((error) => {
console.error(error);
});
if (!interrupted) {
if ($config.audio.tts.engine !== '') {
assistantSentenceIdx = assistantSentences.length - 2;
if (assistantSentenceIdx >= 0 && !assistantSentenceAudios[assistantSentenceIdx]) {
await tick();
setContentAudio(assistantSentences[assistantSentenceIdx], assistantSentenceIdx);
}
}
audioElement.onended = async (e) => {
await new Promise((r) => setTimeout(r, 100));
resolve(e);
};
}
});
} else {
return Promise.resolve();
}
};
chatStreaming = true;
const stopAllAudio = async () => {
interrupted = true;
if (chatStreaming) {
stopResponse();
}
if (currentUtterance) {
speechSynthesis.cancel();
currentUtterance = null;
}
const audioElement = document.getElementById('audioElement');
if (audioElement) {
audioElement.pause();
audioElement.currentTime = 0;
}
};
let audioAbortController = new AbortController();
// Audio cache map where key is the content and value is the Audio object.
const audioCache = new Map();
const fetchAudio = async (content) => {
if (!audioCache.has(content)) {
try {
const res = await synthesizeOpenAISpeech(
localStorage.token,
$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
content
).catch((error) => {
console.error(error);
return null;
});
if (res) {
const blob = await res.blob();
const blobUrl = URL.createObjectURL(blob);
audioCache.set(content, new Audio(blobUrl));
}
} catch (error) {
console.error('Error synthesizing speech:', error);
}
});
}
return audioCache.get(content);
};
eventTarget.addEventListener('chat:finish', async (e) => {
if ($showCallOverlay) {
chatStreaming = false;
loading = false;
let messages = {};
console.log('Chat finish event:', e);
await tick();
const monitorAndPlayAudio = async (id, signal) => {
while (!signal.aborted) {
if (messages[id] && messages[id].length > 0) {
// Retrieve the next content string from the queue
const content = messages[id].shift(); // Dequeues the content for playing
if (!interrupted) {
if ($config.audio.tts.engine !== '') {
for (const [idx, sentence] of assistantSentences.entries()) {
if (!assistantSentenceAudios[idx]) {
await tick();
setContentAudio(sentence, idx);
}
}
} else {
if ($settings?.showEmojiInCall ?? false) {
const res = await generateEmoji(localStorage.token, modelId, assistantMessage);
if (audioCache.has(content)) {
// If content is available in the cache, play it
try {
console.log(
'%c%s',
'color: red; font-size: 20px;',
`Playing audio for content: ${content}`
);
if (res) {
console.log(res);
if (/\p{Extended_Pictographic}/u.test(res)) {
emoji = res.match(/\p{Extended_Pictographic}/gu)[0];
}
}
const audio = audioCache.get(content);
await playAudio(audio); // Here ensure that playAudio is indeed correct method to execute
console.log(`Played audio for content: ${content}`);
await new Promise((resolve) => setTimeout(resolve, 200)); // Wait before retrying to reduce tight loop
} catch (error) {
console.error('Error playing audio:', error);
}
} else {
// If not available in the cache, push it back to the queue and delay
messages[id].unshift(content); // Re-queue the content at the start
console.log(`Audio for "${content}" not yet available in the cache, re-queued...`);
await new Promise((resolve) => setTimeout(resolve, 200)); // Wait before retrying to reduce tight loop
}
} else if (finishedMessages[id] && messages[id] && messages[id].length === 0) {
// If the message is finished and there are no more messages to process, break the loop
break;
} else {
// No messages to process, sleep for a bit
await new Promise((resolve) => setTimeout(resolve, 200));
}
}
console.log(`Audio monitoring and playing stopped for message ID ${id}`);
};
onMount(async () => {
startRecording();
const chatStartHandler = async (e) => {
const { id } = e.detail;
chatStreaming = true;
if ($config.audio.tts.engine !== '') {
// set currentMessageId to id
if (currentMessageId !== id) {
console.log(`Received chat start event for message ID ${id}`);
currentMessageId = id;
if (audioAbortController) {
audioAbortController.abort();
}
audioAbortController = new AbortController();
// Start monitoring and playing audio for the message ID
monitorAndPlayAudio(id, audioAbortController.signal);
}
}
};
const chatEventHandler = async (e) => {
const { id, content } = e.detail;
// "id" here is message id
// if "id" is not the same as "currentMessageId" then do not process
// "content" here is a sentence from the assistant,
// there will be many sentences for the same "id"
if ($config.audio.tts.engine !== '') {
if (currentMessageId === id) {
console.log(`Received chat event for message ID ${id}: ${content}`);
try {
if (messages[id] === undefined) {
messages[id] = [content];
} else {
messages[id].push(content);
}
speakSpeechSynthesisHandler(assistantMessage);
console.log(content);
fetchAudio(content);
} catch (error) {
console.error('Failed to fetch or play audio:', error);
}
}
}
});
};
const chatFinishHandler = async (e) => {
const { id, content } = e.detail;
// "content" here is the entire message from the assistant
chatStreaming = false;
if ($config.audio.tts.engine !== '') {
finishedMessages[id] = true;
} else {
speakSpeechSynthesisHandler(content);
}
};
eventTarget.addEventListener('chat:start', chatStartHandler);
eventTarget.addEventListener('chat', chatEventHandler);
eventTarget.addEventListener('chat:finish', chatFinishHandler);
return async () => {
eventTarget.removeEventListener('chat:start', chatStartHandler);
eventTarget.removeEventListener('chat', chatEventHandler);
eventTarget.removeEventListener('chat:finish', chatFinishHandler);
await stopRecordingCallback(false);
await stopCamera();
};
});
</script>
<audio id="audioElement" src="" style="display: none;" />
{#if $showCallOverlay}
<div class=" absolute w-full h-screen max-h-[100dvh] flex z-[999] overflow-hidden">
<div

View File

@ -443,6 +443,24 @@ export const extractSentences = (text) => {
.filter((sentence) => sentence !== '');
};
export const extractSentencesForAudio = (text) => {
return extractSentences(text).reduce((mergedTexts, currentText) => {
const lastIndex = mergedTexts.length - 1;
if (lastIndex >= 0) {
const previousText = mergedTexts[lastIndex];
const wordCount = previousText.split(/\s+/).length;
if (wordCount < 2) {
mergedTexts[lastIndex] = previousText + ' ' + currentText;
} else {
mergedTexts.push(currentText);
}
} else {
mergedTexts.push(currentText);
}
return mergedTexts;
}, []);
};
export const blobToFile = (blob, fileName) => {
// Create a new File object from the Blob
const file = new File([blob], fileName, { type: blob.type });