refac: voice call

2025-04-26 01:00:20 +00:00 · 2024-06-13 20:15:23 -07:00 · 2024-06-13 20:15:23 -07:00 · 7f70de99d3
commit 7f70de99d3
parent 7ea572fdca
3 changed files with 407 additions and 358 deletions
--- a/src/lib/components/chat/Chat.svelte
+++ b/src/lib/components/chat/Chat.svelte
@ -30,6 +30,7 @@
 	import {
 		convertMessagesToHistory,
 		copyToClipboard,
+		extractSentencesForAudio,
 		promptTemplate,
 		splitStream
 	} from '$lib/utils';
@ -593,7 +594,15 @@
 				array.findIndex((i) => JSON.stringify(i) === JSON.stringify(item)) === index
 		);

-		eventTarget.dispatchEvent(new CustomEvent('chat:start'));
+		eventTarget.dispatchEvent(
+			new CustomEvent('chat:start', {
+				detail: {
+					id: responseMessageId
+				}
+			})
+		);
+
+		await tick();

 		const [res, controller] = await generateChatCompletion(localStorage.token, {
 			model: model.id,
@ -664,9 +673,23 @@
 									continue;
 								} else {
 									responseMessage.content += data.message.content;
-									eventTarget.dispatchEvent(
-										new CustomEvent('chat', { detail: { content: data.message.content } })
-									);
+
+									const sentences = extractSentencesForAudio(responseMessage.content);
+									sentences.pop();
+
+									// dispatch only last sentence and make sure it hasn't been dispatched before
+									if (
+										sentences.length > 0 &&
+										sentences[sentences.length - 1] !== responseMessage.lastSentence
+									) {
+										responseMessage.lastSentence = sentences[sentences.length - 1];
+										eventTarget.dispatchEvent(
+											new CustomEvent('chat', {
+												detail: { id: responseMessageId, content: sentences[sentences.length - 1] }
+											})
+										);
+									}
+
 									messages = messages;
 								}
 							} else {
@ -760,7 +783,23 @@

 		stopResponseFlag = false;
 		await tick();
-		eventTarget.dispatchEvent(new CustomEvent('chat:finish'));
+
+		let lastSentence = extractSentencesForAudio(responseMessage.content)?.at(-1) ?? '';
+		if (lastSentence) {
+			eventTarget.dispatchEvent(
+				new CustomEvent('chat', {
+					detail: { id: responseMessageId, content: lastSentence }
+				})
+			);
+		}
+		eventTarget.dispatchEvent(
+			new CustomEvent('chat:finish', {
+				detail: {
+					id: responseMessageId,
+					content: responseMessage.content
+				}
+			})
+		);

 		if (autoScroll) {
 			scrollToBottom();
@ -802,7 +841,14 @@

 		scrollToBottom();

-		eventTarget.dispatchEvent(new CustomEvent('chat:start'));
+		eventTarget.dispatchEvent(
+			new CustomEvent('chat:start', {
+				detail: {
+					id: responseMessageId
+				}
+			})
+		);
+		await tick();

 		try {
 			const [res, controller] = await generateOpenAIChatCompletion(
@ -924,7 +970,23 @@
 						continue;
 					} else {
 						responseMessage.content += value;
-						eventTarget.dispatchEvent(new CustomEvent('chat', { detail: { content: value } }));
+
+						const sentences = extractSentencesForAudio(responseMessage.content);
+						sentences.pop();
+
+						// dispatch only last sentence and make sure it hasn't been dispatched before
+						if (
+							sentences.length > 0 &&
+							sentences[sentences.length - 1] !== responseMessage.lastSentence
+						) {
+							responseMessage.lastSentence = sentences[sentences.length - 1];
+							eventTarget.dispatchEvent(
+								new CustomEvent('chat', {
+									detail: { id: responseMessageId, content: sentences[sentences.length - 1] }
+								})
+							);
+						}
+
 						messages = messages;
 					}

@ -975,7 +1037,23 @@
 		stopResponseFlag = false;
 		await tick();

-		eventTarget.dispatchEvent(new CustomEvent('chat:finish'));
+		let lastSentence = extractSentencesForAudio(responseMessage.content)?.at(-1) ?? '';
+		if (lastSentence) {
+			eventTarget.dispatchEvent(
+				new CustomEvent('chat', {
+					detail: { id: responseMessageId, content: lastSentence }
+				})
+			);
+		}
+
+		eventTarget.dispatchEvent(
+			new CustomEvent('chat:finish', {
+				detail: {
+					id: responseMessageId,
+					content: responseMessage.content
+				}
+			})
+		);

 		if (autoScroll) {
 			scrollToBottom();
@ -1207,14 +1285,18 @@
 	</title>
 </svelte:head>

-<CallOverlay
-	{submitPrompt}
-	{stopResponse}
-	bind:files
-	modelId={selectedModelIds?.at(0) ?? null}
-	chatId={$chatId}
-	{eventTarget}
-/>
+<audio id="audioElement" src="" style="display: none;" />
+
+{#if $showCallOverlay}
+	<CallOverlay
+		{submitPrompt}
+		{stopResponse}
+		bind:files
+		modelId={selectedModelIds?.at(0) ?? null}
+		chatId={$chatId}
+		{eventTarget}
+	/>
+{/if}

 {#if !chatIdProp || (loaded && chatIdProp)}
 	<div
--- a/src/lib/components/chat/MessageInput/CallOverlay.svelte
+++ b/src/lib/components/chat/MessageInput/CallOverlay.svelte
@ -2,7 +2,12 @@
 	import { config, settings, showCallOverlay } from '$lib/stores';
 	import { onMount, tick, getContext } from 'svelte';

-	import { blobToFile, calculateSHA256, extractSentences, findWordIndices } from '$lib/utils';
+	import {
+		blobToFile,
+		calculateSHA256,
+		extractSentencesForAudio,
+		findWordIndices
+	} from '$lib/utils';
 	import { generateEmoji } from '$lib/apis';
 	import { synthesizeOpenAISpeech, transcribeAudio } from '$lib/apis/audio';

@ -32,34 +37,7 @@
 	let camera = false;
 	let cameraStream = null;

-	let assistantSpeaking = false;
-
 	let chatStreaming = false;
-	let assistantMessage = '';
-	let assistantSentences = [];
-	let assistantSentenceAudios = {};
-	let assistantSentenceIdx = -1;
-
-	let audioQueue = [];
-	let emojiQueue = [];
-
-	$: assistantSentences = extractSentences(assistantMessage).reduce((mergedTexts, currentText) => {
-		const lastIndex = mergedTexts.length - 1;
-		if (lastIndex >= 0) {
-			const previousText = mergedTexts[lastIndex];
-			const wordCount = previousText.split(/\s+/).length;
-			if (wordCount < 2) {
-				mergedTexts[lastIndex] = previousText + ' ' + currentText;
-			} else {
-				mergedTexts.push(currentText);
-			}
-		} else {
-			mergedTexts.push(currentText);
-		}
-		return mergedTexts;
-	}, []);
-
-	let currentUtterance = null;

 	let rmsLevel = 0;
 	let hasStartedSpeaking = false;
@ -170,75 +148,6 @@
 	const MIN_DECIBELS = -45;
 	const VISUALIZER_BUFFER_LENGTH = 300;

-	// Function to calculate the RMS level from time domain data
-	const calculateRMS = (data: Uint8Array) => {
-		let sumSquares = 0;
-		for (let i = 0; i < data.length; i++) {
-			const normalizedValue = (data[i] - 128) / 128; // Normalize the data
-			sumSquares += normalizedValue * normalizedValue;
-		}
-		return Math.sqrt(sumSquares / data.length);
-	};
-
-	const analyseAudio = (stream) => {
-		const audioContext = new AudioContext();
-		const audioStreamSource = audioContext.createMediaStreamSource(stream);
-
-		const analyser = audioContext.createAnalyser();
-		analyser.minDecibels = MIN_DECIBELS;
-		audioStreamSource.connect(analyser);
-
-		const bufferLength = analyser.frequencyBinCount;
-
-		const domainData = new Uint8Array(bufferLength);
-		const timeDomainData = new Uint8Array(analyser.fftSize);
-
-		let lastSoundTime = Date.now();
-		hasStartedSpeaking = false;
-
-		const detectSound = () => {
-			const processFrame = () => {
-				if (!mediaRecorder || !$showCallOverlay) {
-					return;
-				}
-
-				analyser.getByteTimeDomainData(timeDomainData);
-				analyser.getByteFrequencyData(domainData);
-
-				// Calculate RMS level from time domain data
-				rmsLevel = calculateRMS(timeDomainData);
-
-				// Check if initial speech/noise has started
-				const hasSound = domainData.some((value) => value > 0);
-				if (hasSound) {
-					hasStartedSpeaking = true;
-					lastSoundTime = Date.now();
-
-					// BIG RED TEXT
-					console.log('%c%s', 'color: red; font-size: 20px;', '🔊 Sound detected');
-					stopAllAudio();
-				}
-
-				// Start silence detection only after initial speech/noise has been detected
-				if (hasStartedSpeaking) {
-					if (Date.now() - lastSoundTime > 2000) {
-						confirmed = true;
-
-						if (mediaRecorder) {
-							mediaRecorder.stop();
-						}
-					}
-				}
-
-				window.requestAnimationFrame(processFrame);
-			};
-
-			window.requestAnimationFrame(processFrame);
-		};
-
-		detectSound();
-	};
-
 	const transcribeHandler = async (audioBlob) => {
 		// Create a blob from the audio chunks

@ -260,174 +169,6 @@
 		}
 	};

-	const stopAllAudio = async () => {
-		interrupted = true;
-
-		if (chatStreaming) {
-			stopResponse();
-		}
-
-		if (currentUtterance) {
-			speechSynthesis.cancel();
-			currentUtterance = null;
-		}
-
-		await tick();
-		emojiQueue = [];
-		audioQueue = [];
-		await tick();
-
-		const audioElement = document.getElementById('audioElement');
-		if (audioElement) {
-			audioElement.pause();
-			audioElement.currentTime = 0;
-		}
-
-		assistantSpeaking = false;
-	};
-
-	const speakSpeechSynthesisHandler = (content) => {
-		if ($showCallOverlay) {
-			return new Promise((resolve) => {
-				let voices = [];
-				const getVoicesLoop = setInterval(async () => {
-					voices = await speechSynthesis.getVoices();
-					if (voices.length > 0) {
-						clearInterval(getVoicesLoop);
-
-						const voice =
-							voices
-								?.filter(
-									(v) => v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
-								)
-								?.at(0) ?? undefined;
-
-						currentUtterance = new SpeechSynthesisUtterance(content);
-
-						if (voice) {
-							currentUtterance.voice = voice;
-						}
-
-						speechSynthesis.speak(currentUtterance);
-						currentUtterance.onend = async (e) => {
-							await new Promise((r) => setTimeout(r, 100));
-							resolve(e);
-						};
-					}
-				}, 100);
-			});
-		} else {
-			return Promise.resolve();
-		}
-	};
-
-	const playAudio = (audio) => {
-		if ($showCallOverlay) {
-			return new Promise((resolve) => {
-				const audioElement = document.getElementById('audioElement');
-
-				if (audioElement) {
-					audioElement.src = audio.src;
-					audioElement.muted = true;
-
-					audioElement
-						.play()
-						.then(() => {
-							audioElement.muted = false;
-						})
-						.catch((error) => {
-							console.error(error);
-						});
-
-					audioElement.onended = async (e) => {
-						await new Promise((r) => setTimeout(r, 100));
-						resolve(e);
-					};
-				}
-			});
-		} else {
-			return Promise.resolve();
-		}
-	};
-
-	const playAudioHandler = async () => {
-		console.log('playAudioHandler', audioQueue, assistantSpeaking, audioQueue.length > 0);
-		if (!assistantSpeaking && !interrupted && audioQueue.length > 0) {
-			assistantSpeaking = true;
-
-			if ($settings?.showEmojiInCall ?? false) {
-				if (emojiQueue.length > 0) {
-					emoji = emojiQueue.shift();
-					emojiQueue = emojiQueue;
-				}
-			}
-
-			const audioToPlay = audioQueue.shift(); // Shift the audio out from queue before playing.
-			audioQueue = audioQueue;
-			await playAudio(audioToPlay);
-			assistantSpeaking = false;
-		}
-	};
-
-	const setContentAudio = async (content, idx) => {
-		if (assistantSentenceAudios[idx] === undefined) {
-			// Wait for the previous audio to be loaded
-			if (idx > 0) {
-				await new Promise((resolve) => {
-					const check = setInterval(() => {
-						if (
-							assistantSentenceAudios[idx - 1] !== undefined &&
-							assistantSentenceAudios[idx - 1] !== null
-						) {
-							clearInterval(check);
-							resolve();
-						}
-					}, 100);
-				});
-			}
-
-			assistantSentenceAudios[idx] = null;
-
-			if ($settings?.showEmojiInCall ?? false) {
-				const sentenceEmoji = await generateEmoji(localStorage.token, modelId, content);
-
-				if (sentenceEmoji) {
-					// Big red text with content and emoji
-					console.log('%c%s', 'color: blue; font-size: 10px;', `${sentenceEmoji}: ${content}`);
-
-					if (/\p{Extended_Pictographic}/u.test(sentenceEmoji)) {
-						emojiQueue.push(sentenceEmoji.match(/\p{Extended_Pictographic}/gu)[0]);
-						emojiQueue = emojiQueue;
-					}
-				}
-
-				await tick();
-			}
-
-			const res = await synthesizeOpenAISpeech(
-				localStorage.token,
-				$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
-				content
-			).catch((error) => {
-				toast.error(error);
-				assistantSpeaking = false;
-				return null;
-			});
-
-			if (res) {
-				const blob = await res.blob();
-				const blobUrl = URL.createObjectURL(blob);
-				const audio = new Audio(blobUrl);
-				assistantSentenceAudios[idx] = audio;
-
-				console.log('%c%s', 'color: red; font-size: 20px;', content);
-
-				audioQueue.push(audio);
-				audioQueue = audioQueue;
-			}
-		}
-	};
-
 	const stopRecordingCallback = async (_continue = true) => {
 		if ($showCallOverlay) {
 			console.log('%c%s', 'color: red; font-size: 20px;', '🚨 stopRecordingCallback 🚨');
@ -489,107 +230,315 @@
 		mediaRecorder.start();
 	};

-	const resetAssistantMessage = async () => {
-		interrupted = false;
-
-		assistantMessage = '';
-		assistantSentenceIdx = -1;
-		assistantSentenceAudios = {}; // Reset audio tracking
-		audioQueue = []; // Clear the audio queue
-		audioQueue = audioQueue;
-
-		emoji = null;
-		emojiQueue = [];
-		emojiQueue = emojiQueue;
+	/**
+	 * Root-mean-square level of a buffer of unsigned 8-bit time-domain samples.
+	 * Each byte is re-centred on 128 and scaled by 1/128 before it is squared.
+	 */
+	const calculateRMS = (data: Uint8Array) => {
+		let energy = 0;
+		for (const sample of data) {
+			const centred = (sample - 128) / 128;
+			energy += centred * centred;
+		}
+		return Math.sqrt(energy / data.length);
+	};

-	$: (async () => {
+	/**
+	 * Watches a microphone stream for speech and for the silence that ends it.
+	 *
+	 * On every animation frame this samples the analyser, publishes the RMS level
+	 * to `rmsLevel`, and records when sound was last heard. The first frame that
+	 * carries sound calls `stopAllAudio()`; once more than 2000 ms pass without
+	 * sound after speech began, `confirmed` is set and the recorder is stopped.
+	 * The loop ends when `mediaRecorder` is falsy or the call overlay is hidden.
+	 *
+	 * @param stream - media stream passed to `AudioContext.createMediaStreamSource`
+	 */
+	const analyseAudio = (stream) => {
+		const audioContext = new AudioContext();
+		const audioStreamSource = audioContext.createMediaStreamSource(stream);
+
+		const analyser = audioContext.createAnalyser();
+		analyser.minDecibels = MIN_DECIBELS;
+		audioStreamSource.connect(analyser);
+
+		const bufferLength = analyser.frequencyBinCount;
+
+		const domainData = new Uint8Array(bufferLength);
+		const timeDomainData = new Uint8Array(analyser.fftSize);
+
+		let lastSoundTime = Date.now();
+		hasStartedSpeaking = false;
+
+		// A context keeps its system audio resources until it is closed, and a new
+		// one is created on every call, so release it once the loop is done.
+		const releaseAudioContext = () => {
+			audioContext.close().catch((error) => {
+				console.error(error);
+			});
+		};
+
+		const detectSound = () => {
+			const processFrame = () => {
+				if (!mediaRecorder || !$showCallOverlay) {
+					// Not rescheduling below is what ends the loop.
+					releaseAudioContext();
+					return;
+				}
+
+				analyser.getByteTimeDomainData(timeDomainData);
+				analyser.getByteFrequencyData(domainData);
+
+				// Calculate RMS level from time domain data
+				rmsLevel = calculateRMS(timeDomainData);
+
+				// Check if initial speech/noise has started
+				const hasSound = domainData.some((value) => value > 0);
+				if (hasSound) {
+					// BIG RED TEXT
+					console.log('%c%s', 'color: red; font-size: 20px;', '🔊 Sound detected');
+
+					// Interrupt playback only once, on the first frame with sound.
+					if (!hasStartedSpeaking) {
+						hasStartedSpeaking = true;
+						stopAllAudio();
+					}
+
+					lastSoundTime = Date.now();
+				}
+
+				// Start silence detection only after initial speech/noise has been detected
+				if (hasStartedSpeaking) {
+					if (Date.now() - lastSoundTime > 2000) {
+						confirmed = true;
+
+						if (mediaRecorder) {
+							mediaRecorder.stop();
+						}
+					}
+				}
+
+				window.requestAnimationFrame(processFrame);
+			};
+
+			window.requestAnimationFrame(processFrame);
+		};
+
+		detectSound();
+	};
+
+	let finishedMessages = {};
+	let currentMessageId = null;
+	let currentUtterance = null;
+
+	/**
+	 * Speaks `content` with the browser's SpeechSynthesis API.
+	 *
+	 * Polls `speechSynthesis.getVoices()` every 100 ms until at least one voice is
+	 * listed, then picks the voice whose `voiceURI` equals the TTS voice from
+	 * `$settings` (falling back to `$config`). The utterance is kept in
+	 * `currentUtterance`, which `stopAllAudio` checks before cancelling speech.
+	 *
+	 * @param content - text to speak
+	 * @returns a promise resolved with the `end` event roughly 100 ms after the
+	 *   utterance ends, or resolved immediately when the call overlay is hidden
+	 */
+	const speakSpeechSynthesisHandler = (content) => {
 		if ($showCallOverlay) {
-			await resetAssistantMessage();
-			await tick();
-			startRecording();
+			return new Promise((resolve) => {
+				let voices = [];
+				// NOTE(review): this interval is only cleared once a voice is listed,
+				// and the promise only settles from `onend` — confirm both are acceptable.
+				const getVoicesLoop = setInterval(async () => {
+					voices = await speechSynthesis.getVoices();
+					if (voices.length > 0) {
+						clearInterval(getVoicesLoop);
+
+						// No match leaves `voice` undefined, so the utterance keeps its default voice.
+						const voice =
+							voices
+								?.filter(
+									(v) => v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
+								)
+								?.at(0) ?? undefined;
+
+						currentUtterance = new SpeechSynthesisUtterance(content);
+
+						if (voice) {
+							currentUtterance.voice = voice;
+						}
+
+						speechSynthesis.speak(currentUtterance);
+						currentUtterance.onend = async (e) => {
+							// Short pause after the utterance before reporting completion.
+							await new Promise((r) => setTimeout(r, 100));
+							resolve(e);
+						};
+					}
+				}, 100);
+			});
 		} else {
-			stopCamera();
-			stopAllAudio();
-			stopRecordingCallback(false);
+			return Promise.resolve();
 		}
-	})();
+	};

-	$: {
-		if (audioQueue.length > 0 && !assistantSpeaking) {
-			playAudioHandler();
-		}
-	}
+	/**
+	 * Plays one clip through the shared `#audioElement` tag.
+	 *
+	 * Playback starts muted and is unmuted once `play()` has been accepted.
+	 *
+	 * @param audio - object whose `src` is copied onto the shared element
+	 * @returns a promise resolved with the `ended` event roughly 100 ms after the
+	 *   clip finishes. It resolves with no value right away when the overlay is
+	 *   hidden, when the element is missing, or when `play()` is refused for any
+	 *   reason other than being aborted.
+	 */
+	const playAudio = (audio) => {
+		if ($showCallOverlay) {
+			return new Promise((resolve) => {
+				const audioElement = document.getElementById('audioElement');

-	onMount(() => {
-		eventTarget.addEventListener('chat:start', async (e) => {
-			if ($showCallOverlay) {
-				console.log('Chat start event:', e);
-				await resetAssistantMessage();
-				await tick();
-				chatStreaming = true;
-			}
-		});
+				if (audioElement) {
+					audioElement.src = audio.src;
+					audioElement.muted = true;

-		eventTarget.addEventListener('chat', async (e) => {
-			if ($showCallOverlay) {
-				const { content } = e.detail;
-				assistantMessage += content;
-				await tick();
+					audioElement
+						.play()
+						.then(() => {
+							audioElement.muted = false;
+						})
+						.catch((error) => {
+							console.error(error);
+
+							// A refused clip never fires `ended`, so settle here or the
+							// awaiting playback loop stalls on it. An aborted play() is
+							// what pausing a still-loading clip produces; leave that
+							// pending, exactly as a clip paused mid-playback is.
+							if (error?.name !== 'AbortError') {
+								resolve();
+							}
+						});

-				if (!interrupted) {
-					if ($config.audio.tts.engine !== '') {
-						assistantSentenceIdx = assistantSentences.length - 2;
-
-						if (assistantSentenceIdx >= 0 && !assistantSentenceAudios[assistantSentenceIdx]) {
-							await tick();
-							setContentAudio(assistantSentences[assistantSentenceIdx], assistantSentenceIdx);
-						}
-					}
+					audioElement.onended = async (e) => {
+						await new Promise((r) => setTimeout(r, 100));
+						resolve(e);
+					};
+				} else {
+					// Nothing to play through; settle instead of hanging forever.
+					resolve();
 				}
+			});
+		} else {
+			return Promise.resolve();
+		}
+	};

-				chatStreaming = true;
+	/**
+	 * Interrupts the assistant: flags the interruption, stops a response that is
+	 * still streaming, cancels browser speech synthesis and rewinds the shared
+	 * `#audioElement` player.
+	 */
+	const stopAllAudio = async () => {
+		interrupted = true;
+
+		// Only an in-flight completion needs to be stopped.
+		if (chatStreaming) stopResponse();
+
+		// Drop the utterance handle once synthesis has been cancelled.
+		if (currentUtterance) {
+			speechSynthesis.cancel();
+			currentUtterance = null;
+		}
+
+		const player = document.getElementById('audioElement');
+		if (!player) {
+			return;
+		}
+
+		player.pause();
+		player.currentTime = 0;
+	};
+
+	let audioAbortController = new AbortController();
+
+	// Audio cache map where key is the content and value is the Audio object.
+	const audioCache = new Map();
+	/**
+	 * Synthesizes speech for `content` unless `audioCache` already holds an entry
+	 * for that exact text.
+	 *
+	 * A successful response is turned into a blob URL, wrapped in an `Audio`
+	 * object and stored under `content`. Failures are logged and leave no cache
+	 * entry.
+	 *
+	 * @param content - sentence to synthesize; also used as the cache key
+	 * @returns the cached `Audio`, or `undefined` when nothing could be synthesized
+	 */
+	const fetchAudio = async (content) => {
+		if (!audioCache.has(content)) {
+			try {
+				const res = await synthesizeOpenAISpeech(
+					localStorage.token,
+					$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
+					content
+				).catch((error) => {
+					// A rejected request becomes `null` so the `if (res)` below skips it.
+					console.error(error);
+					return null;
+				});
+
+				if (res) {
+					const blob = await res.blob();
+					// NOTE(review): this blob URL is never revoked in the visible code.
+					const blobUrl = URL.createObjectURL(blob);
+					audioCache.set(content, new Audio(blobUrl));
+				}
+			} catch (error) {
+				console.error('Error synthesizing speech:', error);
 			}
-		});
+		}
+		return audioCache.get(content);
+	};

-		eventTarget.addEventListener('chat:finish', async (e) => {
-			if ($showCallOverlay) {
-				chatStreaming = false;
-				loading = false;
+	let messages = {};

-				console.log('Chat finish event:', e);
-				await tick();
+	/**
+	 * Playback loop for one assistant message.
+	 *
+	 * Repeatedly takes the next queued sentence from `messages[id]` and plays its
+	 * audio from `audioCache`, in queue order. A sentence whose audio is not
+	 * cached yet is put back at the front of the queue and retried after 200 ms.
+	 * The loop ends when `signal` is aborted, or when `finishedMessages[id]` is
+	 * set and the queue exists and is empty.
+	 *
+	 * @param id - message id whose sentence queue is played
+	 * @param signal - abort signal checked at the top of every iteration
+	 */
+	const monitorAndPlayAudio = async (id, signal) => {
+		while (!signal.aborted) {
+			if (messages[id] && messages[id].length > 0) {
+				// Retrieve the next content string from the queue
+				const content = messages[id].shift(); // Dequeues the content for playing

-				if (!interrupted) {
-					if ($config.audio.tts.engine !== '') {
-						for (const [idx, sentence] of assistantSentences.entries()) {
-							if (!assistantSentenceAudios[idx]) {
-								await tick();
-								setContentAudio(sentence, idx);
-							}
-						}
-					} else {
-						if ($settings?.showEmojiInCall ?? false) {
-							const res = await generateEmoji(localStorage.token, modelId, assistantMessage);
+				if (audioCache.has(content)) {
+					// If content is available in the cache, play it
+					try {
+						console.log(
+							'%c%s',
+							'color: red; font-size: 20px;',
+							`Playing audio for content: ${content}`
+						);

-							if (res) {
-								console.log(res);
-								if (/\p{Extended_Pictographic}/u.test(res)) {
-									emoji = res.match(/\p{Extended_Pictographic}/gu)[0];
-								}
-							}
+						const audio = audioCache.get(content);
+						await playAudio(audio); // Block until this sentence is done so sentences never overlap
+						console.log(`Played audio for content: ${content}`);
+						await new Promise((resolve) => setTimeout(resolve, 200)); // Short gap before the next sentence
+					} catch (error) {
+						console.error('Error playing audio:', error);
+					}
+				} else {
+					// If not available in the cache, push it back to the queue and delay
+					// NOTE(review): nothing here stops retrying if the audio never
+					// reaches the cache; only an aborted signal ends the wait.
+					messages[id].unshift(content); // Re-queue the content at the start
+					console.log(`Audio for "${content}" not yet available in the cache, re-queued...`);
+					await new Promise((resolve) => setTimeout(resolve, 200)); // Wait before retrying to reduce tight loop
+				}
+			} else if (finishedMessages[id] && messages[id] && messages[id].length === 0) {
+				// If the message is finished and there are no more messages to process, break the loop
+				// NOTE(review): a finished message with no queue at all (`messages[id]`
+				// undefined) does not match this branch and keeps polling until aborted.
+				break;
+			} else {
+				// No messages to process, sleep for a bit
+				await new Promise((resolve) => setTimeout(resolve, 200));
+			}
+		}
+		console.log(`Audio monitoring and playing stopped for message ID ${id}`);
+	};
+
+	// Starts recording and subscribes to the chat events for as long as the
+	// overlay is mounted. The callback is deliberately NOT async: Svelte only
+	// registers the returned teardown when the callback returns a function
+	// synchronously, and an async callback returns a Promise instead.
+	onMount(() => {
+		startRecording();
+
+		// 'chat:start': a new assistant message begins streaming.
+		const chatStartHandler = async (e) => {
+			const { id } = e.detail;
+
+			chatStreaming = true;
+
+			if ($config.audio.tts.engine !== '') {
+				// set currentMessageId to id
+				if (currentMessageId !== id) {
+					console.log(`Received chat start event for message ID ${id}`);
+
+					currentMessageId = id;
+					// Stop the playback loop of the previous message before starting a new one.
+					if (audioAbortController) {
+						audioAbortController.abort();
+					}
+					audioAbortController = new AbortController();
+
+					// Start monitoring and playing audio for the message ID
+					monitorAndPlayAudio(id, audioAbortController.signal);
+				}
+			}
+		};
+
+		// 'chat': one sentence of the current message is ready to be spoken.
+		const chatEventHandler = async (e) => {
+			const { id, content } = e.detail;
+			// "id" here is message id
+			// if "id" is not the same as "currentMessageId" then do not process
+			// "content" here is a sentence from the assistant,
+			// there will be many sentences for the same "id"
+
+			if ($config.audio.tts.engine !== '') {
+				if (currentMessageId === id) {
+					console.log(`Received chat event for message ID ${id}: ${content}`);
+
+					try {
+						if (messages[id] === undefined) {
+							messages[id] = [content];
+						} else {
+							messages[id].push(content);
 						}

-						speakSpeechSynthesisHandler(assistantMessage);
+						console.log(content);
+
+						// Not awaited: synthesis runs in the background while the
+						// playback loop waits for the audio to appear in the cache.
+						fetchAudio(content);
+					} catch (error) {
+						console.error('Failed to fetch or play audio:', error);
 					}
 				}
 			}
-		});
+		};
+
+		// 'chat:finish': the message is complete.
+		const chatFinishHandler = async (e) => {
+			const { id, content } = e.detail;
+			// "content" here is the entire message from the assistant
+
+			chatStreaming = false;
+
+			if ($config.audio.tts.engine !== '') {
+				// Lets the playback loop exit once its queue has drained.
+				finishedMessages[id] = true;
+			} else {
+				// No TTS engine configured: speak the whole message with the browser.
+				speakSpeechSynthesisHandler(content);
+			}
+		};
+
+		eventTarget.addEventListener('chat:start', chatStartHandler);
+		eventTarget.addEventListener('chat', chatEventHandler);
+		eventTarget.addEventListener('chat:finish', chatFinishHandler);
+
+		// Teardown, run by Svelte when the overlay is unmounted.
+		return async () => {
+			eventTarget.removeEventListener('chat:start', chatStartHandler);
+			eventTarget.removeEventListener('chat', chatEventHandler);
+			eventTarget.removeEventListener('chat:finish', chatFinishHandler);
+
+			// With the listeners gone no finish event can end the playback loop,
+			// so abort it explicitly instead of leaving it polling.
+			audioAbortController?.abort();
+
+			await stopRecordingCallback(false);
+			await stopCamera();
+		};
 	});
 </script>

-<audio id="audioElement" src="" style="display: none;" />
-
 {#if $showCallOverlay}
 	<div class=" absolute w-full h-screen max-h-[100dvh] flex z-[999] overflow-hidden">
 		<div
--- a/src/lib/utils/index.ts
+++ b/src/lib/utils/index.ts
@ -443,6 +443,24 @@ export const extractSentences = (text) => {
 		.filter((sentence) => sentence !== '');
 };

+/**
+ * Splits text into chunks sized for speech synthesis.
+ *
+ * Works on the output of `extractSentences`: whenever the chunk collected last
+ * holds fewer than two whitespace-separated words, the next sentence is
+ * appended to it (joined by a single space) instead of starting a new chunk.
+ */
+export const extractSentencesForAudio = (text) => {
+	const chunks = [];
+
+	for (const sentence of extractSentences(text)) {
+		const tailIndex = chunks.length - 1;
+		const tailIsShort = tailIndex >= 0 && chunks[tailIndex].split(/\s+/).length < 2;
+
+		if (tailIsShort) {
+			chunks[tailIndex] = chunks[tailIndex] + ' ' + sentence;
+		} else {
+			chunks.push(sentence);
+		}
+	}
+
+	return chunks;
+};
+
 export const blobToFile = (blob, fileName) => {
 	// Create a new File object from the Blob
 	const file = new File([blob], fileName, { type: blob.type });