diff --git a/src/lib/components/chat/MessageInput/CallOverlay.svelte b/src/lib/components/chat/MessageInput/CallOverlay.svelte
index 89283b84b..45fa47619 100644
--- a/src/lib/components/chat/MessageInput/CallOverlay.svelte
+++ b/src/lib/components/chat/MessageInput/CallOverlay.svelte
@@ -2,8 +2,8 @@
 	import { settings, showCallOverlay } from '$lib/stores';
 	import { onMount, tick, getContext } from 'svelte';
 
-	import { blobToFile, calculateSHA256, findWordIndices } from '$lib/utils';
-	import { transcribeAudio } from '$lib/apis/audio';
+	import { blobToFile, calculateSHA256, extractSentences, findWordIndices } from '$lib/utils';
+	import { synthesizeOpenAISpeech, transcribeAudio } from '$lib/apis/audio';
 	import { toast } from 'svelte-sonner';
 
 	const i18n = getContext('i18n');
@@ -14,7 +14,8 @@
 	let confirmed = false;
 
 	let assistantSpeaking = false;
-	let assistantAudio = null;
+	let assistantAudio = {};
+	let assistantAudioIdx = null;
 
 	let rmsLevel = 0;
 	let hasStartedSpeaking = false;
@@ -26,6 +27,7 @@
 	let animationFrameId;
 
 	let speechRecognition;
+	let currentUtterance = null;
 
 	let mediaRecorder;
 	let audioChunks = [];
@@ -108,14 +110,7 @@
 			// Check if initial speech/noise has started
 			const hasSound = domainData.some((value) => value > 0);
 			if (hasSound) {
-				if (assistantSpeaking) {
-					speechSynthesis.cancel();
-
-					if (assistantAudio) {
-						assistantAudio.pause();
-						assistantAudio.currentTime = 0;
-					}
-				}
+				stopAllAudio();
 				hasStartedSpeaking = true;
 				lastSoundTime = Date.now();
 			}
@@ -140,6 +135,55 @@
 			detectSound();
 		};
 
+	const stopAllAudio = () => {
+		if (currentUtterance) {
+			speechSynthesis.cancel();
+			currentUtterance = null;
+		}
+		if (assistantAudio[assistantAudioIdx]) {
+			assistantAudio[assistantAudioIdx].pause();
+			assistantAudio[assistantAudioIdx].currentTime = 0;
+		}
+		assistantSpeaking = false;
+	};
+
+	const playAudio = (idx) => {
+		return new Promise((res) => {
+			assistantAudioIdx = idx;
+			const audio = assistantAudio[idx];
+			audio.play();
+			audio.onended = async (e) => {
+				await new Promise((r) => setTimeout(r, 300));
+
+				if (Object.keys(assistantAudio).length - 1 === idx) {
+					assistantSpeaking = false;
+				}
+
+				res(e);
+			};
+		});
+	};
+
+	const getOpenAISpeech = async (text) => {
+		const res = await synthesizeOpenAISpeech(
+			localStorage.token,
+			$settings?.audio?.speaker ?? 'alloy',
+			text,
+			$settings?.audio?.model ?? 'tts-1'
+		).catch((error) => {
+			toast.error(error);
+			assistantSpeaking = false;
+			return null;
+		});
+
+		if (res) {
+			const blob = await res.blob();
+			const blobUrl = URL.createObjectURL(blob);
+			const audio = new Audio(blobUrl);
+			assistantAudio[0] = audio; // keyed by index so stopAllAudio/playAudio can reach it
+		}
+	};
+
 	const transcribeHandler = async (audioBlob) => {
 		// Create a blob from the audio chunks
 
@@ -152,21 +196,68 @@
 		});
 
 		if (res) {
-			toast.success(res.text);
+			console.log(res.text);
 
 			const _responses = await submitPrompt(res.text);
 			console.log(_responses);
 
 			if (_responses.at(0)) {
-				const response = _responses[0];
-				if (response) {
-					assistantSpeaking = true;
+				const content = _responses[0];
+				if (content) {
+					assistantSpeakingHandler(content);
+				}
+			}
+		}
+	};
 
-					if ($settings?.audio?.TTSEngine ?? '') {
-						speechSynthesis.speak(new SpeechSynthesisUtterance(response));
+	const assistantSpeakingHandler = async (content) => {
+		assistantSpeaking = true;
+
+		if (($settings?.audio?.TTSEngine ?? '') == '') {
+			currentUtterance = new SpeechSynthesisUtterance(content);
+			speechSynthesis.speak(currentUtterance);
+		} else if ($settings?.audio?.TTSEngine === 'openai') {
+			console.log('openai');
+
+			const sentences = extractSentences(content).reduce((mergedTexts, currentText) => {
+				const lastIndex = mergedTexts.length - 1;
+				if (lastIndex >= 0) {
+					const previousText = mergedTexts[lastIndex];
+					const wordCount = previousText.split(/\s+/).length;
+					if (wordCount < 2) {
+						mergedTexts[lastIndex] = previousText + ' ' + currentText;
 					} else {
-						console.log('openai');
+						mergedTexts.push(currentText);
 					}
+				} else {
+					mergedTexts.push(currentText);
+				}
+				return mergedTexts;
+			}, []);
+
+			console.log(sentences);
+
+			let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately
+
+			for (const [idx, sentence] of sentences.entries()) {
+				const res = await synthesizeOpenAISpeech(
+					localStorage.token,
+					$settings?.audio?.speaker,
+					sentence,
+					$settings?.audio?.model
+				).catch((error) => {
+					toast.error(error);
+
+					assistantSpeaking = false;
+					return null;
+				});
+
+				if (res) {
+					const blob = await res.blob();
+					const blobUrl = URL.createObjectURL(blob);
+					const audio = new Audio(blobUrl);
+					assistantAudio[idx] = audio;
+					lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx));
+				}
 			}
 		}
 		}
@@ -311,7 +402,7 @@
 							{#if loading}
 								Thinking...
 							{:else}
-								Listening... {Math.round(rmsLevel * 100)}
+								Listening...
 							{/if}
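Two techniques in assistantSpeakingHandler are worth calling out: the reducer that folds very short fragments into the preceding sentence (so the TTS endpoint is never asked to voice a stray "Hi." or "1." on its own), and the promise chain that keeps playback strictly sequential while synthesis requests run ahead of it. The sketch below is a minimal, self-contained illustration of both patterns in plain browser JavaScript; `synthesize` and `splitSentences` are hypothetical stand-ins for synthesizeOpenAISpeech and extractSentences, not the component's real API.

	// Fold any fragment whose predecessor is shorter than two words into it,
	// mirroring the reducer in the diff above.
	const mergeShortSentences = (sentences) =>
		sentences.reduce((merged, current) => {
			const last = merged.length - 1;
			if (last >= 0 && merged[last].split(/\s+/).length < 2) {
				merged[last] = merged[last] + ' ' + current;
			} else {
				merged.push(current);
			}
			return merged;
		}, []);

	// Resolve once a clip has finished playing.
	const playClip = (audio) =>
		new Promise((resolve) => {
			audio.onended = resolve;
			audio.play();
		});

	// `synthesize(sentence)` is assumed to return an audio Blob or null.
	const speakSequentially = async (text, synthesize, splitSentences) => {
		let chain = Promise.resolve(); // an already-resolved promise seeds the chain

		for (const sentence of mergeShortSentences(splitSentences(text))) {
			const blob = await synthesize(sentence); // synthesis runs ahead of playback
			if (blob) {
				const audio = new Audio(URL.createObjectURL(blob));
				// Each clip queues behind the previous one, so audio order
				// always matches sentence order.
				chain = chain.then(() => playClip(audio));
			}
		}

		await chain; // settles after the final clip ends
	};

The chaining is what hides latency: awaiting playAudio(idx) inside the loop would serialize each synthesis request behind the previous clip's playback, whereas chaining lets the next request start as soon as the previous one returns.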