From 3967c34261d2f8cabbe000403327d4fc6648eaba Mon Sep 17 00:00:00 2001 From: kiosion Date: Sat, 24 Aug 2024 20:35:42 -0400 Subject: [PATCH] feat: Add control for how message content is split for TTS generation reqs --- backend/apps/audio/main.py | 6 + backend/config.py | 6 + backend/main.py | 1 + src/lib/apis/audio/index.ts | 6 +- .../components/admin/Settings/Audio.svelte | 61 +++- .../chat/Messages/ResponseMessage.svelte | 287 ++++++++++-------- src/lib/types/index.ts | 6 + src/lib/utils/index.ts | 61 +++- 8 files changed, 277 insertions(+), 157 deletions(-) diff --git a/backend/apps/audio/main.py b/backend/apps/audio/main.py index d66a9fa11..46be15364 100644 --- a/backend/apps/audio/main.py +++ b/backend/apps/audio/main.py @@ -37,6 +37,7 @@ from config import ( AUDIO_TTS_ENGINE, AUDIO_TTS_MODEL, AUDIO_TTS_VOICE, + AUDIO_TTS_SPLIT_ON, AppConfig, CORS_ALLOW_ORIGIN, ) @@ -72,6 +73,7 @@ app.state.config.TTS_ENGINE = AUDIO_TTS_ENGINE app.state.config.TTS_MODEL = AUDIO_TTS_MODEL app.state.config.TTS_VOICE = AUDIO_TTS_VOICE app.state.config.TTS_API_KEY = AUDIO_TTS_API_KEY +app.state.config.TTS_SPLIT_ON = AUDIO_TTS_SPLIT_ON # setting device type for whisper model whisper_device_type = DEVICE_TYPE if DEVICE_TYPE and DEVICE_TYPE == "cuda" else "cpu" @@ -88,6 +90,7 @@ class TTSConfigForm(BaseModel): ENGINE: str MODEL: str VOICE: str + SPLIT_ON: str class STTConfigForm(BaseModel): @@ -139,6 +142,7 @@ async def get_audio_config(user=Depends(get_admin_user)): "ENGINE": app.state.config.TTS_ENGINE, "MODEL": app.state.config.TTS_MODEL, "VOICE": app.state.config.TTS_VOICE, + "SPLIT_ON": app.state.config.TTS_SPLIT_ON, }, "stt": { "OPENAI_API_BASE_URL": app.state.config.STT_OPENAI_API_BASE_URL, @@ -159,6 +163,7 @@ async def update_audio_config( app.state.config.TTS_ENGINE = form_data.tts.ENGINE app.state.config.TTS_MODEL = form_data.tts.MODEL app.state.config.TTS_VOICE = form_data.tts.VOICE + app.state.config.TTS_SPLIT_ON = form_data.tts.SPLIT_ON app.state.config.STT_OPENAI_API_BASE_URL = form_data.stt.OPENAI_API_BASE_URL app.state.config.STT_OPENAI_API_KEY = form_data.stt.OPENAI_API_KEY @@ -173,6 +178,7 @@ async def update_audio_config( "ENGINE": app.state.config.TTS_ENGINE, "MODEL": app.state.config.TTS_MODEL, "VOICE": app.state.config.TTS_VOICE, + "SPLIT_ON": app.state.config.TTS_SPLIT_ON, }, "stt": { "OPENAI_API_BASE_URL": app.state.config.STT_OPENAI_API_BASE_URL, diff --git a/backend/config.py b/backend/config.py index adb2e1fd5..5cf8ba21a 100644 --- a/backend/config.py +++ b/backend/config.py @@ -1484,3 +1484,9 @@ AUDIO_TTS_VOICE = PersistentConfig( "audio.tts.voice", os.getenv("AUDIO_TTS_VOICE", "alloy"), # OpenAI default voice ) + +AUDIO_TTS_SPLIT_ON = PersistentConfig( + "AUDIO_TTS_SPLIT_ON", + "audio.tts.split_on", + os.getenv("AUDIO_TTS_SPLIT_ON", "punctuation"), +) diff --git a/backend/main.py b/backend/main.py index b8ed68111..349afe25f 100644 --- a/backend/main.py +++ b/backend/main.py @@ -1924,6 +1924,7 @@ async def get_app_config(request: Request): "tts": { "engine": audio_app.state.config.TTS_ENGINE, "voice": audio_app.state.config.TTS_VOICE, + "split_on": audio_app.state.config.TTS_SPLIT_ON, }, "stt": { "engine": audio_app.state.config.STT_ENGINE, diff --git a/src/lib/apis/audio/index.ts b/src/lib/apis/audio/index.ts index af09af990..5cd6ab949 100644 --- a/src/lib/apis/audio/index.ts +++ b/src/lib/apis/audio/index.ts @@ -132,7 +132,11 @@ export const synthesizeOpenAISpeech = async ( return res; }; -export const getModels = async (token: string = '') => { +interface 
AvailableModelsResponse {
+	models: { name: string; id: string }[] | { id: string }[];
+}
+
+export const getModels = async (token: string = ''): Promise<AvailableModelsResponse> => {
 	let error = null;
 
 	const res = await fetch(`${AUDIO_API_BASE_URL}/models`, {
diff --git a/src/lib/components/admin/Settings/Audio.svelte b/src/lib/components/admin/Settings/Audio.svelte
index 7c3300568..7302558be 100644
--- a/src/lib/components/admin/Settings/Audio.svelte
+++ b/src/lib/components/admin/Settings/Audio.svelte
@@ -10,31 +10,36 @@
 		getModels as _getModels,
 		getVoices as _getVoices
 	} from '$lib/apis/audio';
-	import { user, settings, config } from '$lib/stores';
+	import { config } from '$lib/stores';
 
 	import SensitiveInput from '$lib/components/common/SensitiveInput.svelte';
 
-	const i18n = getContext('i18n');
+	import { TTS_RESPONSE_SPLIT } from '$lib/types';
 
-	export let saveHandler: Function;
+	import type { Writable } from 'svelte/store';
+	import type { i18n as i18nType } from 'i18next';
+
+	const i18n = getContext<Writable<i18nType>>('i18n');
+
+	export let saveHandler: () => void;
 
 	// Audio
-
 	let TTS_OPENAI_API_BASE_URL = '';
 	let TTS_OPENAI_API_KEY = '';
 	let TTS_API_KEY = '';
 	let TTS_ENGINE = '';
 	let TTS_MODEL = '';
 	let TTS_VOICE = '';
+	let TTS_SPLIT_ON: TTS_RESPONSE_SPLIT = TTS_RESPONSE_SPLIT.PUNCTUATION;
 
 	let STT_OPENAI_API_BASE_URL = '';
 	let STT_OPENAI_API_KEY = '';
 	let STT_ENGINE = '';
 	let STT_MODEL = '';
 
-	let voices = [];
-	let models = [];
-	let nonLocalVoices = false;
+	// eslint-disable-next-line no-undef
+	let voices: SpeechSynthesisVoice[] = [];
+	let models: Awaited<ReturnType<typeof _getModels>>['models'] = [];
 
 	const getModels = async () => {
 		if (TTS_ENGINE === '') {
@@ -53,8 +58,8 @@
 
 	const getVoices = async () => {
 		if (TTS_ENGINE === '') {
-			const getVoicesLoop = setInterval(async () => {
-				voices = await speechSynthesis.getVoices();
+			const getVoicesLoop = setInterval(() => {
+				voices = speechSynthesis.getVoices();
 
 				// do your loop
 				if (voices.length > 0) {
@@ -81,7 +86,8 @@
 					API_KEY: TTS_API_KEY,
 					ENGINE: TTS_ENGINE,
 					MODEL: TTS_MODEL,
-					VOICE: TTS_VOICE
+					VOICE: TTS_VOICE,
+					SPLIT_ON: TTS_SPLIT_ON
 				},
 				stt: {
 					OPENAI_API_BASE_URL: STT_OPENAI_API_BASE_URL,
@@ -92,9 +98,8 @@
 		});
 
 		if (res) {
-			toast.success($i18n.t('Audio settings updated successfully'));
-
-			config.set(await getBackendConfig());
+			saveHandler();
+			getBackendConfig().then(config.set).catch(() => {});
 		}
 	};
 
@@ -111,6 +116,8 @@
 			TTS_MODEL = res.tts.MODEL;
 			TTS_VOICE = res.tts.VOICE;
 
+			TTS_SPLIT_ON = res.tts.SPLIT_ON || TTS_RESPONSE_SPLIT.PUNCTUATION;
+
 			STT_OPENAI_API_BASE_URL = res.stt.OPENAI_API_BASE_URL;
 			STT_OPENAI_API_KEY = res.stt.OPENAI_API_KEY;
 
@@ -139,7 +146,7 @@
 							<div class=" mb-1 text-sm font-medium">{$i18n.t('Speech-to-Text Engine')}</div>
 							<div class="flex items-center relative">
 								<select
 									class="dark:bg-gray-900 w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
 									bind:value={STT_ENGINE}
 									on:change={(e) => {
@@ -203,7 +210,7 @@
 								await getVoices();
 								await getModels();
 
-								if (e.target.value === 'openai') {
+								if (e.target?.value === 'openai') {
 									TTS_VOICE = 'alloy';
 									TTS_MODEL = 'tts-1';
 								} else {
@@ -351,6 +358,28 @@
 				{/if}
 
+
+				<hr class=" dark:border-gray-850" />
+
+				<div>
+					<div class=" mb-1 text-sm font-medium">{$i18n.t('Response splitting')}</div>
+					<div class="flex items-center relative">
+						<select
+							class="dark:bg-gray-900 w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
+							bind:value={TTS_SPLIT_ON}
+						>
+							{#each Object.values(TTS_RESPONSE_SPLIT) as split}
+								<option value={split}>{$i18n.t(split)}</option>
+							{/each}
+						</select>
+					</div>
+
+					<div class="mt-1 mb-1 text-xs text-gray-400 dark:text-gray-500">
+						{$i18n.t(
+							"Choose how to split response text for speech synthesis. 'Punctuation' splits by sentences, 'paragraphs' splits by paragraphs, and 'none' sends the response as a single string."
+						)}
+					</div>
+				</div>
 			</div>
 		</div>
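Note: saving the form above round-trips the whole TTS/STT config, with the new `SPLIT_ON` field riding along. A minimal sketch of that call, assuming the update route is mounted at `${AUDIO_API_BASE_URL}/config/update` and using the field names from the `TTSConfigForm`/`STTConfigForm` models in this patch; `setResponseSplitting` and the empty-string values are illustrative placeholders, not code from this PR (the component itself goes through the existing update helper in `$lib/apis/audio`):

```ts
import { AUDIO_API_BASE_URL } from '$lib/constants';
import { TTS_RESPONSE_SPLIT } from '$lib/types';

// Hypothetical helper: update only the response-splitting mode.
// A real caller would echo back the currently configured values
// instead of the '' placeholders below.
export const setResponseSplitting = async (token: string, splitOn: TTS_RESPONSE_SPLIT) => {
	const res = await fetch(`${AUDIO_API_BASE_URL}/config/update`, {
		method: 'POST',
		headers: {
			'Content-Type': 'application/json',
			Authorization: `Bearer ${token}`
		},
		body: JSON.stringify({
			tts: {
				OPENAI_API_BASE_URL: '',
				OPENAI_API_KEY: '',
				API_KEY: '',
				ENGINE: '',
				MODEL: '',
				VOICE: '',
				SPLIT_ON: splitOn // e.g. TTS_RESPONSE_SPLIT.PARAGRAPHS
			},
			stt: {
				OPENAI_API_BASE_URL: '',
				OPENAI_API_KEY: '',
				ENGINE: '',
				MODEL: ''
			}
		})
	});

	if (!res.ok) {
		throw new Error(`Audio config update failed: ${res.status}`);
	}

	return res.json();
};
```

The chat client then reads the saved value back from `/api/config` as `$config.audio.tts.split_on` (exposed in `backend/main.py` above) and switches on it in `ResponseMessage.svelte` below.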
diff --git a/src/lib/components/chat/Messages/ResponseMessage.svelte b/src/lib/components/chat/Messages/ResponseMessage.svelte
index eac388eb0..21daadb2c 100644
--- a/src/lib/components/chat/Messages/ResponseMessage.svelte
+++ b/src/lib/components/chat/Messages/ResponseMessage.svelte
@@ -2,11 +2,10 @@
 	import { toast } from 'svelte-sonner';
 	import dayjs from 'dayjs';
-	import { fade } from 'svelte/transition';
 	import { createEventDispatcher } from 'svelte';
 	import { onMount, tick, getContext } from 'svelte';
 
-	const i18n = getContext('i18n');
+	const i18n = getContext<Writable<i18nType>>('i18n');
 
 	const dispatch = createEventDispatcher();
 
@@ -15,20 +14,18 @@
 	import { imageGenerations } from '$lib/apis/images';
 	import {
 		approximateToHumanReadable,
-		extractSentences,
-		replaceTokens,
-		processResponseContent
+		extractParagraphsForAudio,
+		extractSentencesForAudio,
+		prepareTextForTTS
 	} from '$lib/utils';
 	import { WEBUI_BASE_URL } from '$lib/constants';
 
 	import Name from './Name.svelte';
 	import ProfileImage from './ProfileImage.svelte';
 	import Skeleton from './Skeleton.svelte';
-	import CodeBlock from './CodeBlock.svelte';
 	import Image from '$lib/components/common/Image.svelte';
 	import Tooltip from '$lib/components/common/Tooltip.svelte';
 	import RateComment from './RateComment.svelte';
-	import CitationsModal from '$lib/components/chat/Messages/CitationsModal.svelte';
 	import Spinner from '$lib/components/common/Spinner.svelte';
 	import WebSearchResults from './ResponseMessage/WebSearchResults.svelte';
 	import Sparkles from '$lib/components/icons/Sparkles.svelte';
@@ -36,7 +33,38 @@
 	import Error from './Error.svelte';
 	import Citations from './Citations.svelte';
 
-	export let message;
+	import type { Writable } from 'svelte/store';
+	import type { i18n as i18nType } from 'i18next';
+	import { TTS_RESPONSE_SPLIT } from '$lib/types';
+
+	interface MessageType {
+		id: string;
+		model: string;
+		content: string;
+		files?: { type: string; url: string }[];
+		timestamp: number;
+		role: string;
+		statusHistory?: { done: boolean; action: string; description: string; urls?: string[]; query?: string }[];
+		status?: { done: boolean; action: string; description: string; urls?: string[]; query?: string };
+		done: boolean;
+		error?: boolean | { content: string };
+		citations?: string[];
+		info?: {
+			openai?: boolean;
+			prompt_tokens?: number;
+			completion_tokens?: number;
+			total_tokens?: number;
+			eval_count?: number;
+			eval_duration?: number;
+			prompt_eval_count?: number;
+			prompt_eval_duration?: number;
+			total_duration?: number;
+			load_duration?: number;
+		};
+		annotation?: { type: string; rating: number };
+	}
+
+	export let message: MessageType;
 	export let siblings;
 
 	export let isLastMessage = true;
@@ -60,28 +88,33 @@
 	let editedContent = '';
 	let editTextAreaElement: HTMLTextAreaElement;
 
-	let sentencesAudio = {};
-	let speaking = null;
-	let speakingIdx = null;
+	let audioParts: Record<number, HTMLAudioElement | null> = {};
+	let speaking = false;
+	let speakingIdx: number | undefined;
 
 	let loadingSpeech = false;
 	let generatingImage = false;
 
 	let showRateComment = false;
 
-	const playAudio = (idx) => {
-		return new Promise((res) => {
+	const playAudio = (idx: number) => {
+		return new Promise<void>((res) => {
 			speakingIdx = idx;
-			const audio = sentencesAudio[idx];
+			const audio = audioParts[idx];
+
+			if (!audio) {
+				return res();
+			}
+
 			audio.play();
-			audio.onended = async (e) => {
+			audio.onended = async () => {
 				await new Promise((r) => setTimeout(r, 300));
 
-				if (Object.keys(sentencesAudio).length - 1 === idx) {
-					speaking = null;
+				if
(Object.keys(audioParts).length - 1 === idx) { + speaking = false; } - res(e); + res(); }; }); }; @@ -91,113 +124,119 @@ try { speechSynthesis.cancel(); - sentencesAudio[speakingIdx].pause(); - sentencesAudio[speakingIdx].currentTime = 0; + if (speakingIdx !== undefined && audioParts[speakingIdx]) { + audioParts[speakingIdx]!.pause(); + audioParts[speakingIdx]!.currentTime = 0; + } } catch {} - speaking = null; - speakingIdx = null; - } else { - if ((message?.content ?? '').trim() !== '') { - speaking = true; + speaking = false; + speakingIdx = undefined; + return; + } - if ($config.audio.tts.engine !== '') { - loadingSpeech = true; + if (!(message?.content ?? '').trim().length) { + toast.info($i18n.t('No content to speak')); + return; + } - const sentences = extractSentences(message.content).reduce((mergedTexts, currentText) => { - const lastIndex = mergedTexts.length - 1; - if (lastIndex >= 0) { - const previousText = mergedTexts[lastIndex]; - const wordCount = previousText.split(/\s+/).length; - if (wordCount < 2) { - mergedTexts[lastIndex] = previousText + ' ' + currentText; - } else { - mergedTexts.push(currentText); - } - } else { - mergedTexts.push(currentText); - } - return mergedTexts; - }, []); + speaking = true; - console.log(sentences); + if ($config.audio.tts.engine !== '') { + loadingSpeech = true; - if (sentences.length > 0) { - sentencesAudio = sentences.reduce((a, e, i, arr) => { - a[i] = null; - return a; - }, {}); + const preparedMessageContent: string[] = []; - let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately - - for (const [idx, sentence] of sentences.entries()) { - const res = await synthesizeOpenAISpeech( - localStorage.token, - $settings?.audio?.tts?.defaultVoice === $config.audio.tts.voice - ? ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice) - : $config?.audio?.tts?.voice, - sentence - ).catch((error) => { - toast.error(error); - - speaking = null; - loadingSpeech = false; - - return null; - }); - - if (res) { - const blob = await res.blob(); - const blobUrl = URL.createObjectURL(blob); - const audio = new Audio(blobUrl); - sentencesAudio[idx] = audio; - loadingSpeech = false; - lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx)); - } - } - } else { - speaking = null; - loadingSpeech = false; - } - } else { - let voices = []; - const getVoicesLoop = setInterval(async () => { - voices = await speechSynthesis.getVoices(); - if (voices.length > 0) { - clearInterval(getVoicesLoop); - - const voice = - voices - ?.filter( - (v) => - v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice) - ) - ?.at(0) ?? 
undefined; - - console.log(voice); - - const speak = new SpeechSynthesisUtterance(message.content); - - console.log(speak); - - speak.onend = () => { - speaking = null; - if ($settings.conversationMode) { - document.getElementById('voice-input-button')?.click(); - } - }; - - if (voice) { - speak.voice = voice; - } - - speechSynthesis.speak(speak); - } - }, 100); - } - } else { - toast.error($i18n.t('No content to speak')); + switch ($config.audio.tts.split_on) { + default: + case TTS_RESPONSE_SPLIT.PUNCTUATION: + preparedMessageContent.push(...extractSentencesForAudio(message.content)); + break; + case TTS_RESPONSE_SPLIT.PARAGRAPHS: + preparedMessageContent.push(...extractParagraphsForAudio(message.content)); + break; + case TTS_RESPONSE_SPLIT.NONE: + preparedMessageContent.push(prepareTextForTTS(message.content)); + break; } + + if (!preparedMessageContent.length) { + console.log('No content to speak'); + toast.info($i18n.t('No content to speak')); + + speaking = false; + loadingSpeech = false; + return; + } + + console.debug('Prepared message content for TTS', preparedMessageContent); + + audioParts = preparedMessageContent.reduce((acc, _sentence, idx) => { + acc[idx] = null; + return acc; + }, {} as typeof audioParts); + + let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately + + for (const [idx, sentence] of preparedMessageContent.entries()) { + const res = await synthesizeOpenAISpeech( + localStorage.token, + $settings?.audio?.tts?.defaultVoice === $config.audio.tts.voice + ? ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice) + : $config?.audio?.tts?.voice, + sentence + ).catch((error) => { + console.error(error); + toast.error(error); + + speaking = false; + loadingSpeech = false; + }); + + if (res) { + const blob = await res.blob(); + const blobUrl = URL.createObjectURL(blob); + const audio = new Audio(blobUrl); + audioParts[idx] = audio; + loadingSpeech = false; + lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx)); + } + } + } else { + let voices = []; + const getVoicesLoop = setInterval(() => { + voices = speechSynthesis.getVoices(); + if (voices.length > 0) { + clearInterval(getVoicesLoop); + + const voice = + voices + ?.filter( + (v) => + v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice) + ) + ?.at(0) ?? undefined; + + console.log(voice); + + const speak = new SpeechSynthesisUtterance(message.content); + + console.log(speak); + + speak.onend = () => { + speaking = false; + if ($settings.conversationMode) { + document.getElementById('voice-input-button')?.click(); + } + }; + + if (voice) { + speak.voice = voice; + } + + speechSynthesis.speak(speak); + } + }, 100); } }; @@ -230,7 +269,7 @@ await tick(); }; - const generateImage = async (message) => { + const generateImage = async (message: MessageType) => { generatingImage = true; const res = await imageGenerations(localStorage.token, message.content).catch((error) => { toast.error(error); @@ -285,7 +324,7 @@
- {#if (message?.files ?? []).filter((f) => f.type === 'image').length > 0} + {#if message?.files && message.files?.filter((f) => f.type === 'image').length > 0}
{#each message.files as file}
@@ -304,7 +343,7 @@ message?.statusHistory ?? [...(message?.status ? [message?.status] : [])] ).at(-1)}
- {#if status.done === false} + {#if status?.done === false}
@@ -521,7 +560,7 @@ : 'invisible group-hover:visible'} p-1.5 hover:bg-black/5 dark:hover:bg-white/5 rounded-lg dark:hover:text-white hover:text-black transition" on:click={() => { if (!loadingSpeech) { - toggleSpeakMessage(message); + toggleSpeakMessage(); } }} > @@ -661,7 +700,7 @@ `${ Math.round( ((message.info.eval_count ?? 0) / - (message.info.eval_duration / 1000000000)) * + ((message.info.eval_duration ?? 0) / 1000000000)) * 100 ) / 100 } tokens` ?? 'N/A' @@ -669,7 +708,7 @@ prompt_token/s: ${ Math.round( ((message.info.prompt_eval_count ?? 0) / - (message.info.prompt_eval_duration / 1000000000)) * + ((message.info.prompt_eval_duration ?? 0) / 1000000000)) * 100 ) / 100 ?? 'N/A' } tokens
@@ -688,7 +727,7 @@ eval_duration: ${ Math.round(((message.info.eval_duration ?? 0) / 1000000) * 100) / 100 ?? 'N/A' }ms
-							approximate_total: ${approximateToHumanReadable(message.info.total_duration)}`}
+							approximate_total: ${approximateToHumanReadable(message.info.total_duration ?? 0)}`}
 							placement="top"
 						>
diff --git a/src/lib/types/index.ts b/src/lib/types/index.ts
index 2d9156c8d..5b20e4e8b 100644
--- a/src/lib/types/index.ts
+++ b/src/lib/types/index.ts
@@ -7,3 +7,9 @@ export type Banner = {
 	dismissible?: boolean;
 	timestamp: number;
 };
+
+export enum TTS_RESPONSE_SPLIT {
+	PUNCTUATION = 'punctuation',
+	PARAGRAPHS = 'paragraphs',
+	NONE = 'none'
+}
diff --git a/src/lib/utils/index.ts b/src/lib/utils/index.ts
index 995712dfa..3acedd2ba 100644
--- a/src/lib/utils/index.ts
+++ b/src/lib/utils/index.ts
@@ -408,7 +408,7 @@ const convertOpenAIMessages = (convo) => {
 	let currentId = '';
 	let lastId = null;
 
-	for (let message_id in mapping) {
+	for (const message_id in mapping) {
 		const message = mapping[message_id];
 		currentId = message_id;
 		try {
@@ -442,7 +442,7 @@
 		}
 	}
 
-	let history = {};
+	const history: Record<PropertyKey, (typeof messages)[number]> = {};
 	messages.forEach((obj) => (history[obj.id] = obj));
 
 	const chat = {
@@ -481,7 +481,7 @@ const validateChat = (chat) => {
 	}
 
 	// Every message's content should be a string
-	for (let message of messages) {
+	for (const message of messages) {
 		if (typeof message.content !== 'string') {
 			return false;
 		}
@@ -494,7 +494,7 @@ export const convertOpenAIChats = (_chats) => {
 	// Create a list of dictionaries with each conversation from import
 	const chats = [];
 	let failed = 0;
-	for (let convo of _chats) {
+	for (const convo of _chats) {
 		const chat = convertOpenAIMessages(convo);
 
 		if (validateChat(chat)) {
@@ -513,7 +513,7 @@
 	return chats;
 };
 
-export const isValidHttpUrl = (string) => {
+export const isValidHttpUrl = (string: string) => {
 	let url;
 
 	try {
@@ -525,7 +525,7 @@
 	return url.protocol === 'http:' || url.protocol === 'https:';
 };
 
-export const removeEmojis = (str) => {
+export const removeEmojis = (str: string) => {
 	// Regular expression to match emojis
 	const emojiRegex = /[\uD800-\uDBFF][\uDC00-\uDFFF]|\uD83C[\uDC00-\uDFFF]|\uD83D[\uDC00-\uDE4F]/g;
 
@@ -533,20 +533,24 @@
 	return str.replace(emojiRegex, '');
 };
 
-export const removeFormattings = (str) => {
+export const removeFormattings = (str: string) => {
 	return str.replace(/(\*)(.*?)\1/g, '').replace(/(```)(.*?)\1/gs, '');
 };
 
-export const extractSentences = (text) => {
-	// This regular expression matches code blocks marked by triple backticks
-	const codeBlockRegex = /```[\s\S]*?```/g;
+export const prepareTextForTTS = (content: string) => {
+	return removeFormattings(removeEmojis(content.trim()));
+};
 
-	let codeBlocks = [];
+// This regular expression matches code blocks marked by triple backticks
+const codeBlockRegex = /```[\s\S]*?```/g;
+
+export const extractSentences = (text: string) => {
+	const codeBlocks: string[] = [];
 	let index = 0;
 
 	// Temporarily replace code blocks with placeholders and store the blocks separately
 	text = text.replace(codeBlockRegex, (match) => {
-		let placeholder = `\u0000${index}\u0000`; // Use a unique placeholder
+		const placeholder = `\u0000${index}\u0000`; // Use a unique placeholder
 		codeBlocks[index++] = match;
 		return placeholder;
 	});
@@ -561,11 +565,36 @@
 	});
 
 	return sentences
-		.map((sentence) => removeFormattings(removeEmojis(sentence.trim())))
-		.filter((sentence) => sentence);
+		.map(prepareTextForTTS)
+		.filter(Boolean);
 };
 
-export const extractSentencesForAudio = (text) => {
+export const extractParagraphsForAudio = (text: string) => {
+	const codeBlocks: string[] = [];
+	let index = 0;
+
+	// Temporarily replace code blocks with placeholders and store the blocks separately
+	text = text.replace(codeBlockRegex, (match) => {
+		const placeholder = `\u0000${index}\u0000`; // Use a unique placeholder
+		codeBlocks[index++] = match;
+		return placeholder;
+	});
+
+	// Split the modified text into paragraphs based on newlines, avoiding these blocks
+	let paragraphs = text.split(/\n+/);
+
+	// Restore code blocks and process paragraphs
+	paragraphs = paragraphs.map((paragraph) => {
+		// Check if the paragraph includes a placeholder for a code block
+		return paragraph.replace(/\u0000(\d+)\u0000/g, (_, idx) => codeBlocks[idx]);
+	});
+
+	return paragraphs
+		.map(prepareTextForTTS)
+		.filter(Boolean);
+};
+
+export const extractSentencesForAudio = (text: string) => {
 	return extractSentences(text).reduce((mergedTexts, currentText) => {
 		const lastIndex = mergedTexts.length - 1;
 		if (lastIndex >= 0) {
 			const previousText = mergedTexts[lastIndex];
 			const wordCount = previousText.split(/\s+/).length;
 			if (wordCount < 2) {
 				mergedTexts[lastIndex] = previousText + ' ' + currentText;
 			} else {
 				mergedTexts.push(currentText);
 			}
 		} else {
 			mergedTexts.push(currentText);
 		}
 		return mergedTexts;
-	}, []);
+	}, [] as string[]);
 };
 
 export const blobToFile = (blob, fileName) => {
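Taken together, the three `split_on` modes map onto the helpers above. A quick sketch of how they differ on the same response; the sample text and the logged results are illustrative, and the exact sentence boundaries assume the punctuation-based `text.split` inside `extractSentences`, which this patch leaves unchanged (it sits between the hunks shown):

```ts
import {
	extractParagraphsForAudio,
	extractSentencesForAudio,
	prepareTextForTTS
} from '$lib/utils';

const response = 'Here is the plan. First, install it.\n\nThen run it. Done!';

// 'punctuation': sentence-sized requests; extractSentencesForAudio merges a
// chunk into its predecessor when the predecessor is a single word, so the
// TTS engine is never sent one-word fragments.
console.log(extractSentencesForAudio(response));
// roughly: ['Here is the plan.', 'First, install it.', 'Then run it.', 'Done!']

// 'paragraphs': one request per newline-separated block.
console.log(extractParagraphsForAudio(response));
// roughly: ['Here is the plan. First, install it.', 'Then run it. Done!']

// 'none': the whole message as one cleaned string (trimmed, emojis and
// markdown formatting stripped), sent in a single request.
console.log(prepareTextForTTS(response));
```

In the two splitting modes, fenced code blocks are swapped out for `\u0000${index}\u0000` placeholders before splitting, so a block is never cut in half; in all three modes `removeFormattings` ultimately strips backtick-fenced content from the spoken text.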