feat: Add control for how message content is split for TTS generation requests

2024-08-24 20:35:42 -04:00
parent f30428754f
commit 3967c34261
8 changed files with 277 additions and 157 deletions
--- a/src/lib/utils/index.ts
+++ b/src/lib/utils/index.ts
@@ -408,7 +408,7 @@ const convertOpenAIMessages = (convo) => {
 	let currentId = '';
 	let lastId = null;

-	for (let message_id in mapping) {
+	for (const message_id in mapping) {
 		const message = mapping[message_id];
 		currentId = message_id;
 		try {
@@ -442,7 +442,7 @@ const convertOpenAIMessages = (convo) => {
 		}
 	}

-	let history = {};
+	const history: Record<PropertyKey, (typeof messages)[number]> = {};
 	messages.forEach((obj) => (history[obj.id] = obj));

 	const chat = {
@@ -481,7 +481,7 @@ const validateChat = (chat) => {
 	}

 	// Every message's content should be a string
-	for (let message of messages) {
+	for (const message of messages) {
 		if (typeof message.content !== 'string') {
 			return false;
 		}
@@ -494,7 +494,7 @@ export const convertOpenAIChats = (_chats) => {
 	// Create a list of dictionaries with each conversation from import
 	const chats = [];
 	let failed = 0;
-	for (let convo of _chats) {
+	for (const convo of _chats) {
 		const chat = convertOpenAIMessages(convo);

 		if (validateChat(chat)) {
@@ -513,7 +513,7 @@ export const convertOpenAIChats = (_chats) => {
 	return chats;
 };

-export const isValidHttpUrl = (string) => {
+export const isValidHttpUrl = (string: string) => {
 	let url;

 	try {
@@ -525,7 +525,7 @@ export const isValidHttpUrl = (string) => {
 	return url.protocol === 'http:' || url.protocol === 'https:';
 };

-export const removeEmojis = (str) => {
+export const removeEmojis = (str: string) => {
 	// Regular expression to match emojis
 	const emojiRegex = /[\uD800-\uDBFF][\uDC00-\uDFFF]|\uD83C[\uDC00-\uDFFF]|\uD83D[\uDC00-\uDE4F]/g;

@@ -533,20 +533,24 @@ export const removeEmojis = (str) => {
 	return str.replace(emojiRegex, '');
 };

-export const removeFormattings = (str) => {
+export const removeFormattings = (str: string) => {
 	return str.replace(/(\*)(.*?)\1/g, '').replace(/(```)(.*?)\1/gs, '');
 };

-export const extractSentences = (text) => {
-	// This regular expression matches code blocks marked by triple backticks
-	const codeBlockRegex = /```[\s\S]*?```/g;
+export const prepareTextForTTS = (content: string) => {
+	return removeFormattings(removeEmojis(content.trim()));
+};

-	let codeBlocks = [];
+// This regular expression matches code blocks marked by triple backticks
+const codeBlockRegex = /```[\s\S]*?```/g;
+
+export const extractSentences = (text: string) => {
+	const codeBlocks: string[] = [];
 	let index = 0;

 	// Temporarily replace code blocks with placeholders and store the blocks separately
 	text = text.replace(codeBlockRegex, (match) => {
-		let placeholder = `\u0000${index}\u0000`; // Use a unique placeholder
+		const placeholder = `\u0000${index}\u0000`; // Use a unique placeholder
 		codeBlocks[index++] = match;
 		return placeholder;
 	});
@@ -561,11 +565,36 @@ export const extractSentences = (text) => {
 	});

 	return sentences
-		.map((sentence) => removeFormattings(removeEmojis(sentence.trim())))
-		.filter((sentence) => sentence);
+		.map(prepareTextForTTS)
+		.filter(Boolean);
 };

-export const extractSentencesForAudio = (text) => {
+export const extractParagraphsForAudio = (text: string) => {
+	const codeBlocks: string[] = [];
+	let index = 0;
+
+	// Temporarily replace code blocks with placeholders and store the blocks separately
+	text = text.replace(codeBlockRegex, (match) => {
+		const placeholder = `\u0000${index}\u0000`; // Use a unique placeholder
+		codeBlocks[index++] = match;
+		return placeholder;
+	});
+
+	// Split the modified text into paragraphs on runs of newlines; code blocks are placeholders at this point, so they are never split
+	let paragraphs = text.split(/\n+/);
+
+	// Restore code blocks and process paragraphs
+	paragraphs = paragraphs.map((paragraph) => {
+		// Swap every placeholder in the paragraph back for the code block it stands for
+		return paragraph.replace(/\u0000(\d+)\u0000/g, (_, idx) => codeBlocks[idx]);
+	});
+
+	return paragraphs
+		.map(prepareTextForTTS)
+		.filter(Boolean);
+};
+
+export const extractSentencesForAudio = (text: string) => {
 	return extractSentences(text).reduce((mergedTexts, currentText) => {
 		const lastIndex = mergedTexts.length - 1;
 		if (lastIndex >= 0) {
@@ -580,7 +609,7 @@ export const extractSentencesForAudio = (text) => {
 			mergedTexts.push(currentText);
 		}
 		return mergedTexts;
-	}, []);
+	}, [] as string[]);
 };

 export const blobToFile = (blob, fileName) => {