refac

2024-11-17 14:02:51 +00:00 · 2024-06-13 01:28:15 -07:00 · 2024-06-13 01:28:15 -07:00 · 5300d2c531
commit 5300d2c531
parent d6fd2a8228
3 changed files with 362 additions and 345 deletions
--- a/backend/main.py
+++ b/backend/main.py
@ -887,7 +887,7 @@ async def generate_emoji(form_data: dict, user=Depends(get_verified_user)):
    model = app.state.MODELS[model_id]

    template = '''
-You are a perceptive assistant skilled at interpreting emotions from a provided message. Your task is to reflect the speaker's likely facial expression through a fitting emoji. Prioritize using diverse facial expression emojis to convey the nuanced emotions expressed in the text. Please avoid using generic or overly ambiguous emojis like "🤔", and instead, choose ones that vividly represent the speaker's mood or reaction.
+You are a perceptive assistant skilled at interpreting emotions from a provided message. Your task is to reflect the speaker's likely facial expression through a fitting emoji. Prioritize using diverse facial expression emojis to convey the nuanced emotions expressed in the text. Please choose ones that vividly represent the speaker's mood or reaction.

 Message: """{{prompt}}"""
 '''
--- a/src/lib/components/chat/Chat.svelte
+++ b/src/lib/components/chat/Chat.svelte
@ -1209,6 +1209,7 @@

 <CallOverlay
 	{submitPrompt}
+	{stopResponse}
 	bind:files
 	modelId={selectedModelIds?.at(0) ?? null}
 	chatId={$chatId}
--- a/src/lib/components/chat/MessageInput/CallOverlay.svelte
+++ b/src/lib/components/chat/MessageInput/CallOverlay.svelte
@ -14,16 +14,18 @@
 	const i18n = getContext('i18n');

 	export let eventTarget: EventTarget;
+
 	export let submitPrompt: Function;
+	export let stopResponse: Function;
+
 	export let files;

 	export let chatId;
 	export let modelId;

-	let message = '';
-
 	let loading = false;
 	let confirmed = false;
+	let interrupted = false;

 	let emoji = null;

@ -31,245 +33,16 @@
 	let cameraStream = null;

 	let assistantSpeaking = false;
-	let assistantAudio = {};
-	let assistantAudioIdx = null;

-	let rmsLevel = 0;
-	let hasStartedSpeaking = false;
+	// True from 'chat:start' / 'chat' events until 'chat:finish' arrives.
+	let chatStreaming = false;
+	// Assistant reply accumulated from the `content` of each 'chat' event.
+	let assistantMessage = '';
+	// Reply split into sentences; recomputed reactively from `assistantMessage`.
+	let assistantSentences = [];
+	// Sentence index -> synthesized Audio (null while a request is pending).
+	let assistantSentenceAudios = {};
+	// Index of the sentence most recently considered for synthesis (-1 = none yet).
+	let assistantSentenceIdx = -1;

-	let currentUtterance = null;
+	let audioQueue = [];

-	let mediaRecorder;
-	let audioChunks = [];
-
-	const MIN_DECIBELS = -45;
-	const VISUALIZER_BUFFER_LENGTH = 300;
-
-	// Function to calculate the RMS level from time domain data
-	const calculateRMS = (data: Uint8Array) => {
-		let sumSquares = 0;
-		for (let i = 0; i < data.length; i++) {
-			const normalizedValue = (data[i] - 128) / 128; // Normalize the data
-			sumSquares += normalizedValue * normalizedValue;
-		}
-		return Math.sqrt(sumSquares / data.length);
-	};
-
-	const normalizeRMS = (rms) => {
-		rms = rms * 10;
-		const exp = 1.5; // Adjust exponent value; values greater than 1 expand larger numbers more and compress smaller numbers more
-		const scaledRMS = Math.pow(rms, exp);
-
-		// Scale between 0.01 (1%) and 1.0 (100%)
-		return Math.min(1.0, Math.max(0.01, scaledRMS));
-	};
-
-	const analyseAudio = (stream) => {
-		const audioContext = new AudioContext();
-		const audioStreamSource = audioContext.createMediaStreamSource(stream);
-
-		const analyser = audioContext.createAnalyser();
-		analyser.minDecibels = MIN_DECIBELS;
-		audioStreamSource.connect(analyser);
-
-		const bufferLength = analyser.frequencyBinCount;
-
-		const domainData = new Uint8Array(bufferLength);
-		const timeDomainData = new Uint8Array(analyser.fftSize);
-
-		let lastSoundTime = Date.now();
-		hasStartedSpeaking = false;
-
-		const detectSound = () => {
-			const processFrame = () => {
-				if (!mediaRecorder || !$showCallOverlay) {
-					if (mediaRecorder) {
-						mediaRecorder.stop();
-					}
-
-					return;
-				}
-				analyser.getByteTimeDomainData(timeDomainData);
-				analyser.getByteFrequencyData(domainData);
-
-				// Calculate RMS level from time domain data
-				rmsLevel = calculateRMS(timeDomainData);
-
-				// Check if initial speech/noise has started
-				const hasSound = domainData.some((value) => value > 0);
-				if (hasSound) {
-					stopAllAudio();
-					hasStartedSpeaking = true;
-					lastSoundTime = Date.now();
-				}
-
-				// Start silence detection only after initial speech/noise has been detected
-				if (hasStartedSpeaking) {
-					if (Date.now() - lastSoundTime > 2000) {
-						confirmed = true;
-
-						if (mediaRecorder) {
-							mediaRecorder.stop();
-						}
-					}
-				}
-
-				window.requestAnimationFrame(processFrame);
-			};
-
-			window.requestAnimationFrame(processFrame);
-		};
-
-		detectSound();
-	};
-
-	const stopAllAudio = () => {
-		if (currentUtterance) {
-			speechSynthesis.cancel();
-			currentUtterance = null;
-		}
-		if (assistantAudio[assistantAudioIdx]) {
-			assistantAudio[assistantAudioIdx].pause();
-			assistantAudio[assistantAudioIdx].currentTime = 0;
-		}
-
-		const audioElement = document.getElementById('audioElement');
-
-		if (audioElement) {
-			audioElement.pause();
-			audioElement.currentTime = 0;
-		}
-		assistantSpeaking = false;
-	};
-
-	const playAudio = (idx) => {
-		if ($showCallOverlay) {
-			return new Promise((res) => {
-				assistantAudioIdx = idx;
-				const audioElement = document.getElementById('audioElement');
-				const audio = assistantAudio[idx];
-
-				if (audioElement) {
-					audioElement.src = audio.src; // Assume `assistantAudio` has objects with a `src` property
-
-					audioElement.muted = true;
-
-					audioElement
-						.play()
-						.then(() => {
-							audioElement.muted = false;
-						})
-						.catch((error) => {
-							toast.error(error);
-						});
-
-					audioElement.onended = async (e) => {
-						await new Promise((r) => setTimeout(r, 300));
-
-						if (Object.keys(assistantAudio).length - 1 === idx) {
-							assistantSpeaking = false;
-						}
-
-						res(e);
-					};
-				}
-			});
-		} else {
-			return Promise.resolve();
-		}
-	};
-
-	const getOpenAISpeech = async (text) => {
-		const res = await synthesizeOpenAISpeech(
-			localStorage.token,
-			$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
-			text
-		).catch((error) => {
-			toast.error(error);
-			assistantSpeaking = false;
-			return null;
-		});
-
-		if (res) {
-			const blob = await res.blob();
-			const blobUrl = URL.createObjectURL(blob);
-			const audio = new Audio(blobUrl);
-			assistantAudio = audio;
-		}
-	};
-
-	const transcribeHandler = async (audioBlob) => {
-		// Create a blob from the audio chunks
-
-		await tick();
-		const file = blobToFile(audioBlob, 'recording.wav');
-
-		const res = await transcribeAudio(localStorage.token, file).catch((error) => {
-			toast.error(error);
-			return null;
-		});
-
-		if (res) {
-			console.log(res.text);
-
-			if (res.text !== '') {
-				const _responses = await submitPrompt(res.text, { _raw: true });
-				console.log(_responses);
-			}
-		}
-	};
-
-	const assistantSpeakingHandler = async (content) => {
-		assistantSpeaking = true;
-
-		if (modelId && ($settings?.showEmojiInCall ?? false)) {
-			console.log('Generating emoji');
-			const res = await generateEmoji(localStorage.token, modelId, content, chatId).catch(
-				(error) => {
-					console.error(error);
-					return null;
-				}
-			);
-
-			if (res) {
-				console.log(res);
-				if (/\p{Extended_Pictographic}/u.test(res)) {
-					emoji = res.match(/\p{Extended_Pictographic}/gu)[0];
-				}
-			}
-		}
-
-		if (($config.audio.tts.engine ?? '') == '') {
-			let voices = [];
-			const getVoicesLoop = setInterval(async () => {
-				voices = await speechSynthesis.getVoices();
-				if (voices.length > 0) {
-					clearInterval(getVoicesLoop);
-
-					const voice =
-						voices
-							?.filter(
-								(v) => v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
-							)
-							?.at(0) ?? undefined;
-
-					currentUtterance = new SpeechSynthesisUtterance(content);
-
-					if (voice) {
-						currentUtterance.voice = voice;
-					}
-
-					speechSynthesis.speak(currentUtterance);
-
-					currentUtterance.onend = async () => {
-						assistantSpeaking = false;
-					};
-				}
-			}, 100);
-		} else if ($config.audio.tts.engine === 'openai') {
-			console.log('openai');
-
-			const sentences = extractSentences(content).reduce((mergedTexts, currentText) => {
+	$: assistantSentences = extractSentences(assistantMessage).reduce((mergedTexts, currentText) => {
 		const lastIndex = mergedTexts.length - 1;
 		if (lastIndex >= 0) {
 			const previousText = mergedTexts[lastIndex];
@ -285,94 +58,14 @@
 		return mergedTexts;
 	}, []);

-			console.log(sentences);
+	let currentUtterance = null;

-			let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately
+	let rmsLevel = 0;
+	let hasStartedSpeaking = false;
+	let mediaRecorder;
+	let audioChunks = [];

-			for (const [idx, sentence] of sentences.entries()) {
-				const res = await synthesizeOpenAISpeech(
-					localStorage.token,
-					$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
-					sentence
-				).catch((error) => {
-					toast.error(error);
-
-					assistantSpeaking = false;
-					return null;
-				});
-
-				if (res) {
-					const blob = await res.blob();
-					const blobUrl = URL.createObjectURL(blob);
-					const audio = new Audio(blobUrl);
-					assistantAudio[idx] = audio;
-					lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx));
-
-					if (idx === sentences.length - 1) {
-						lastPlayedAudioPromise.then(() => {
-							assistantSpeaking = false;
-						});
-					}
-				}
-			}
-		}
-	};
-
-	const stopRecordingCallback = async (_continue = true) => {
-		if ($showCallOverlay) {
-			if (confirmed) {
-				loading = true;
-				emoji = null;
-
-				if (cameraStream) {
-					const imageUrl = takeScreenshot();
-
-					files = [
-						{
-							type: 'image',
-							url: imageUrl
-						}
-					];
-				}
-
-				const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
-				await transcribeHandler(audioBlob);
-
-				confirmed = false;
-				loading = false;
-			}
-			audioChunks = [];
-			mediaRecorder = false;
-
-			if (_continue) {
-				startRecording();
-			}
-		} else {
-			audioChunks = [];
-			mediaRecorder = false;
-		}
-	};
-
-	const startRecording = async () => {
-		const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
-		mediaRecorder = new MediaRecorder(stream);
-		mediaRecorder.onstart = () => {
-			console.log('Recording started');
-			audioChunks = [];
-			analyseAudio(stream);
-		};
-		mediaRecorder.ondataavailable = (event) => {
-			if (hasStartedSpeaking) {
-				audioChunks.push(event.data);
-			}
-		};
-		mediaRecorder.onstop = async () => {
-			console.log('Recording stopped');
-
-			await stopRecordingCallback();
-		};
-		mediaRecorder.start();
-	};
+	$: console.log('hasStartedSpeaking', hasStartedSpeaking);

 	let videoInputDevices = [];
 	let selectedVideoInputDeviceId = null;
@ -475,6 +168,286 @@
 		camera = false;
 	};

+	// Assigned to the analyser's `minDecibels` when monitoring the microphone.
+	const MIN_DECIBELS = -45;
+	// NOTE(review): not referenced in the visible part of this change — confirm it is still used.
+	const VISUALIZER_BUFFER_LENGTH = 300;
+
+	/**
+	 * Computes the root-mean-square level of unsigned 8-bit time-domain samples.
+	 *
+	 * Each sample is centred on 128 and scaled by 128 before squaring, so the
+	 * result is 0 for a flat (all-128) buffer and 1 for an all-0 buffer.
+	 *
+	 * @param data Time-domain samples as filled by `getByteTimeDomainData`.
+	 * @returns The RMS level, or 0 for an empty buffer (instead of NaN from 0/0).
+	 */
+	const calculateRMS = (data: Uint8Array) => {
+		if (data.length === 0) {
+			return 0;
+		}
+
+		let sumSquares = 0;
+		for (const sample of data) {
+			const normalizedValue = (sample - 128) / 128; // Centre on zero and scale
+			sumSquares += normalizedValue * normalizedValue;
+		}
+		return Math.sqrt(sumSquares / data.length);
+	};
+
+	/**
+	 * Monitors a microphone stream for the recorder that is current when this
+	 * function is called.
+	 *
+	 * On every animation frame it updates `rmsLevel`, and whenever any frequency
+	 * bin is non-zero it marks `hasStartedSpeaking`, refreshes the last-sound
+	 * timestamp and interrupts assistant playback via `stopAllAudio()`. Once
+	 * speech has started and 2 seconds pass without sound, it sets `confirmed`
+	 * and stops the recorder.
+	 *
+	 * The loop ends when the overlay closes or `mediaRecorder` no longer refers
+	 * to the recorder this loop was started for. On exit it closes the
+	 * AudioContext and stops the stream's tracks so neither leaks per cycle.
+	 *
+	 * @param stream MediaStream obtained from `getUserMedia` for this recording.
+	 */
+	const analyseAudio = (stream) => {
+		const audioContext = new AudioContext();
+		const audioStreamSource = audioContext.createMediaStreamSource(stream);
+
+		const analyser = audioContext.createAnalyser();
+		analyser.minDecibels = MIN_DECIBELS;
+		audioStreamSource.connect(analyser);
+
+		const domainData = new Uint8Array(analyser.frequencyBinCount);
+		const timeDomainData = new Uint8Array(analyser.fftSize);
+
+		// Recorder this loop belongs to; a later recording must not be driven by
+		// this loop's (stale) analyser.
+		const recorder = mediaRecorder;
+
+		let lastSoundTime = Date.now();
+		hasStartedSpeaking = false;
+
+		// Releases everything tied to this recording session.
+		const cleanup = () => {
+			if (recorder && recorder.state !== 'inactive') {
+				// Orphaned recorder (overlay closed or superseded): stop it without
+				// re-entering the stop callback, which has already run or is not wanted.
+				recorder.onstop = null;
+				recorder.ondataavailable = null;
+				recorder.stop();
+			}
+
+			stream.getTracks().forEach((track) => track.stop());
+			audioStreamSource.disconnect();
+			audioContext.close().catch((error) => console.error(error));
+		};
+
+		const processFrame = () => {
+			if (!mediaRecorder || mediaRecorder !== recorder || !$showCallOverlay) {
+				cleanup();
+				return;
+			}
+
+			analyser.getByteTimeDomainData(timeDomainData);
+			analyser.getByteFrequencyData(domainData);
+
+			// Calculate RMS level from time domain data
+			rmsLevel = calculateRMS(timeDomainData);
+
+			// Any non-zero frequency bin counts as speech/noise
+			const hasSound = domainData.some((value) => value > 0);
+			if (hasSound) {
+				hasStartedSpeaking = true;
+				lastSoundTime = Date.now();
+				stopAllAudio();
+			}
+
+			// Silence detection only applies after initial speech/noise was heard
+			if (hasStartedSpeaking && Date.now() - lastSoundTime > 2000) {
+				confirmed = true;
+
+				// Frames keep running until the stop callback clears `mediaRecorder`,
+				// so avoid calling stop() on an already-inactive recorder.
+				if (recorder.state !== 'inactive') {
+					recorder.stop();
+				}
+			}
+
+			window.requestAnimationFrame(processFrame);
+		};
+
+		window.requestAnimationFrame(processFrame);
+	};
+
+	/**
+	 * Transcribes a recorded audio blob and, when the transcript is non-empty,
+	 * submits it as a prompt.
+	 *
+	 * Transcription errors are shown as a toast and end the call quietly.
+	 *
+	 * @param audioBlob Recorded audio to send for transcription.
+	 */
+	const transcribeHandler = async (audioBlob) => {
+		await tick();
+
+		const recording = blobToFile(audioBlob, 'recording.wav');
+		const transcription = await transcribeAudio(localStorage.token, recording).catch((error) => {
+			toast.error(error);
+			return null;
+		});
+
+		if (!transcription) {
+			return;
+		}
+
+		console.log(transcription.text);
+
+		if (transcription.text === '') {
+			return;
+		}
+
+		const responses = await submitPrompt(transcription.text, { _raw: true });
+		console.log(responses);
+	};
+
+	// Interrupts the assistant: flags the current reply as interrupted, stops a
+	// streaming response, cancels browser speech synthesis, empties the audio
+	// queue and rewinds the shared <audio> element.
+	const stopAllAudio = async () => {
+		// Stays set until the next 'chat:start' event resets it.
+		interrupted = true;
+
+		if (chatStreaming) {
+			stopResponse();
+		}
+
+		if (currentUtterance) {
+			speechSynthesis.cancel();
+			currentUtterance = null;
+		}
+
+		// NOTE(review): the queue is cleared between two ticks, presumably so
+		// pending reactive updates settle around the reset — confirm before reordering.
+		await tick();
+		audioQueue = [];
+		await tick();
+
+		const audioElement = document.getElementById('audioElement');
+		if (audioElement) {
+			audioElement.pause();
+			audioElement.currentTime = 0;
+		}
+
+		assistantSpeaking = false;
+	};
+
+	/**
+	 * Speaks `content` with the browser's speech synthesis.
+	 *
+	 * Polls every 100 ms until voices are available, then speaks using the voice
+	 * whose `voiceURI` matches the user/config setting (browser default when no
+	 * voice matches).
+	 *
+	 * @param content Text to speak.
+	 * @returns A promise that resolves about 100 ms after the utterance ends. It
+	 * also resolves if the utterance errors (for example when cancelled) or if
+	 * the overlay closes while waiting for voices, so it can never hang.
+	 * Resolves immediately when the overlay is not shown.
+	 */
+	const speakSpeechSynthesisHandler = (content) => {
+		if (!$showCallOverlay) {
+			return Promise.resolve();
+		}
+
+		return new Promise((resolve) => {
+			const getVoicesLoop = setInterval(() => {
+				// Stop polling once the call is over, otherwise the timer runs forever
+				// on platforms that never report any voice.
+				if (!$showCallOverlay) {
+					clearInterval(getVoicesLoop);
+					resolve();
+					return;
+				}
+
+				const voices = speechSynthesis.getVoices();
+				if (voices.length === 0) {
+					return;
+				}
+				clearInterval(getVoicesLoop);
+
+				const preferredVoiceURI = $settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice;
+				const voice = voices.find((v) => v.voiceURI === preferredVoiceURI);
+
+				const utterance = new SpeechSynthesisUtterance(content);
+				if (voice) {
+					utterance.voice = voice;
+				}
+
+				// Attach handlers before speaking so no event can be missed.
+				utterance.onend = async (e) => {
+					await new Promise((r) => setTimeout(r, 100));
+					resolve(e);
+				};
+				utterance.onerror = (e) => {
+					// Includes utterances cancelled by speechSynthesis.cancel().
+					console.error(e);
+					resolve(e);
+				};
+
+				currentUtterance = utterance;
+				speechSynthesis.speak(utterance);
+			}, 100);
+		});
+	};
+
+	/**
+	 * Plays one synthesized clip through the shared `#audioElement`.
+	 *
+	 * Playback starts muted and is unmuted once `play()` succeeds.
+	 *
+	 * @param audio Audio object whose `src` should be played.
+	 * @returns A promise that resolves about 100 ms after the clip ends. It also
+	 * resolves when the element is missing, when `play()` is rejected, or when
+	 * the element reports an error, so the caller's queue cannot stall on a
+	 * promise that never settles. Resolves immediately when the overlay is not
+	 * shown.
+	 */
+	const playAudio = (audio) => {
+		if (!$showCallOverlay) {
+			return Promise.resolve();
+		}
+
+		return new Promise((resolve) => {
+			const audioElement = document.getElementById('audioElement');
+
+			if (!audioElement) {
+				resolve();
+				return;
+			}
+
+			audioElement.onended = async (e) => {
+				await new Promise((r) => setTimeout(r, 100));
+				resolve(e);
+			};
+			audioElement.onerror = (e) => {
+				console.error(e);
+				resolve(e);
+			};
+
+			audioElement.src = audio.src;
+			audioElement.muted = true;
+
+			audioElement
+				.play()
+				.then(() => {
+					audioElement.muted = false;
+				})
+				.catch((error) => {
+					// Playback never started (blocked, aborted by pause(), bad source),
+					// so `ended` will not fire for this clip.
+					console.error(error);
+					resolve();
+				});
+		});
+	};
+
+	/**
+	 * Plays the next queued clip, if the assistant is idle, the reply has not
+	 * been interrupted and the queue is non-empty.
+	 *
+	 * `assistantSpeaking` is held true for the duration of the clip and is always
+	 * released afterwards, even if playback throws.
+	 */
+	const playAudioHandler = async () => {
+		console.log('playAudioHandler', audioQueue, assistantSpeaking, audioQueue.length > 0);
+		if (assistantSpeaking || interrupted || audioQueue.length === 0) {
+			return;
+		}
+
+		assistantSpeaking = true;
+		try {
+			const audioToPlay = audioQueue.shift(); // Take the clip out of the queue before playing.
+			audioQueue = audioQueue; // Self-assignment so Svelte sees the mutation.
+			await playAudio(audioToPlay);
+		} finally {
+			assistantSpeaking = false;
+		}
+	};
+
+	// In-order queueing state: which `assistantSentenceAudios` object the cursor
+	// belongs to, and the next sentence index that may be appended to the queue.
+	let orderedAudios = null;
+	let nextSentenceToQueue = 0;
+
+	/**
+	 * Synthesizes speech for one sentence and queues it for playback.
+	 *
+	 * Entry states in `assistantSentenceAudios[idx]`:
+	 * `undefined` = not requested, `null` = request pending,
+	 * `false` = synthesis failed, otherwise the Audio object.
+	 *
+	 * Clips are appended to `audioQueue` strictly in sentence order, regardless
+	 * of the order in which requests finish; failed sentences are skipped.
+	 * Results that arrive after the reply was interrupted, or after a new reply
+	 * replaced `assistantSentenceAudios`, are discarded.
+	 *
+	 * @param content Sentence text to synthesize.
+	 * @param idx Index of the sentence within the assistant reply.
+	 */
+	const setContentAudio = async (content, idx) => {
+		if (assistantSentenceAudios[idx] !== undefined) {
+			return;
+		}
+
+		// Identifies the reply this request belongs to; a new reply swaps in a
+		// fresh object.
+		const audios = assistantSentenceAudios;
+		assistantSentenceAudios[idx] = null;
+
+		const res = await synthesizeOpenAISpeech(
+			localStorage.token,
+			$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
+			content
+		).catch((error) => {
+			// `assistantSpeaking` is deliberately left alone: another clip may be
+			// playing, and clearing the flag would let a second clip start over it.
+			toast.error(error);
+			return null;
+		});
+
+		let audio = false;
+		if (res) {
+			const blob = await res.blob();
+			audio = new Audio(URL.createObjectURL(blob));
+		}
+
+		if (audios !== assistantSentenceAudios || interrupted) {
+			// Stale result: free the blob instead of queueing it.
+			if (audio) {
+				URL.revokeObjectURL(audio.src);
+			}
+			return;
+		}
+
+		assistantSentenceAudios[idx] = audio;
+
+		if (orderedAudios !== audios) {
+			orderedAudios = audios;
+			nextSentenceToQueue = 0;
+		}
+
+		// Flush every consecutive finished sentence; stop at the first one that is
+		// still pending (null) or not yet requested (undefined).
+		while (assistantSentenceAudios[nextSentenceToQueue] != null) {
+			const ready = assistantSentenceAudios[nextSentenceToQueue];
+			if (ready) {
+				audioQueue.push(ready);
+			}
+			nextSentenceToQueue += 1;
+		}
+		audioQueue = audioQueue; // Self-assignment so Svelte sees the mutation.
+	};
+
+	/**
+	 * Runs after a recording stops (or when the overlay closes).
+	 *
+	 * Always resets `audioChunks` and `mediaRecorder`. While the overlay is open
+	 * it optionally starts the next recording straight away and, if the
+	 * utterance was confirmed, attaches a camera screenshot (when a camera
+	 * stream exists) and sends the recorded audio for transcription.
+	 *
+	 * @param _continue Whether to start a new recording immediately (default true).
+	 */
+	const stopRecordingCallback = async (_continue = true) => {
+		console.log('%c%s', 'color: red; font-size: 20px;', '🚨 stopRecordingCallback 🚨');
+
+		const overlayOpen = $showCallOverlay;
+
+		// Shallow copy of the chunk list, taken before the shared array is reset so
+		// the next recording cannot overwrite this utterance.
+		const recordedChunks = overlayOpen ? [...audioChunks] : [];
+
+		audioChunks = [];
+		mediaRecorder = false;
+
+		if (!overlayOpen) {
+			return;
+		}
+
+		if (_continue) {
+			startRecording();
+		}
+
+		if (!confirmed) {
+			return;
+		}
+
+		loading = true;
+		emoji = null;
+
+		if (cameraStream) {
+			files = [{ type: 'image', url: takeScreenshot() }];
+		}
+
+		await transcribeHandler(new Blob(recordedChunks, { type: 'audio/wav' }));
+
+		confirmed = false;
+		loading = false;
+	};
+
+	/**
+	 * Requests microphone access and starts a new MediaRecorder session.
+	 *
+	 * Chunks are only kept once speech has been detected (`hasStartedSpeaking`);
+	 * when the recorder stops, `stopRecordingCallback()` takes over.
+	 *
+	 * A failed or denied microphone request is reported as a toast instead of
+	 * surfacing as an unhandled rejection. If the overlay was closed while the
+	 * request was pending, the stream is released and no recording starts.
+	 */
+	const startRecording = async () => {
+		let stream;
+		try {
+			stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+		} catch (error) {
+			console.error(error);
+			toast.error(`${error}`);
+			return;
+		}
+
+		if (!$showCallOverlay) {
+			// Call ended while waiting for the microphone: do not keep it open.
+			stream.getTracks().forEach((track) => track.stop());
+			return;
+		}
+
+		mediaRecorder = new MediaRecorder(stream);
+		mediaRecorder.onstart = () => {
+			console.log('Recording started');
+			audioChunks = [];
+			analyseAudio(stream);
+		};
+		mediaRecorder.ondataavailable = (event) => {
+			if (hasStartedSpeaking) {
+				audioChunks.push(event.data);
+			}
+		};
+		mediaRecorder.onstop = async () => {
+			console.log('Recording stopped');
+			await stopRecordingCallback();
+		};
+		mediaRecorder.start();
+	};
+
 	$: if ($showCallOverlay) {
 		startRecording();
 	} else {
@ -483,30 +456,73 @@
 		stopRecordingCallback(false);
 	}

+	// Reactive trigger: whenever `audioQueue` or `assistantSpeaking` changes and
+	// a clip is waiting while the assistant is idle, start the next clip.
+	$: {
+		if (audioQueue.length > 0 && !assistantSpeaking) {
+			playAudioHandler();
+		}
+	}
+
 	onMount(() => {
 		console.log(eventTarget);

+		// A new assistant reply begins: reset all per-reply speech state.
 		eventTarget.addEventListener('chat:start', async (e) => {
-			console.log('Chat start event:', e.detail);
-			message = '';
+			console.log('Chat start event:', e);
+			interrupted = false;
+
+			assistantMessage = '';
+			assistantSentenceIdx = -1;
+			assistantSentenceAudios = {}; // Reset audio tracking
+			audioQueue = []; // Clear the audio queue
+
+			chatStreaming = true;
 		});

+		// A chunk of the reply arrived: accumulate it and, with a server-side TTS
+		// engine, synthesize the newest sentence that is already complete.
 		eventTarget.addEventListener('chat', async (e) => {
 			const { content } = e.detail;
+			assistantMessage += content;
+			await tick();

-			message += content;
-			console.log('Chat event:', message);
+			if (!interrupted) {
+				// An unset engine means browser speech synthesis, same as ''.
+				if (($config.audio.tts.engine ?? '') !== '') {
+					// The last sentence may still be growing, so take the one before it.
+					assistantSentenceIdx = assistantSentences.length - 2;
+
+					if (assistantSentenceIdx >= 0 && !assistantSentenceAudios[assistantSentenceIdx]) {
+						await tick();
+						setContentAudio(assistantSentences[assistantSentenceIdx], assistantSentenceIdx);
+					}
+				}
+			}
+
+			chatStreaming = true;
 		});

+		// The reply is complete: synthesize any remaining sentences, or speak the
+		// whole reply with browser speech synthesis when no engine is configured.
 		eventTarget.addEventListener('chat:finish', async (e) => {
-			console.log('Chat finish event:', e.detail);
-			message = '';
+			chatStreaming = false;
+			loading = false;
+
+			console.log('Chat finish event:', e);
+			await tick();
+
+			if (!interrupted) {
+				if (($config.audio.tts.engine ?? '') !== '') {
+					for (const [idx, sentence] of assistantSentences.entries()) {
+						if (!assistantSentenceAudios[idx]) {
+							await tick();
+							setContentAudio(sentence, idx);
+						}
+					}
+				} else {
+					// Start speaking first; the emoji lookup must not delay speech.
+					speakSpeechSynthesisHandler(assistantMessage);
+
+					// generateEmoji is asynchronous: assigning its promise to `emoji`
+					// would display a Promise object instead of an emoji.
+					// NOTE(review): the previous implementation also gated this on
+					// $settings.showEmojiInCall — confirm whether that gate should return.
+					if (modelId) {
+						const res = await generateEmoji(localStorage.token, modelId, assistantMessage).catch(
+							(error) => {
+								console.error(error);
+								return null;
+							}
+						);
+
+						const match = typeof res === 'string' ? res.match(/\p{Extended_Pictographic}/u) : null;
+						if (match) {
+							emoji = match[0];
+						}
+					}
+				}
+			}
 		});
 	});
 </script>

+<audio id="audioElement" src="" style="display: none;" />
+
 {#if $showCallOverlay}
-	<audio id="audioElement" src="" style="display: none;" />
 	<div class=" absolute w-full h-screen max-h-[100dvh] flex z-[999] overflow-hidden">
 		<div
 			class="absolute w-full h-screen max-h-[100dvh] bg-white text-gray-700 dark:bg-black dark:text-gray-300 flex justify-center"