diff --git a/backend/main.py b/backend/main.py
index 9de4d7111..235ed421e 100644
--- a/backend/main.py
+++ b/backend/main.py
@@ -887,7 +887,7 @@ async def generate_emoji(form_data: dict, user=Depends(get_verified_user)):
     model = app.state.MODELS[model_id]
 
     template = '''
-You are a perceptive assistant skilled at interpreting emotions from a provided message. Your task is to reflect the speaker's likely facial expression through a fitting emoji. Prioritize using diverse facial expression emojis to convey the nuanced emotions expressed in the text. Please avoid using generic or overly ambiguous emojis like "🤔", and instead, choose ones that vividly represent the speaker's mood or reaction.
+You are a perceptive assistant skilled at interpreting emotions from a provided message. Your task is to reflect the speaker's likely facial expression through a fitting emoji. Prioritize using diverse facial expression emojis to convey the nuanced emotions expressed in the text. Please choose ones that vividly represent the speaker's mood or reaction.
 
 Message: """{{prompt}}"""
 '''
diff --git a/src/lib/components/chat/Chat.svelte b/src/lib/components/chat/Chat.svelte
index aa1462ff4..44a221ba6 100644
--- a/src/lib/components/chat/Chat.svelte
+++ b/src/lib/components/chat/Chat.svelte
@@ -1209,6 +1209,7 @@ {
+		const lastIndex = mergedTexts.length - 1;
+		if (lastIndex >= 0) {
+			const previousText = mergedTexts[lastIndex];
+			const wordCount = previousText.split(/\s+/).length;
+			if (wordCount < 2) {
+				mergedTexts[lastIndex] = previousText + ' ' + currentText;
+			} else {
+				mergedTexts.push(currentText);
+			}
+		} else {
+			mergedTexts.push(currentText);
+		}
+		return mergedTexts;
+	}, []);
 	let currentUtterance = null;
+	let rmsLevel = 0;
+	let hasStartedSpeaking = false;
 	let mediaRecorder;
 	let audioChunks = [];
-	const MIN_DECIBELS = -45;
-	const VISUALIZER_BUFFER_LENGTH = 300;
-
-	// Function to calculate the RMS level from time domain data
-	const calculateRMS = (data: Uint8Array) => {
-		let sumSquares = 0;
-		for (let i = 0; i < data.length; i++) {
-			const normalizedValue = (data[i] - 128) / 128; // Normalize the data
-			sumSquares += normalizedValue * normalizedValue;
-		}
-		return Math.sqrt(sumSquares / data.length);
-	};
-
-	const normalizeRMS = (rms) => {
-		rms = rms * 10;
-		const exp = 1.5; // Adjust exponent value; values greater than 1 expand larger numbers more and compress smaller numbers more
-		const scaledRMS = Math.pow(rms, exp);
-
-		// Scale between 0.01 (1%) and 1.0 (100%)
-		return Math.min(1.0, Math.max(0.01, scaledRMS));
-	};
-
-	const analyseAudio = (stream) => {
-		const audioContext = new AudioContext();
-		const audioStreamSource = audioContext.createMediaStreamSource(stream);
-
-		const analyser = audioContext.createAnalyser();
-		analyser.minDecibels = MIN_DECIBELS;
-		audioStreamSource.connect(analyser);
-
-		const bufferLength = analyser.frequencyBinCount;
-
-		const domainData = new Uint8Array(bufferLength);
-		const timeDomainData = new Uint8Array(analyser.fftSize);
-
-		let lastSoundTime = Date.now();
-		hasStartedSpeaking = false;
-
-		const detectSound = () => {
-			const processFrame = () => {
-				if (!mediaRecorder || !$showCallOverlay) {
-					if (mediaRecorder) {
-						mediaRecorder.stop();
-					}
-
-					return;
-				}
-				analyser.getByteTimeDomainData(timeDomainData);
-				analyser.getByteFrequencyData(domainData);
-
-				// Calculate RMS level from time domain data
-				rmsLevel = calculateRMS(timeDomainData);
-
-				// Check if initial speech/noise has started
-				const hasSound = domainData.some((value) => value > 0);
-				if (hasSound) {
-					stopAllAudio();
-					hasStartedSpeaking = true;
-					lastSoundTime = Date.now();
-				}
-
-				// Start silence detection only after initial speech/noise has been detected
-				if (hasStartedSpeaking) {
-					if (Date.now() - lastSoundTime > 2000) {
-						confirmed = true;
-
-						if (mediaRecorder) {
-							mediaRecorder.stop();
-						}
-					}
-				}
-
-				window.requestAnimationFrame(processFrame);
-			};
-
-			window.requestAnimationFrame(processFrame);
-		};
-
-		detectSound();
-	};
-
-	const stopAllAudio = () => {
-		if (currentUtterance) {
-			speechSynthesis.cancel();
-			currentUtterance = null;
-		}
-		if (assistantAudio[assistantAudioIdx]) {
-			assistantAudio[assistantAudioIdx].pause();
-			assistantAudio[assistantAudioIdx].currentTime = 0;
-		}
-
-		const audioElement = document.getElementById('audioElement');
-
-		if (audioElement) {
-			audioElement.pause();
-			audioElement.currentTime = 0;
-		}
-		assistantSpeaking = false;
-	};
-
-	const playAudio = (idx) => {
-		if ($showCallOverlay) {
-			return new Promise((res) => {
-				assistantAudioIdx = idx;
-				const audioElement = document.getElementById('audioElement');
-				const audio = assistantAudio[idx];
-
-				if (audioElement) {
-					audioElement.src = audio.src; // Assume `assistantAudio` has objects with a `src` property
-
-					audioElement.muted = true;
-
-					audioElement
-						.play()
-						.then(() => {
-							audioElement.muted = false;
-						})
-						.catch((error) => {
-							toast.error(error);
-						});
-
-					audioElement.onended = async (e) => {
-						await new Promise((r) => setTimeout(r, 300));
-
-						if (Object.keys(assistantAudio).length - 1 === idx) {
-							assistantSpeaking = false;
-						}
-
-						res(e);
-					};
-				}
-			});
-		} else {
-			return Promise.resolve();
-		}
-	};
-
-	const getOpenAISpeech = async (text) => {
-		const res = await synthesizeOpenAISpeech(
-			localStorage.token,
-			$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
-			text
-		).catch((error) => {
-			toast.error(error);
-			assistantSpeaking = false;
-			return null;
-		});
-
-		if (res) {
-			const blob = await res.blob();
-			const blobUrl = URL.createObjectURL(blob);
-			const audio = new Audio(blobUrl);
-			assistantAudio = audio;
-		}
-	};
-
-	const transcribeHandler = async (audioBlob) => {
-		// Create a blob from the audio chunks
-
-		await tick();
-		const file = blobToFile(audioBlob, 'recording.wav');
-
-		const res = await transcribeAudio(localStorage.token, file).catch((error) => {
-			toast.error(error);
-			return null;
-		});
-
-		if (res) {
-			console.log(res.text);
-
-			if (res.text !== '') {
-				const _responses = await submitPrompt(res.text, { _raw: true });
-				console.log(_responses);
-			}
-		}
-	};
-
-	const assistantSpeakingHandler = async (content) => {
-		assistantSpeaking = true;
-
-		if (modelId && ($settings?.showEmojiInCall ?? false)) {
-			console.log('Generating emoji');
-			const res = await generateEmoji(localStorage.token, modelId, content, chatId).catch(
-				(error) => {
-					console.error(error);
-					return null;
-				}
-			);
-
-			if (res) {
-				console.log(res);
-				if (/\p{Extended_Pictographic}/u.test(res)) {
-					emoji = res.match(/\p{Extended_Pictographic}/gu)[0];
-				}
-			}
-		}
-
-		if (($config.audio.tts.engine ?? '') == '') {
-			let voices = [];
-			const getVoicesLoop = setInterval(async () => {
-				voices = await speechSynthesis.getVoices();
-				if (voices.length > 0) {
-					clearInterval(getVoicesLoop);
-
-					const voice =
-						voices
-							?.filter(
-								(v) => v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
-							)
-							?.at(0) ?? undefined;
-
-					currentUtterance = new SpeechSynthesisUtterance(content);
-
-					if (voice) {
-						currentUtterance.voice = voice;
-					}
-
-					speechSynthesis.speak(currentUtterance);
-
-					currentUtterance.onend = async () => {
-						assistantSpeaking = false;
-					};
-				}
-			}, 100);
-		} else if ($config.audio.tts.engine === 'openai') {
-			console.log('openai');
-
-			const sentences = extractSentences(content).reduce((mergedTexts, currentText) => {
-				const lastIndex = mergedTexts.length - 1;
-				if (lastIndex >= 0) {
-					const previousText = mergedTexts[lastIndex];
-					const wordCount = previousText.split(/\s+/).length;
-					if (wordCount < 2) {
-						mergedTexts[lastIndex] = previousText + ' ' + currentText;
-					} else {
-						mergedTexts.push(currentText);
-					}
-				} else {
-					mergedTexts.push(currentText);
-				}
-				return mergedTexts;
-			}, []);
-
-			console.log(sentences);
-
-			let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately
-
-			for (const [idx, sentence] of sentences.entries()) {
-				const res = await synthesizeOpenAISpeech(
-					localStorage.token,
-					$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
-					sentence
-				).catch((error) => {
-					toast.error(error);
-
-					assistantSpeaking = false;
-					return null;
-				});
-
-				if (res) {
-					const blob = await res.blob();
-					const blobUrl = URL.createObjectURL(blob);
-					const audio = new Audio(blobUrl);
-					assistantAudio[idx] = audio;
-					lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx));
-
-					if (idx === sentences.length - 1) {
-						lastPlayedAudioPromise.then(() => {
-							assistantSpeaking = false;
-						});
-					}
-				}
-			}
-		}
-	};
-
-	const stopRecordingCallback = async (_continue = true) => {
-		if ($showCallOverlay) {
-			if (confirmed) {
-				loading = true;
-				emoji = null;
-
-				if (cameraStream) {
-					const imageUrl = takeScreenshot();
-
-					files = [
-						{
-							type: 'image',
-							url: imageUrl
-						}
-					];
-				}
-
-				const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
-				await transcribeHandler(audioBlob);
-
-				confirmed = false;
-				loading = false;
-			}
-			audioChunks = [];
-			mediaRecorder = false;
-
-			if (_continue) {
-				startRecording();
-			}
-		} else {
-			audioChunks = [];
-			mediaRecorder = false;
-		}
-	};
-
-	const startRecording = async () => {
-		const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
-		mediaRecorder = new MediaRecorder(stream);
-		mediaRecorder.onstart = () => {
-			console.log('Recording started');
-			audioChunks = [];
-			analyseAudio(stream);
-		};
-		mediaRecorder.ondataavailable = (event) => {
-			if (hasStartedSpeaking) {
-				audioChunks.push(event.data);
-			}
-		};
-		mediaRecorder.onstop = async () => {
-			console.log('Recording stopped');
-
-			await stopRecordingCallback();
-		};
-		mediaRecorder.start();
-	};
+	$: console.log('hasStartedSpeaking', hasStartedSpeaking);
 
 	let videoInputDevices = [];
 	let selectedVideoInputDeviceId = null;
@@ -475,6 +168,286 @@
 		camera = false;
 	};
+	const MIN_DECIBELS = -45;
+	const VISUALIZER_BUFFER_LENGTH = 300;
+
+	// Function to calculate the RMS level from time domain data
+	const calculateRMS = (data: Uint8Array) => {
+		let sumSquares = 0;
+		for (let i = 0; i < data.length; i++) {
+			const normalizedValue = (data[i] - 128) / 128; // Normalize the data
+			sumSquares += normalizedValue * normalizedValue;
+		}
+		return Math.sqrt(sumSquares / data.length);
+	};
+
+	const analyseAudio = (stream) => {
+		const audioContext = new AudioContext();
+		const audioStreamSource = audioContext.createMediaStreamSource(stream);
+
+		const analyser = audioContext.createAnalyser();
+		analyser.minDecibels = MIN_DECIBELS;
+		audioStreamSource.connect(analyser);
+
+		const bufferLength = analyser.frequencyBinCount;
+
+		const domainData = new Uint8Array(bufferLength);
+		const timeDomainData = new Uint8Array(analyser.fftSize);
+
+		let lastSoundTime = Date.now();
+		hasStartedSpeaking = false;
+
+		const detectSound = () => {
+			const processFrame = () => {
+				if (!mediaRecorder || !$showCallOverlay) {
+					return;
+				}
+
+				analyser.getByteTimeDomainData(timeDomainData);
+				analyser.getByteFrequencyData(domainData);
+
+				// Calculate RMS level from time domain data
+				rmsLevel = calculateRMS(timeDomainData);
+
+				// Check if initial speech/noise has started
+				const hasSound = domainData.some((value) => value > 0);
+				if (hasSound) {
+					hasStartedSpeaking = true;
+					lastSoundTime = Date.now();
+
+					// BIG RED TEXT
+					console.log('%c%s', 'color: red; font-size: 20px;', '🔊 Sound detected');
+					stopAllAudio();
+				}
+
+				// Start silence detection only after initial speech/noise has been detected
+				if (hasStartedSpeaking) {
+					if (Date.now() - lastSoundTime > 2000) {
+						confirmed = true;
+
+						if (mediaRecorder) {
+							mediaRecorder.stop();
+						}
+					}
+				}
+
+				window.requestAnimationFrame(processFrame);
+			};
+
+			window.requestAnimationFrame(processFrame);
+		};
+
+		detectSound();
+	};
+
+	const transcribeHandler = async (audioBlob) => {
+		// Create a blob from the audio chunks
+
+		await tick();
+		const file = blobToFile(audioBlob, 'recording.wav');
+
+		const res = await transcribeAudio(localStorage.token, file).catch((error) => {
+			toast.error(error);
+			return null;
+		});
+
+		if (res) {
+			console.log(res.text);
+
+			if (res.text !== '') {
+				const _responses = await submitPrompt(res.text, { _raw: true });
+				console.log(_responses);
+			}
+		}
+	};
+
+	const stopAllAudio = async () => {
+		interrupted = true;
+
+		if (chatStreaming) {
+			stopResponse();
+		}
+
+		if (currentUtterance) {
+			speechSynthesis.cancel();
+			currentUtterance = null;
+		}
+
+		await tick();
+		audioQueue = [];
+		await tick();
+
+		const audioElement = document.getElementById('audioElement');
+		if (audioElement) {
+			audioElement.pause();
+			audioElement.currentTime = 0;
+		}
+
+		assistantSpeaking = false;
+	};
+
+	const speakSpeechSynthesisHandler = (content) => {
+		if ($showCallOverlay) {
+			return new Promise((resolve) => {
+				let voices = [];
+				const getVoicesLoop = setInterval(async () => {
+					voices = await speechSynthesis.getVoices();
+					if (voices.length > 0) {
+						clearInterval(getVoicesLoop);
+
+						const voice =
+							voices
+								?.filter(
+									(v) => v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
+								)
+								?.at(0) ?? undefined;
+
+						currentUtterance = new SpeechSynthesisUtterance(content);
+
+						if (voice) {
+							currentUtterance.voice = voice;
+						}
+
+						speechSynthesis.speak(currentUtterance);
+						currentUtterance.onend = async (e) => {
+							await new Promise((r) => setTimeout(r, 100));
+							resolve(e);
+						};
+					}
+				}, 100);
+			});
+		} else {
+			return Promise.resolve();
+		}
+	};
+
+	const playAudio = (audio) => {
+		if ($showCallOverlay) {
+			return new Promise((resolve) => {
+				const audioElement = document.getElementById('audioElement');
+
+				if (audioElement) {
+					audioElement.src = audio.src;
+					audioElement.muted = true;
+
+					audioElement
+						.play()
+						.then(() => {
+							audioElement.muted = false;
+						})
+						.catch((error) => {
+							console.error(error);
+						});
+
+					audioElement.onended = async (e) => {
+						await new Promise((r) => setTimeout(r, 100));
+						resolve(e);
+					};
+				}
+			});
+		} else {
+			return Promise.resolve();
+		}
+	};
+
+	const playAudioHandler = async () => {
+		console.log('playAudioHandler', audioQueue, assistantSpeaking, audioQueue.length > 0);
+		if (!assistantSpeaking && !interrupted && audioQueue.length > 0) {
+			assistantSpeaking = true;
+			const audioToPlay = audioQueue.shift(); // Shift the audio out from queue before playing.
+			audioQueue = audioQueue;
+			await playAudio(audioToPlay);
+			assistantSpeaking = false;
+		}
+	};
+
+	const setContentAudio = async (content, idx) => {
+		if (assistantSentenceAudios[idx] === undefined) {
+			console.log('%c%s', 'color: red; font-size: 20px;', content);
+
+			assistantSentenceAudios[idx] = null;
+			const res = await synthesizeOpenAISpeech(
+				localStorage.token,
+				$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
+				content
+			).catch((error) => {
+				toast.error(error);
+				assistantSpeaking = false;
+				return null;
+			});
+
+			if (res) {
+				const blob = await res.blob();
+				const blobUrl = URL.createObjectURL(blob);
+				const audio = new Audio(blobUrl);
+				assistantSentenceAudios[idx] = audio;
+				audioQueue.push(audio);
+				audioQueue = audioQueue;
+			}
+		}
+	};
+
+	const stopRecordingCallback = async (_continue = true) => {
+		console.log('%c%s', 'color: red; font-size: 20px;', '🚨 stopRecordingCallback 🚨');
+
+		if ($showCallOverlay) {
+			// deep copy the audioChunks array
+			const _audioChunks = audioChunks.slice(0);
+
+			audioChunks = [];
+			mediaRecorder = false;
+
+			if (_continue) {
+				startRecording();
+			}
+
+			if (confirmed) {
+				loading = true;
+				emoji = null;
+
+				if (cameraStream) {
+					const imageUrl = takeScreenshot();
+
+					files = [
+						{
+							type: 'image',
+							url: imageUrl
+						}
+					];
+				}
+
+				const audioBlob = new Blob(_audioChunks, { type: 'audio/wav' });
+				await transcribeHandler(audioBlob);
+
+				confirmed = false;
+				loading = false;
+			}
+		} else {
+			audioChunks = [];
+			mediaRecorder = false;
+		}
+	};
+
+	const startRecording = async () => {
+		const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+		mediaRecorder = new MediaRecorder(stream);
+		mediaRecorder.onstart = () => {
+			console.log('Recording started');
+			audioChunks = [];
+			analyseAudio(stream);
+		};
+		mediaRecorder.ondataavailable = (event) => {
+			if (hasStartedSpeaking) {
+				audioChunks.push(event.data);
+			}
+		};
+		mediaRecorder.onstop = async () => {
+			console.log('Recording stopped');
+			await stopRecordingCallback();
+		};
+		mediaRecorder.start();
+	};
+
 	$: if ($showCallOverlay) {
 		startRecording();
 	} else {
@@ -483,30 +456,73 @@
 		stopRecordingCallback(false);
 	}
 
+	$: {
+		if (audioQueue.length > 0 && !assistantSpeaking) {
+			playAudioHandler();
+		}
+	}
+
 	onMount(() => {
 		console.log(eventTarget);
 
 		eventTarget.addEventListener('chat:start', async (e) => {
-			console.log('Chat start event:', e.detail);
-			message = '';
+			console.log('Chat start event:', e);
+			interrupted = false;
+
+			assistantMessage = '';
+			assistantSentenceIdx = -1;
+			assistantSentenceAudios = {}; // Reset audio tracking
+			audioQueue = []; // Clear the audio queue
+
+			chatStreaming = true;
 		});
 
 		eventTarget.addEventListener('chat', async (e) => {
 			const { content } = e.detail;
+			assistantMessage += content;
+			await tick();
 
-			message += content;
-			console.log('Chat event:', message);
+			if (!interrupted) {
+				if ($config.audio.tts.engine !== '') {
+					assistantSentenceIdx = assistantSentences.length - 2;
+
+					if (assistantSentenceIdx >= 0 && !assistantSentenceAudios[assistantSentenceIdx]) {
+						await tick();
+						setContentAudio(assistantSentences[assistantSentenceIdx], assistantSentenceIdx);
+					}
+				}
+			}
+
+			chatStreaming = true;
 		});
 
 		eventTarget.addEventListener('chat:finish', async (e) => {
-			console.log('Chat finish event:', e.detail);
-			message = '';
+			chatStreaming = false;
+			loading = false;
+
+			console.log('Chat finish event:', e);
+			await tick();
+
+			if (!interrupted) {
+				if ($config.audio.tts.engine !== '') {
+					for (const [idx, sentence] of assistantSentences.entries()) {
+						if (!assistantSentenceAudios[idx]) {
+							await tick();
+							setContentAudio(sentence, idx);
+						}
+					}
+				} else {
+					emoji = generateEmoji(localStorage.token, modelId, assistantMessage);
+					speakSpeechSynthesisHandler(assistantMessage);
+				}
+			}
 		});
 	});
+