diff --git a/backend/main.py b/backend/main.py index 235ed421e..e42c4ed9c 100644 --- a/backend/main.py +++ b/backend/main.py @@ -887,7 +887,7 @@ async def generate_emoji(form_data: dict, user=Depends(get_verified_user)): model = app.state.MODELS[model_id] template = ''' -You are a perceptive assistant skilled at interpreting emotions from a provided message. Your task is to reflect the speaker's likely facial expression through a fitting emoji. Prioritize using diverse facial expression emojis to convey the nuanced emotions expressed in the text. Please choose ones that vividly represent the speaker's mood or reaction. +Your task is to reflect the speaker's likely facial expression through a fitting emoji. Interpret emotions from the message and reflect their facial expression using fitting, diverse emojis (e.g., 😊, 😢, 😡, 😱). Message: """{{prompt}}""" ''' diff --git a/src/lib/components/chat/MessageInput/CallOverlay.svelte b/src/lib/components/chat/MessageInput/CallOverlay.svelte index 43b4f1308..615c896b2 100644 --- a/src/lib/components/chat/MessageInput/CallOverlay.svelte +++ b/src/lib/components/chat/MessageInput/CallOverlay.svelte @@ -41,6 +41,7 @@ let assistantSentenceIdx = -1; let audioQueue = []; + let emojiQueue = []; $: assistantSentences = extractSentences(assistantMessage).reduce((mergedTexts, currentText) => { const lastIndex = mergedTexts.length - 1; @@ -65,8 +66,6 @@ let mediaRecorder; let audioChunks = []; - $: console.log('hasStartedSpeaking', hasStartedSpeaking); - let videoInputDevices = []; let selectedVideoInputDeviceId = null; @@ -274,6 +273,7 @@ } await tick(); + emojiQueue = []; audioQueue = []; await tick(); @@ -354,6 +354,14 @@ console.log('playAudioHandler', audioQueue, assistantSpeaking, audioQueue.length > 0); if (!assistantSpeaking && !interrupted && audioQueue.length > 0) { assistantSpeaking = true; + + if ($settings?.showEmojiInCall ?? false) { + if (emojiQueue.length > 0) { + emoji = emojiQueue.shift(); + emojiQueue = emojiQueue; + } + } + const audioToPlay = audioQueue.shift(); // Shift the audio out from queue before playing. audioQueue = audioQueue; await playAudio(audioToPlay); @@ -363,9 +371,39 @@ const setContentAudio = async (content, idx) => { if (assistantSentenceAudios[idx] === undefined) { - console.log('%c%s', 'color: red; font-size: 20px;', content); + // Wait for the previous audio to be loaded + if (idx > 0) { + await new Promise((resolve) => { + const check = setInterval(() => { + if ( + assistantSentenceAudios[idx - 1] !== undefined && + assistantSentenceAudios[idx - 1] !== null + ) { + clearInterval(check); + resolve(); + } + }, 100); + }); + } assistantSentenceAudios[idx] = null; + + if ($settings?.showEmojiInCall ?? false) { + const sentenceEmoji = await generateEmoji(localStorage.token, modelId, content); + + if (sentenceEmoji) { + // Big red text with content and emoji + console.log('%c%s', 'color: blue; font-size: 10px;', `${sentenceEmoji}: ${content}`); + + if (/\p{Extended_Pictographic}/u.test(sentenceEmoji)) { + emojiQueue.push(sentenceEmoji.match(/\p{Extended_Pictographic}/gu)[0]); + emojiQueue = emojiQueue; + } + } + + await tick(); + } + const res = await synthesizeOpenAISpeech( localStorage.token, $settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice, @@ -381,6 +419,9 @@ const blobUrl = URL.createObjectURL(blob); const audio = new Audio(blobUrl); assistantSentenceAudios[idx] = audio; + + console.log('%c%s', 'color: red; font-size: 20px;', content); + audioQueue.push(audio); audioQueue = audioQueue; } @@ -388,9 +429,9 @@ }; const stopRecordingCallback = async (_continue = true) => { - console.log('%c%s', 'color: red; font-size: 20px;', '🚨 stopRecordingCallback 🚨'); - if ($showCallOverlay) { + console.log('%c%s', 'color: red; font-size: 20px;', '🚨 stopRecordingCallback 🚨'); + // deep copy the audioChunks array const _audioChunks = audioChunks.slice(0); @@ -448,13 +489,31 @@ mediaRecorder.start(); }; - $: if ($showCallOverlay) { - startRecording(); - } else { - stopCamera(); - stopAllAudio(); - stopRecordingCallback(false); - } + const resetAssistantMessage = async () => { + interrupted = false; + + assistantMessage = ''; + assistantSentenceIdx = -1; + assistantSentenceAudios = {}; // Reset audio tracking + audioQueue = []; // Clear the audio queue + audioQueue = audioQueue; + + emoji = null; + emojiQueue = []; + emojiQueue = emojiQueue; + }; + + $: (async () => { + if ($showCallOverlay) { + await resetAssistantMessage(); + await tick(); + startRecording(); + } else { + stopCamera(); + stopAllAudio(); + stopRecordingCallback(false); + } + })(); $: { if (audioQueue.length > 0 && !assistantSpeaking) { @@ -463,57 +522,66 @@ } onMount(() => { - console.log(eventTarget); - eventTarget.addEventListener('chat:start', async (e) => { - console.log('Chat start event:', e); - interrupted = false; - - assistantMessage = ''; - assistantSentenceIdx = -1; - assistantSentenceAudios = {}; // Reset audio tracking - audioQueue = []; // Clear the audio queue - - chatStreaming = true; + if ($showCallOverlay) { + console.log('Chat start event:', e); + await resetAssistantMessage(); + await tick(); + chatStreaming = true; + } }); eventTarget.addEventListener('chat', async (e) => { - const { content } = e.detail; - assistantMessage += content; - await tick(); + if ($showCallOverlay) { + const { content } = e.detail; + assistantMessage += content; + await tick(); - if (!interrupted) { - if ($config.audio.tts.engine !== '') { - assistantSentenceIdx = assistantSentences.length - 2; + if (!interrupted) { + if ($config.audio.tts.engine !== '') { + assistantSentenceIdx = assistantSentences.length - 2; - if (assistantSentenceIdx >= 0 && !assistantSentenceAudios[assistantSentenceIdx]) { - await tick(); - setContentAudio(assistantSentences[assistantSentenceIdx], assistantSentenceIdx); + if (assistantSentenceIdx >= 0 && !assistantSentenceAudios[assistantSentenceIdx]) { + await tick(); + setContentAudio(assistantSentences[assistantSentenceIdx], assistantSentenceIdx); + } } } - } - chatStreaming = true; + chatStreaming = true; + } }); eventTarget.addEventListener('chat:finish', async (e) => { - chatStreaming = false; - loading = false; + if ($showCallOverlay) { + chatStreaming = false; + loading = false; - console.log('Chat finish event:', e); - await tick(); + console.log('Chat finish event:', e); + await tick(); - if (!interrupted) { - if ($config.audio.tts.engine !== '') { - for (const [idx, sentence] of assistantSentences.entries()) { - if (!assistantSentenceAudios[idx]) { - await tick(); - setContentAudio(sentence, idx); + if (!interrupted) { + if ($config.audio.tts.engine !== '') { + for (const [idx, sentence] of assistantSentences.entries()) { + if (!assistantSentenceAudios[idx]) { + await tick(); + setContentAudio(sentence, idx); + } } + } else { + if ($settings?.showEmojiInCall ?? false) { + const res = await generateEmoji(localStorage.token, modelId, assistantMessage); + + if (res) { + console.log(res); + if (/\p{Extended_Pictographic}/u.test(res)) { + emoji = res.match(/\p{Extended_Pictographic}/gu)[0]; + } + } + } + + speakSpeechSynthesisHandler(assistantMessage); } - } else { - emoji = generateEmoji(localStorage.token, modelId, assistantMessage); - speakSpeechSynthesisHandler(assistantMessage); } } }); @@ -529,7 +597,7 @@ >