Timothy J. Baek 2024-06-13 01:28:15 -07:00
parent d6fd2a8228
commit 5300d2c531
3 changed files with 362 additions and 345 deletions

View File

@@ -887,7 +887,7 @@ async def generate_emoji(form_data: dict, user=Depends(get_verified_user)):
model = app.state.MODELS[model_id]
template = '''
-You are a perceptive assistant skilled at interpreting emotions from a provided message. Your task is to reflect the speaker's likely facial expression through a fitting emoji. Prioritize using diverse facial expression emojis to convey the nuanced emotions expressed in the text. Please avoid using generic or overly ambiguous emojis like "🤔", and instead, choose ones that vividly represent the speaker's mood or reaction.
+You are a perceptive assistant skilled at interpreting emotions from a provided message. Your task is to reflect the speaker's likely facial expression through a fitting emoji. Prioritize using diverse facial expression emojis to convey the nuanced emotions expressed in the text. Please choose ones that vividly represent the speaker's mood or reaction.
Message: """{{prompt}}"""
'''
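Editor's note, not part of the commit: the template above leaves a {{prompt}} placeholder that the backend fills with the user's message before asking the model for an emoji. A minimal sketch of that substitution, written in TypeScript for consistency with the frontend code below; the helper name is hypothetical and not taken from the repository.

// Hypothetical sketch of filling a "{{prompt}}" placeholder template.
const emojiTemplate = `You are a perceptive assistant skilled at interpreting emotions from a provided message.
Message: """{{prompt}}"""`;

const renderTemplate = (template: string, prompt: string): string =>
	// Replace every occurrence of the placeholder with the actual message text.
	template.replaceAll('{{prompt}}', prompt);

console.log(renderTemplate(emojiTemplate, 'We finally shipped the release!'));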

View File

@@ -1209,6 +1209,7 @@
<CallOverlay
{submitPrompt}
{stopResponse}
bind:files
modelId={selectedModelIds?.at(0) ?? null}
chatId={$chatId}
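Editor's note, not part of the commit: this hunk only wires an extra stopResponse callback into <CallOverlay>, so the overlay can abort a streaming completion when the user interrupts. A rough sketch of the prop surface the component now expects; the TypeScript types are assumptions for illustration, not declarations from the repository.

// Assumed prop shape for <CallOverlay>; illustrative only.
interface CallOverlayProps {
	eventTarget: EventTarget; // emits 'chat:start', 'chat', and 'chat:finish'
	submitPrompt: (prompt: string, options?: { _raw?: boolean }) => Promise<unknown>;
	stopResponse: () => void; // newly passed down: stops the streaming response
	files: Array<{ type: string; url: string }>; // bound: screenshots captured during the call
	chatId: string | null;
	modelId: string | null;
}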

View File

@@ -14,16 +14,18 @@
const i18n = getContext('i18n');
export let eventTarget: EventTarget;
export let submitPrompt: Function;
export let stopResponse: Function;
export let files;
export let chatId;
export let modelId;
let message = '';
let loading = false;
let confirmed = false;
let interrupted = false;
let emoji = null;
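Editor's note, not part of the commit: the new stopResponse prop and interrupted flag added above drive the barge-in behavior implemented further down: when the microphone detects speech while the assistant is talking, playback stops and any still-streaming response is aborted. A condensed sketch of that flow, reusing the names that appear later in this diff; it is an illustration, not the component's actual code.

// Simplified barge-in sketch mirroring the stopAllAudio() added later in this commit.
let interrupted = false;
let chatStreaming = false;
let audioQueue: HTMLAudioElement[] = [];
const stopResponse = () => { /* provided by the parent chat component */ };

const onUserSpeechDetected = () => {
	interrupted = true; // tells the event handlers to stop queueing new audio
	if (chatStreaming) {
		stopResponse(); // abort the streaming completion
	}
	audioQueue = []; // drop any sentence clips waiting to be played
};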
@@ -31,245 +33,16 @@
let cameraStream = null;
let assistantSpeaking = false;
let assistantAudio = {};
let assistantAudioIdx = null;
-let rmsLevel = 0;
+let chatStreaming = false;
-let hasStartedSpeaking = false;
+let assistantMessage = '';
let assistantSentences = [];
let assistantSentenceAudios = {};
let assistantSentenceIdx = -1;
-let currentUtterance = null;
+let audioQueue = [];
-let mediaRecorder;
+$: assistantSentences = extractSentences(assistantMessage).reduce((mergedTexts, currentText) => {
let audioChunks = [];
const MIN_DECIBELS = -45;
const VISUALIZER_BUFFER_LENGTH = 300;
// Function to calculate the RMS level from time domain data
const calculateRMS = (data: Uint8Array) => {
let sumSquares = 0;
for (let i = 0; i < data.length; i++) {
const normalizedValue = (data[i] - 128) / 128; // Normalize the data
sumSquares += normalizedValue * normalizedValue;
}
return Math.sqrt(sumSquares / data.length);
};
const normalizeRMS = (rms) => {
rms = rms * 10;
const exp = 1.5; // Adjust exponent value; values greater than 1 expand larger numbers more and compress smaller numbers more
const scaledRMS = Math.pow(rms, exp);
// Scale between 0.01 (1%) and 1.0 (100%)
return Math.min(1.0, Math.max(0.01, scaledRMS));
};
const analyseAudio = (stream) => {
const audioContext = new AudioContext();
const audioStreamSource = audioContext.createMediaStreamSource(stream);
const analyser = audioContext.createAnalyser();
analyser.minDecibels = MIN_DECIBELS;
audioStreamSource.connect(analyser);
const bufferLength = analyser.frequencyBinCount;
const domainData = new Uint8Array(bufferLength);
const timeDomainData = new Uint8Array(analyser.fftSize);
let lastSoundTime = Date.now();
hasStartedSpeaking = false;
const detectSound = () => {
const processFrame = () => {
if (!mediaRecorder || !$showCallOverlay) {
if (mediaRecorder) {
mediaRecorder.stop();
}
return;
}
analyser.getByteTimeDomainData(timeDomainData);
analyser.getByteFrequencyData(domainData);
// Calculate RMS level from time domain data
rmsLevel = calculateRMS(timeDomainData);
// Check if initial speech/noise has started
const hasSound = domainData.some((value) => value > 0);
if (hasSound) {
stopAllAudio();
hasStartedSpeaking = true;
lastSoundTime = Date.now();
}
// Start silence detection only after initial speech/noise has been detected
if (hasStartedSpeaking) {
if (Date.now() - lastSoundTime > 2000) {
confirmed = true;
if (mediaRecorder) {
mediaRecorder.stop();
}
}
}
window.requestAnimationFrame(processFrame);
};
window.requestAnimationFrame(processFrame);
};
detectSound();
};
const stopAllAudio = () => {
if (currentUtterance) {
speechSynthesis.cancel();
currentUtterance = null;
}
if (assistantAudio[assistantAudioIdx]) {
assistantAudio[assistantAudioIdx].pause();
assistantAudio[assistantAudioIdx].currentTime = 0;
}
const audioElement = document.getElementById('audioElement');
if (audioElement) {
audioElement.pause();
audioElement.currentTime = 0;
}
assistantSpeaking = false;
};
const playAudio = (idx) => {
if ($showCallOverlay) {
return new Promise((res) => {
assistantAudioIdx = idx;
const audioElement = document.getElementById('audioElement');
const audio = assistantAudio[idx];
if (audioElement) {
audioElement.src = audio.src; // Assume `assistantAudio` has objects with a `src` property
audioElement.muted = true;
audioElement
.play()
.then(() => {
audioElement.muted = false;
})
.catch((error) => {
toast.error(error);
});
audioElement.onended = async (e) => {
await new Promise((r) => setTimeout(r, 300));
if (Object.keys(assistantAudio).length - 1 === idx) {
assistantSpeaking = false;
}
res(e);
};
}
});
} else {
return Promise.resolve();
}
};
const getOpenAISpeech = async (text) => {
const res = await synthesizeOpenAISpeech(
localStorage.token,
$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
text
).catch((error) => {
toast.error(error);
assistantSpeaking = false;
return null;
});
if (res) {
const blob = await res.blob();
const blobUrl = URL.createObjectURL(blob);
const audio = new Audio(blobUrl);
assistantAudio = audio;
}
};
const transcribeHandler = async (audioBlob) => {
// Create a blob from the audio chunks
await tick();
const file = blobToFile(audioBlob, 'recording.wav');
const res = await transcribeAudio(localStorage.token, file).catch((error) => {
toast.error(error);
return null;
});
if (res) {
console.log(res.text);
if (res.text !== '') {
const _responses = await submitPrompt(res.text, { _raw: true });
console.log(_responses);
}
}
};
const assistantSpeakingHandler = async (content) => {
assistantSpeaking = true;
if (modelId && ($settings?.showEmojiInCall ?? false)) {
console.log('Generating emoji');
const res = await generateEmoji(localStorage.token, modelId, content, chatId).catch(
(error) => {
console.error(error);
return null;
}
);
if (res) {
console.log(res);
if (/\p{Extended_Pictographic}/u.test(res)) {
emoji = res.match(/\p{Extended_Pictographic}/gu)[0];
}
}
}
if (($config.audio.tts.engine ?? '') == '') {
let voices = [];
const getVoicesLoop = setInterval(async () => {
voices = await speechSynthesis.getVoices();
if (voices.length > 0) {
clearInterval(getVoicesLoop);
const voice =
voices
?.filter(
(v) => v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
)
?.at(0) ?? undefined;
currentUtterance = new SpeechSynthesisUtterance(content);
if (voice) {
currentUtterance.voice = voice;
}
speechSynthesis.speak(currentUtterance);
currentUtterance.onend = async () => {
assistantSpeaking = false;
};
}
}, 100);
} else if ($config.audio.tts.engine === 'openai') {
console.log('openai');
const sentences = extractSentences(content).reduce((mergedTexts, currentText) => {
const lastIndex = mergedTexts.length - 1;
if (lastIndex >= 0) {
const previousText = mergedTexts[lastIndex];
@@ -285,94 +58,14 @@
return mergedTexts;
}, []);
-console.log(sentences);
+let currentUtterance = null;
-let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately
+let rmsLevel = 0;
let hasStartedSpeaking = false;
let mediaRecorder;
let audioChunks = [];
-for (const [idx, sentence] of sentences.entries()) {
+$: console.log('hasStartedSpeaking', hasStartedSpeaking);
const res = await synthesizeOpenAISpeech(
localStorage.token,
$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
sentence
).catch((error) => {
toast.error(error);
assistantSpeaking = false;
return null;
});
if (res) {
const blob = await res.blob();
const blobUrl = URL.createObjectURL(blob);
const audio = new Audio(blobUrl);
assistantAudio[idx] = audio;
lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx));
if (idx === sentences.length - 1) {
lastPlayedAudioPromise.then(() => {
assistantSpeaking = false;
});
}
}
}
}
};
const stopRecordingCallback = async (_continue = true) => {
if ($showCallOverlay) {
if (confirmed) {
loading = true;
emoji = null;
if (cameraStream) {
const imageUrl = takeScreenshot();
files = [
{
type: 'image',
url: imageUrl
}
];
}
const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
await transcribeHandler(audioBlob);
confirmed = false;
loading = false;
}
audioChunks = [];
mediaRecorder = false;
if (_continue) {
startRecording();
}
} else {
audioChunks = [];
mediaRecorder = false;
}
};
const startRecording = async () => {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
mediaRecorder = new MediaRecorder(stream);
mediaRecorder.onstart = () => {
console.log('Recording started');
audioChunks = [];
analyseAudio(stream);
};
mediaRecorder.ondataavailable = (event) => {
if (hasStartedSpeaking) {
audioChunks.push(event.data);
}
};
mediaRecorder.onstop = async () => {
console.log('Recording stopped');
await stopRecordingCallback();
};
mediaRecorder.start();
};
let videoInputDevices = [];
let selectedVideoInputDeviceId = null;
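Editor's note, not part of the commit: the hunk below replaces the old "synthesize everything, then chain playback promises" approach with a small producer/consumer pipeline: sentences extracted from the streaming assistant message are synthesized into Audio clips and pushed onto audioQueue, and a reactive handler drains the queue one clip at a time. A minimal sketch of that pattern; synthesize() is a stand-in for synthesizeOpenAISpeech() and the sketch omits interruption handling.

// Producer/consumer sketch of the sentence audio queue introduced below.
const audioQueue: HTMLAudioElement[] = [];
let assistantSpeaking = false;

// Stand-in for synthesizeOpenAISpeech(): returns a playable clip for one sentence.
const synthesize = async (sentence: string): Promise<HTMLAudioElement> =>
	new Audio('data:audio/wav;base64,'); // placeholder clip for illustration

// Producer: called once per completed sentence while the reply streams in.
const enqueueSentence = async (sentence: string) => {
	audioQueue.push(await synthesize(sentence));
	void drainQueue();
};

// Consumer: plays at most one clip at a time, in arrival order.
const drainQueue = async () => {
	if (assistantSpeaking) return;
	assistantSpeaking = true;
	while (audioQueue.length > 0) {
		const clip = audioQueue.shift()!;
		await new Promise<void>((resolve) => {
			clip.onended = () => resolve();
			clip.play().catch(() => resolve()); // don't stall the queue on playback errors
		});
	}
	assistantSpeaking = false;
};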
@@ -475,6 +168,286 @@
camera = false;
};
const MIN_DECIBELS = -45;
const VISUALIZER_BUFFER_LENGTH = 300;
// Function to calculate the RMS level from time domain data
const calculateRMS = (data: Uint8Array) => {
let sumSquares = 0;
for (let i = 0; i < data.length; i++) {
const normalizedValue = (data[i] - 128) / 128; // Normalize the data
sumSquares += normalizedValue * normalizedValue;
}
return Math.sqrt(sumSquares / data.length);
};
const analyseAudio = (stream) => {
const audioContext = new AudioContext();
const audioStreamSource = audioContext.createMediaStreamSource(stream);
const analyser = audioContext.createAnalyser();
analyser.minDecibels = MIN_DECIBELS;
audioStreamSource.connect(analyser);
const bufferLength = analyser.frequencyBinCount;
const domainData = new Uint8Array(bufferLength);
const timeDomainData = new Uint8Array(analyser.fftSize);
let lastSoundTime = Date.now();
hasStartedSpeaking = false;
const detectSound = () => {
const processFrame = () => {
if (!mediaRecorder || !$showCallOverlay) {
return;
}
analyser.getByteTimeDomainData(timeDomainData);
analyser.getByteFrequencyData(domainData);
// Calculate RMS level from time domain data
rmsLevel = calculateRMS(timeDomainData);
// Check if initial speech/noise has started
const hasSound = domainData.some((value) => value > 0);
if (hasSound) {
hasStartedSpeaking = true;
lastSoundTime = Date.now();
// BIG RED TEXT
console.log('%c%s', 'color: red; font-size: 20px;', '🔊 Sound detected');
stopAllAudio();
}
// Start silence detection only after initial speech/noise has been detected
if (hasStartedSpeaking) {
if (Date.now() - lastSoundTime > 2000) {
confirmed = true;
if (mediaRecorder) {
mediaRecorder.stop();
}
}
}
window.requestAnimationFrame(processFrame);
};
window.requestAnimationFrame(processFrame);
};
detectSound();
};
const transcribeHandler = async (audioBlob) => {
// Create a blob from the audio chunks
await tick();
const file = blobToFile(audioBlob, 'recording.wav');
const res = await transcribeAudio(localStorage.token, file).catch((error) => {
toast.error(error);
return null;
});
if (res) {
console.log(res.text);
if (res.text !== '') {
const _responses = await submitPrompt(res.text, { _raw: true });
console.log(_responses);
}
}
};
const stopAllAudio = async () => {
interrupted = true;
if (chatStreaming) {
stopResponse();
}
if (currentUtterance) {
speechSynthesis.cancel();
currentUtterance = null;
}
await tick();
audioQueue = [];
await tick();
const audioElement = document.getElementById('audioElement');
if (audioElement) {
audioElement.pause();
audioElement.currentTime = 0;
}
assistantSpeaking = false;
};
const speakSpeechSynthesisHandler = (content) => {
if ($showCallOverlay) {
return new Promise((resolve) => {
let voices = [];
const getVoicesLoop = setInterval(async () => {
voices = await speechSynthesis.getVoices();
if (voices.length > 0) {
clearInterval(getVoicesLoop);
const voice =
voices
?.filter(
(v) => v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
)
?.at(0) ?? undefined;
currentUtterance = new SpeechSynthesisUtterance(content);
if (voice) {
currentUtterance.voice = voice;
}
speechSynthesis.speak(currentUtterance);
currentUtterance.onend = async (e) => {
await new Promise((r) => setTimeout(r, 100));
resolve(e);
};
}
}, 100);
});
} else {
return Promise.resolve();
}
};
const playAudio = (audio) => {
if ($showCallOverlay) {
return new Promise((resolve) => {
const audioElement = document.getElementById('audioElement');
if (audioElement) {
audioElement.src = audio.src;
audioElement.muted = true;
audioElement
.play()
.then(() => {
audioElement.muted = false;
})
.catch((error) => {
console.error(error);
});
audioElement.onended = async (e) => {
await new Promise((r) => setTimeout(r, 100));
resolve(e);
};
}
});
} else {
return Promise.resolve();
}
};
const playAudioHandler = async () => {
console.log('playAudioHandler', audioQueue, assistantSpeaking, audioQueue.length > 0);
if (!assistantSpeaking && !interrupted && audioQueue.length > 0) {
assistantSpeaking = true;
const audioToPlay = audioQueue.shift(); // Shift the audio out from queue before playing.
audioQueue = audioQueue;
await playAudio(audioToPlay);
assistantSpeaking = false;
}
};
const setContentAudio = async (content, idx) => {
if (assistantSentenceAudios[idx] === undefined) {
console.log('%c%s', 'color: red; font-size: 20px;', content);
assistantSentenceAudios[idx] = null;
const res = await synthesizeOpenAISpeech(
localStorage.token,
$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
content
).catch((error) => {
toast.error(error);
assistantSpeaking = false;
return null;
});
if (res) {
const blob = await res.blob();
const blobUrl = URL.createObjectURL(blob);
const audio = new Audio(blobUrl);
assistantSentenceAudios[idx] = audio;
audioQueue.push(audio);
audioQueue = audioQueue;
}
}
};
const stopRecordingCallback = async (_continue = true) => {
console.log('%c%s', 'color: red; font-size: 20px;', '🚨 stopRecordingCallback 🚨');
if ($showCallOverlay) {
// deep copy the audioChunks array
const _audioChunks = audioChunks.slice(0);
audioChunks = [];
mediaRecorder = false;
if (_continue) {
startRecording();
}
if (confirmed) {
loading = true;
emoji = null;
if (cameraStream) {
const imageUrl = takeScreenshot();
files = [
{
type: 'image',
url: imageUrl
}
];
}
const audioBlob = new Blob(_audioChunks, { type: 'audio/wav' });
await transcribeHandler(audioBlob);
confirmed = false;
loading = false;
}
} else {
audioChunks = [];
mediaRecorder = false;
}
};
const startRecording = async () => {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
mediaRecorder = new MediaRecorder(stream);
mediaRecorder.onstart = () => {
console.log('Recording started');
audioChunks = [];
analyseAudio(stream);
};
mediaRecorder.ondataavailable = (event) => {
if (hasStartedSpeaking) {
audioChunks.push(event.data);
}
};
mediaRecorder.onstop = async () => {
console.log('Recording stopped');
await stopRecordingCallback();
};
mediaRecorder.start();
};
$: if ($showCallOverlay) {
startRecording();
} else {
@@ -483,30 +456,73 @@
stopRecordingCallback(false);
}
$: {
if (audioQueue.length > 0 && !assistantSpeaking) {
playAudioHandler();
}
}
onMount(() => {
console.log(eventTarget);
eventTarget.addEventListener('chat:start', async (e) => {
-console.log('Chat start event:', e.detail);
+console.log('Chat start event:', e);
-message = '';
+interrupted = false;
assistantMessage = '';
assistantSentenceIdx = -1;
assistantSentenceAudios = {}; // Reset audio tracking
audioQueue = []; // Clear the audio queue
chatStreaming = true;
});
eventTarget.addEventListener('chat', async (e) => {
const { content } = e.detail;
assistantMessage += content;
await tick();
-message += content;
+if (!interrupted) {
-console.log('Chat event:', message);
+if ($config.audio.tts.engine !== '') {
assistantSentenceIdx = assistantSentences.length - 2;
if (assistantSentenceIdx >= 0 && !assistantSentenceAudios[assistantSentenceIdx]) {
await tick();
setContentAudio(assistantSentences[assistantSentenceIdx], assistantSentenceIdx);
}
}
}
chatStreaming = true;
});
eventTarget.addEventListener('chat:finish', async (e) => {
-console.log('Chat finish event:', e.detail);
+chatStreaming = false;
-message = '';
+loading = false;
console.log('Chat finish event:', e);
await tick();
if (!interrupted) {
if ($config.audio.tts.engine !== '') {
for (const [idx, sentence] of assistantSentences.entries()) {
if (!assistantSentenceAudios[idx]) {
await tick();
setContentAudio(sentence, idx);
}
}
} else {
emoji = generateEmoji(localStorage.token, modelId, assistantMessage);
speakSpeechSynthesisHandler(assistantMessage);
}
}
});
});
</script>
<audio id="audioElement" src="" style="display: none;" />
{#if $showCallOverlay}
<audio id="audioElement" src="" style="display: none;" />
<div class=" absolute w-full h-screen max-h-[100dvh] flex z-[999] overflow-hidden">
<div
class="absolute w-full h-screen max-h-[100dvh] bg-white text-gray-700 dark:bg-black dark:text-gray-300 flex justify-center"