mirror of
https://github.com/open-webui/open-webui
synced 2024-11-17 14:02:51 +00:00
refac: voice call
This commit is contained in:
parent
7ea572fdca
commit
7f70de99d3
@@ -30,6 +30,7 @@
import {
convertMessagesToHistory,
copyToClipboard,
extractSentencesForAudio,
promptTemplate,
splitStream
} from '$lib/utils';
@@ -593,7 +594,15 @@
array.findIndex((i) => JSON.stringify(i) === JSON.stringify(item)) === index
);

eventTarget.dispatchEvent(new CustomEvent('chat:start'));
eventTarget.dispatchEvent(
new CustomEvent('chat:start', {
detail: {
id: responseMessageId
}
})
);

await tick();

const [res, controller] = await generateChatCompletion(localStorage.token, {
model: model.id,
@@ -664,9 +673,23 @@
continue;
} else {
responseMessage.content += data.message.content;
eventTarget.dispatchEvent(
new CustomEvent('chat', { detail: { content: data.message.content } })
);

const sentences = extractSentencesForAudio(responseMessage.content);
sentences.pop();

// dispatch only last sentence and make sure it hasn't been dispatched before
if (
sentences.length > 0 &&
sentences[sentences.length - 1] !== responseMessage.lastSentence
) {
responseMessage.lastSentence = sentences[sentences.length - 1];
eventTarget.dispatchEvent(
new CustomEvent('chat', {
detail: { id: responseMessageId, content: sentences[sentences.length - 1] }
})
);
}

messages = messages;
}
} else {
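The block above streams Ollama tokens into responseMessage.content and, besides the raw 'chat' chunk event, re-dispatches only the newest complete sentence so the call overlay can start synthesizing speech before the reply finishes; extractSentencesForAudio returns every sentence seen so far, pop() drops the still-growing trailing fragment, and responseMessage.lastSentence guards against dispatching the same sentence twice. A minimal standalone sketch of that dedup step (createSentenceEmitter, extractCompleteSentences and emit are illustrative names, not part of the commit):

// Sketch: emit each newly completed sentence exactly once while text streams in.
const createSentenceEmitter = (
	extractCompleteSentences: (text: string) => string[], // stands in for extractSentencesForAudio
	emit: (sentence: string) => void // stands in for eventTarget.dispatchEvent
) => {
	let lastSentence = '';
	return (streamedText: string) => {
		const sentences = extractCompleteSentences(streamedText);
		sentences.pop(); // drop the trailing, possibly unfinished sentence
		const latest = sentences.at(-1);
		if (latest && latest !== lastSentence) {
			lastSentence = latest;
			emit(latest); // only the newest complete sentence goes out
		}
	};
};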
@@ -760,7 +783,23 @@

stopResponseFlag = false;
await tick();
eventTarget.dispatchEvent(new CustomEvent('chat:finish'));

let lastSentence = extractSentencesForAudio(responseMessage.content)?.at(-1) ?? '';
if (lastSentence) {
eventTarget.dispatchEvent(
new CustomEvent('chat', {
detail: { id: responseMessageId, content: lastSentence }
})
);
}
eventTarget.dispatchEvent(
new CustomEvent('chat:finish', {
detail: {
id: responseMessageId,
content: responseMessage.content
}
})
);

if (autoScroll) {
scrollToBottom();
@@ -802,7 +841,14 @@

scrollToBottom();

eventTarget.dispatchEvent(new CustomEvent('chat:start'));
eventTarget.dispatchEvent(
new CustomEvent('chat:start', {
detail: {
id: responseMessageId
}
})
);
await tick();

try {
const [res, controller] = await generateOpenAIChatCompletion(
@@ -924,7 +970,23 @@
continue;
} else {
responseMessage.content += value;
eventTarget.dispatchEvent(new CustomEvent('chat', { detail: { content: value } }));

const sentences = extractSentencesForAudio(responseMessage.content);
sentences.pop();

// dispatch only last sentence and make sure it hasn't been dispatched before
if (
sentences.length > 0 &&
sentences[sentences.length - 1] !== responseMessage.lastSentence
) {
responseMessage.lastSentence = sentences[sentences.length - 1];
eventTarget.dispatchEvent(
new CustomEvent('chat', {
detail: { id: responseMessageId, content: sentences[sentences.length - 1] }
})
);
}

messages = messages;
}

@@ -975,7 +1037,23 @@
stopResponseFlag = false;
await tick();

eventTarget.dispatchEvent(new CustomEvent('chat:finish'));
let lastSentence = extractSentencesForAudio(responseMessage.content)?.at(-1) ?? '';
if (lastSentence) {
eventTarget.dispatchEvent(
new CustomEvent('chat', {
detail: { id: responseMessageId, content: lastSentence }
})
);
}

eventTarget.dispatchEvent(
new CustomEvent('chat:finish', {
detail: {
id: responseMessageId,
content: responseMessage.content
}
})
);

if (autoScroll) {
scrollToBottom();
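Both streaming paths (Ollama at @@ -664 above and OpenAI here) now publish the same lifecycle on the shared eventTarget: 'chat:start' with detail.id, 'chat' with detail.id plus a completed sentence (or a raw chunk when no id is attached), and 'chat:finish' with detail.id plus the full response. A hedged sketch of a consumer, assuming eventTarget is the same EventTarget handed to <CallOverlay {eventTarget} />:

// Sketch: subscribing to the chat lifecycle events dispatched above.
const subscribeToChatEvents = (eventTarget: EventTarget) => {
	const onStart = (e: Event) => console.log('start', (e as CustomEvent).detail?.id);
	const onSentence = (e: Event) => console.log('sentence', (e as CustomEvent).detail?.content);
	const onFinish = (e: Event) => console.log('finish', (e as CustomEvent).detail?.content);

	eventTarget.addEventListener('chat:start', onStart);
	eventTarget.addEventListener('chat', onSentence);
	eventTarget.addEventListener('chat:finish', onFinish);

	// same add/remove pattern CallOverlay uses in its onMount cleanup
	return () => {
		eventTarget.removeEventListener('chat:start', onStart);
		eventTarget.removeEventListener('chat', onSentence);
		eventTarget.removeEventListener('chat:finish', onFinish);
	};
};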
@@ -1207,14 +1285,18 @@
</title>
</svelte:head>

<CallOverlay
{submitPrompt}
{stopResponse}
bind:files
modelId={selectedModelIds?.at(0) ?? null}
chatId={$chatId}
{eventTarget}
/>
<audio id="audioElement" src="" style="display: none;" />

{#if $showCallOverlay}
<CallOverlay
{submitPrompt}
{stopResponse}
bind:files
modelId={selectedModelIds?.at(0) ?? null}
chatId={$chatId}
{eventTarget}
/>
{/if}

{#if !chatIdProp || (loaded && chatIdProp)}
<div
@@ -2,7 +2,12 @@
import { config, settings, showCallOverlay } from '$lib/stores';
import { onMount, tick, getContext } from 'svelte';

import { blobToFile, calculateSHA256, extractSentences, findWordIndices } from '$lib/utils';
import {
blobToFile,
calculateSHA256,
extractSentencesForAudio,
findWordIndices
} from '$lib/utils';
import { generateEmoji } from '$lib/apis';
import { synthesizeOpenAISpeech, transcribeAudio } from '$lib/apis/audio';

@@ -32,34 +37,7 @@
let camera = false;
let cameraStream = null;

let assistantSpeaking = false;

let chatStreaming = false;
let assistantMessage = '';
let assistantSentences = [];
let assistantSentenceAudios = {};
let assistantSentenceIdx = -1;

let audioQueue = [];
let emojiQueue = [];

$: assistantSentences = extractSentences(assistantMessage).reduce((mergedTexts, currentText) => {
const lastIndex = mergedTexts.length - 1;
if (lastIndex >= 0) {
const previousText = mergedTexts[lastIndex];
const wordCount = previousText.split(/\s+/).length;
if (wordCount < 2) {
mergedTexts[lastIndex] = previousText + ' ' + currentText;
} else {
mergedTexts.push(currentText);
}
} else {
mergedTexts.push(currentText);
}
return mergedTexts;
}, []);

let currentUtterance = null;

let rmsLevel = 0;
let hasStartedSpeaking = false;
@@ -170,75 +148,6 @@
const MIN_DECIBELS = -45;
const VISUALIZER_BUFFER_LENGTH = 300;

// Function to calculate the RMS level from time domain data
const calculateRMS = (data: Uint8Array) => {
let sumSquares = 0;
for (let i = 0; i < data.length; i++) {
const normalizedValue = (data[i] - 128) / 128; // Normalize the data
sumSquares += normalizedValue * normalizedValue;
}
return Math.sqrt(sumSquares / data.length);
};

const analyseAudio = (stream) => {
const audioContext = new AudioContext();
const audioStreamSource = audioContext.createMediaStreamSource(stream);

const analyser = audioContext.createAnalyser();
analyser.minDecibels = MIN_DECIBELS;
audioStreamSource.connect(analyser);

const bufferLength = analyser.frequencyBinCount;

const domainData = new Uint8Array(bufferLength);
const timeDomainData = new Uint8Array(analyser.fftSize);

let lastSoundTime = Date.now();
hasStartedSpeaking = false;

const detectSound = () => {
const processFrame = () => {
if (!mediaRecorder || !$showCallOverlay) {
return;
}

analyser.getByteTimeDomainData(timeDomainData);
analyser.getByteFrequencyData(domainData);

// Calculate RMS level from time domain data
rmsLevel = calculateRMS(timeDomainData);

// Check if initial speech/noise has started
const hasSound = domainData.some((value) => value > 0);
if (hasSound) {
hasStartedSpeaking = true;
lastSoundTime = Date.now();

// BIG RED TEXT
console.log('%c%s', 'color: red; font-size: 20px;', '🔊 Sound detected');
stopAllAudio();
}

// Start silence detection only after initial speech/noise has been detected
if (hasStartedSpeaking) {
if (Date.now() - lastSoundTime > 2000) {
confirmed = true;

if (mediaRecorder) {
mediaRecorder.stop();
}
}
}

window.requestAnimationFrame(processFrame);
};

window.requestAnimationFrame(processFrame);
};

detectSound();
};

const transcribeHandler = async (audioBlob) => {
// Create a blob from the audio chunks

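calculateRMS above normalizes each byte from getByteTimeDomainData (0–255, silence at 128) to the −1..1 range and returns the root mean square that drives rmsLevel for the visualizer. A quick self-contained check of the same formula on synthetic buffers (not part of the commit):

// Sketch: the RMS formula applied to synthetic sample buffers.
const calculateRMS = (data: Uint8Array) => {
	let sumSquares = 0;
	for (let i = 0; i < data.length; i++) {
		const normalizedValue = (data[i] - 128) / 128; // normalize around the 128 midpoint
		sumSquares += normalizedValue * normalizedValue;
	}
	return Math.sqrt(sumSquares / data.length);
};

console.log(calculateRMS(new Uint8Array([128, 128, 128, 128]))); // 0 — silence sits at the midpoint
console.log(calculateRMS(new Uint8Array([0, 255, 0, 255]))); // ≈ 0.996 — a full-scale square wave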
@@ -260,174 +169,6 @@
}
};

const stopAllAudio = async () => {
interrupted = true;

if (chatStreaming) {
stopResponse();
}

if (currentUtterance) {
speechSynthesis.cancel();
currentUtterance = null;
}

await tick();
emojiQueue = [];
audioQueue = [];
await tick();

const audioElement = document.getElementById('audioElement');
if (audioElement) {
audioElement.pause();
audioElement.currentTime = 0;
}

assistantSpeaking = false;
};

const speakSpeechSynthesisHandler = (content) => {
if ($showCallOverlay) {
return new Promise((resolve) => {
let voices = [];
const getVoicesLoop = setInterval(async () => {
voices = await speechSynthesis.getVoices();
if (voices.length > 0) {
clearInterval(getVoicesLoop);

const voice =
voices
?.filter(
(v) => v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
)
?.at(0) ?? undefined;

currentUtterance = new SpeechSynthesisUtterance(content);

if (voice) {
currentUtterance.voice = voice;
}

speechSynthesis.speak(currentUtterance);
currentUtterance.onend = async (e) => {
await new Promise((r) => setTimeout(r, 100));
resolve(e);
};
}
}, 100);
});
} else {
return Promise.resolve();
}
};

const playAudio = (audio) => {
if ($showCallOverlay) {
return new Promise((resolve) => {
const audioElement = document.getElementById('audioElement');

if (audioElement) {
audioElement.src = audio.src;
audioElement.muted = true;

audioElement
.play()
.then(() => {
audioElement.muted = false;
})
.catch((error) => {
console.error(error);
});

audioElement.onended = async (e) => {
await new Promise((r) => setTimeout(r, 100));
resolve(e);
};
}
});
} else {
return Promise.resolve();
}
};

const playAudioHandler = async () => {
console.log('playAudioHandler', audioQueue, assistantSpeaking, audioQueue.length > 0);
if (!assistantSpeaking && !interrupted && audioQueue.length > 0) {
assistantSpeaking = true;

if ($settings?.showEmojiInCall ?? false) {
if (emojiQueue.length > 0) {
emoji = emojiQueue.shift();
emojiQueue = emojiQueue;
}
}

const audioToPlay = audioQueue.shift(); // Shift the audio out from queue before playing.
audioQueue = audioQueue;
await playAudio(audioToPlay);
assistantSpeaking = false;
}
};

const setContentAudio = async (content, idx) => {
if (assistantSentenceAudios[idx] === undefined) {
// Wait for the previous audio to be loaded
if (idx > 0) {
await new Promise((resolve) => {
const check = setInterval(() => {
if (
assistantSentenceAudios[idx - 1] !== undefined &&
assistantSentenceAudios[idx - 1] !== null
) {
clearInterval(check);
resolve();
}
}, 100);
});
}

assistantSentenceAudios[idx] = null;

if ($settings?.showEmojiInCall ?? false) {
const sentenceEmoji = await generateEmoji(localStorage.token, modelId, content);

if (sentenceEmoji) {
// Big red text with content and emoji
console.log('%c%s', 'color: blue; font-size: 10px;', `${sentenceEmoji}: ${content}`);

if (/\p{Extended_Pictographic}/u.test(sentenceEmoji)) {
emojiQueue.push(sentenceEmoji.match(/\p{Extended_Pictographic}/gu)[0]);
emojiQueue = emojiQueue;
}
}

await tick();
}

const res = await synthesizeOpenAISpeech(
localStorage.token,
$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
content
).catch((error) => {
toast.error(error);
assistantSpeaking = false;
return null;
});

if (res) {
const blob = await res.blob();
const blobUrl = URL.createObjectURL(blob);
const audio = new Audio(blobUrl);
assistantSentenceAudios[idx] = audio;

console.log('%c%s', 'color: red; font-size: 20px;', content);

audioQueue.push(audio);
audioQueue = audioQueue;
}
}
};

const stopRecordingCallback = async (_continue = true) => {
if ($showCallOverlay) {
console.log('%c%s', 'color: red; font-size: 20px;', '🚨 stopRecordingCallback 🚨');
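Several of the removed paths above (and the new chat handlers further down) reduce a model response to a single emoji with the Unicode Extended_Pictographic property escape before showing it in the call overlay. A small hedged sketch of that extraction on its own (firstEmoji is an illustrative name):

// Sketch: pulling the first emoji out of an arbitrary model response,
// using the same \p{Extended_Pictographic} property escape as the diff.
const firstEmoji = (text: string): string | null => {
	if (/\p{Extended_Pictographic}/u.test(text)) {
		return text.match(/\p{Extended_Pictographic}/gu)?.[0] ?? null;
	}
	return null;
};

console.log(firstEmoji('Sure thing! 😄 Happy to help 🎉')); // "😄"
console.log(firstEmoji('No emoji here.')); // null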
@@ -489,107 +230,315 @@
mediaRecorder.start();
};

const resetAssistantMessage = async () => {
interrupted = false;

assistantMessage = '';
assistantSentenceIdx = -1;
assistantSentenceAudios = {}; // Reset audio tracking
audioQueue = []; // Clear the audio queue
audioQueue = audioQueue;

emoji = null;
emojiQueue = [];
emojiQueue = emojiQueue;
// Function to calculate the RMS level from time domain data
const calculateRMS = (data: Uint8Array) => {
let sumSquares = 0;
for (let i = 0; i < data.length; i++) {
const normalizedValue = (data[i] - 128) / 128; // Normalize the data
sumSquares += normalizedValue * normalizedValue;
}
return Math.sqrt(sumSquares / data.length);
};

$: (async () => {
const analyseAudio = (stream) => {
const audioContext = new AudioContext();
const audioStreamSource = audioContext.createMediaStreamSource(stream);

const analyser = audioContext.createAnalyser();
analyser.minDecibels = MIN_DECIBELS;
audioStreamSource.connect(analyser);

const bufferLength = analyser.frequencyBinCount;

const domainData = new Uint8Array(bufferLength);
const timeDomainData = new Uint8Array(analyser.fftSize);

let lastSoundTime = Date.now();
hasStartedSpeaking = false;

const detectSound = () => {
const processFrame = () => {
if (!mediaRecorder || !$showCallOverlay) {
return;
}

analyser.getByteTimeDomainData(timeDomainData);
analyser.getByteFrequencyData(domainData);

// Calculate RMS level from time domain data
rmsLevel = calculateRMS(timeDomainData);

// Check if initial speech/noise has started
const hasSound = domainData.some((value) => value > 0);
if (hasSound) {
// BIG RED TEXT
console.log('%c%s', 'color: red; font-size: 20px;', '🔊 Sound detected');

if (!hasStartedSpeaking) {
hasStartedSpeaking = true;
stopAllAudio();
}

lastSoundTime = Date.now();
}

// Start silence detection only after initial speech/noise has been detected
if (hasStartedSpeaking) {
if (Date.now() - lastSoundTime > 2000) {
confirmed = true;

if (mediaRecorder) {
mediaRecorder.stop();
}
}
}

window.requestAnimationFrame(processFrame);
};

window.requestAnimationFrame(processFrame);
};

detectSound();
};

let finishedMessages = {};
let currentMessageId = null;
let currentUtterance = null;

const speakSpeechSynthesisHandler = (content) => {
if ($showCallOverlay) {
await resetAssistantMessage();
await tick();
startRecording();
return new Promise((resolve) => {
let voices = [];
const getVoicesLoop = setInterval(async () => {
voices = await speechSynthesis.getVoices();
if (voices.length > 0) {
clearInterval(getVoicesLoop);

const voice =
voices
?.filter(
(v) => v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
)
?.at(0) ?? undefined;

currentUtterance = new SpeechSynthesisUtterance(content);

if (voice) {
currentUtterance.voice = voice;
}

speechSynthesis.speak(currentUtterance);
currentUtterance.onend = async (e) => {
await new Promise((r) => setTimeout(r, 100));
resolve(e);
};
}
}, 100);
});
} else {
stopCamera();
stopAllAudio();
stopRecordingCallback(false);
return Promise.resolve();
}
})();
};

$: {
if (audioQueue.length > 0 && !assistantSpeaking) {
playAudioHandler();
}
}
const playAudio = (audio) => {
if ($showCallOverlay) {
return new Promise((resolve) => {
const audioElement = document.getElementById('audioElement');

onMount(() => {
eventTarget.addEventListener('chat:start', async (e) => {
if ($showCallOverlay) {
console.log('Chat start event:', e);
await resetAssistantMessage();
await tick();
chatStreaming = true;
}
});
if (audioElement) {
audioElement.src = audio.src;
audioElement.muted = true;

eventTarget.addEventListener('chat', async (e) => {
if ($showCallOverlay) {
const { content } = e.detail;
assistantMessage += content;
await tick();
audioElement
.play()
.then(() => {
audioElement.muted = false;
})
.catch((error) => {
console.error(error);
});

if (!interrupted) {
if ($config.audio.tts.engine !== '') {
assistantSentenceIdx = assistantSentences.length - 2;

if (assistantSentenceIdx >= 0 && !assistantSentenceAudios[assistantSentenceIdx]) {
await tick();
setContentAudio(assistantSentences[assistantSentenceIdx], assistantSentenceIdx);
}
}
audioElement.onended = async (e) => {
await new Promise((r) => setTimeout(r, 100));
resolve(e);
};
}
});
} else {
return Promise.resolve();
}
};

chatStreaming = true;
const stopAllAudio = async () => {
interrupted = true;

if (chatStreaming) {
stopResponse();
}

if (currentUtterance) {
speechSynthesis.cancel();
currentUtterance = null;
}

const audioElement = document.getElementById('audioElement');
if (audioElement) {
audioElement.pause();
audioElement.currentTime = 0;
}
};

let audioAbortController = new AbortController();

// Audio cache map where key is the content and value is the Audio object.
const audioCache = new Map();
const fetchAudio = async (content) => {
if (!audioCache.has(content)) {
try {
const res = await synthesizeOpenAISpeech(
localStorage.token,
$settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice,
content
).catch((error) => {
console.error(error);
return null;
});

if (res) {
const blob = await res.blob();
const blobUrl = URL.createObjectURL(blob);
audioCache.set(content, new Audio(blobUrl));
}
} catch (error) {
console.error('Error synthesizing speech:', error);
}
});
}
return audioCache.get(content);
};

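The per-sentence setContentAudio chain from the old code is replaced above by a content-keyed cache: fetchAudio calls synthesizeOpenAISpeech once per sentence and stores the resulting Audio object in audioCache, so the playback loop only has to poll the Map. A hedged sketch of the same cache-or-synthesize idea with the TTS call abstracted away (synthesize is an assumed stand-in for synthesizeOpenAISpeech):

// Sketch: content-keyed audio cache, mirroring audioCache/fetchAudio above.
const audioCache = new Map<string, HTMLAudioElement>();

const fetchAudio = async (
	content: string,
	synthesize: (content: string) => Promise<Response | null>
) => {
	if (!audioCache.has(content)) {
		const res = await synthesize(content).catch(() => null);
		if (res) {
			const blobUrl = URL.createObjectURL(await res.blob());
			audioCache.set(content, new Audio(blobUrl)); // each sentence is synthesized at most once
		}
	}
	return audioCache.get(content);
};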
eventTarget.addEventListener('chat:finish', async (e) => {
if ($showCallOverlay) {
chatStreaming = false;
loading = false;
let messages = {};

console.log('Chat finish event:', e);
await tick();
const monitorAndPlayAudio = async (id, signal) => {
while (!signal.aborted) {
if (messages[id] && messages[id].length > 0) {
// Retrieve the next content string from the queue
const content = messages[id].shift(); // Dequeues the content for playing

if (!interrupted) {
if ($config.audio.tts.engine !== '') {
for (const [idx, sentence] of assistantSentences.entries()) {
if (!assistantSentenceAudios[idx]) {
await tick();
setContentAudio(sentence, idx);
}
}
} else {
if ($settings?.showEmojiInCall ?? false) {
const res = await generateEmoji(localStorage.token, modelId, assistantMessage);
if (audioCache.has(content)) {
// If content is available in the cache, play it
try {
console.log(
'%c%s',
'color: red; font-size: 20px;',
`Playing audio for content: ${content}`
);

if (res) {
console.log(res);
if (/\p{Extended_Pictographic}/u.test(res)) {
emoji = res.match(/\p{Extended_Pictographic}/gu)[0];
}
}
const audio = audioCache.get(content);
await playAudio(audio); // Here ensure that playAudio is indeed correct method to execute
console.log(`Played audio for content: ${content}`);
await new Promise((resolve) => setTimeout(resolve, 200)); // Wait before retrying to reduce tight loop
} catch (error) {
console.error('Error playing audio:', error);
}
} else {
// If not available in the cache, push it back to the queue and delay
messages[id].unshift(content); // Re-queue the content at the start
console.log(`Audio for "${content}" not yet available in the cache, re-queued...`);
await new Promise((resolve) => setTimeout(resolve, 200)); // Wait before retrying to reduce tight loop
}
} else if (finishedMessages[id] && messages[id] && messages[id].length === 0) {
// If the message is finished and there are no more messages to process, break the loop
break;
} else {
// No messages to process, sleep for a bit
await new Promise((resolve) => setTimeout(resolve, 200));
}
}
console.log(`Audio monitoring and playing stopped for message ID ${id}`);
};

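monitorAndPlayAudio above is a polling consumer: it dequeues sentences for the active message id, plays the ones whose audio is already cached, re-queues the ones still being synthesized, and exits once chat:finish has marked the message done and the queue is drained; chatStartHandler below aborts the previous loop through audioAbortController before starting a new one. A compact sketch of that loop under those assumptions (queue, isFinished, getCachedAudio and play are illustrative placeholders):

// Sketch: abortable polling consumer in the spirit of monitorAndPlayAudio.
const monitorQueue = async (
	queue: string[],
	isFinished: () => boolean,
	getCachedAudio: (content: string) => HTMLAudioElement | undefined,
	play: (audio: HTMLAudioElement) => Promise<void>,
	signal: AbortSignal
) => {
	while (!signal.aborted) {
		const content = queue.shift();
		if (content !== undefined) {
			const audio = getCachedAudio(content);
			if (audio) {
				await play(audio); // play sentences strictly in order
			} else {
				queue.unshift(content); // audio not synthesized yet; put it back and wait
				await new Promise((r) => setTimeout(r, 200));
			}
		} else if (isFinished()) {
			break; // message finished and queue drained
		} else {
			await new Promise((r) => setTimeout(r, 200)); // nothing queued yet
		}
	}
};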
onMount(async () => {
startRecording();

const chatStartHandler = async (e) => {
const { id } = e.detail;

chatStreaming = true;

if ($config.audio.tts.engine !== '') {
// set currentMessageId to id
if (currentMessageId !== id) {
console.log(`Received chat start event for message ID ${id}`);

currentMessageId = id;
if (audioAbortController) {
audioAbortController.abort();
}
audioAbortController = new AbortController();

// Start monitoring and playing audio for the message ID
monitorAndPlayAudio(id, audioAbortController.signal);
}
}
};

const chatEventHandler = async (e) => {
const { id, content } = e.detail;
// "id" here is message id
// if "id" is not the same as "currentMessageId" then do not process
// "content" here is a sentence from the assistant,
// there will be many sentences for the same "id"

if ($config.audio.tts.engine !== '') {
if (currentMessageId === id) {
console.log(`Received chat event for message ID ${id}: ${content}`);

try {
if (messages[id] === undefined) {
messages[id] = [content];
} else {
messages[id].push(content);
}

speakSpeechSynthesisHandler(assistantMessage);
console.log(content);

fetchAudio(content);
} catch (error) {
console.error('Failed to fetch or play audio:', error);
}
}
}
});
};

const chatFinishHandler = async (e) => {
const { id, content } = e.detail;
// "content" here is the entire message from the assistant

chatStreaming = false;

if ($config.audio.tts.engine !== '') {
finishedMessages[id] = true;
} else {
speakSpeechSynthesisHandler(content);
}
};

eventTarget.addEventListener('chat:start', chatStartHandler);
eventTarget.addEventListener('chat', chatEventHandler);
eventTarget.addEventListener('chat:finish', chatFinishHandler);

return async () => {
eventTarget.removeEventListener('chat:start', chatStartHandler);
eventTarget.removeEventListener('chat', chatEventHandler);
eventTarget.removeEventListener('chat:finish', chatFinishHandler);

await stopRecordingCallback(false);
await stopCamera();
};
});
</script>

<audio id="audioElement" src="" style="display: none;" />

{#if $showCallOverlay}
<div class=" absolute w-full h-screen max-h-[100dvh] flex z-[999] overflow-hidden">
<div
@@ -443,6 +443,24 @@ export const extractSentences = (text) => {
.filter((sentence) => sentence !== '');
};

export const extractSentencesForAudio = (text) => {
return extractSentences(text).reduce((mergedTexts, currentText) => {
const lastIndex = mergedTexts.length - 1;
if (lastIndex >= 0) {
const previousText = mergedTexts[lastIndex];
const wordCount = previousText.split(/\s+/).length;
if (wordCount < 2) {
mergedTexts[lastIndex] = previousText + ' ' + currentText;
} else {
mergedTexts.push(currentText);
}
} else {
mergedTexts.push(currentText);
}
return mergedTexts;
}, []);
};

export const blobToFile = (blob, fileName) => {
// Create a new File object from the Blob
const file = new File([blob], fileName, { type: blob.type });
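extractSentencesForAudio post-processes extractSentences so that any fragment shorter than two words is merged with the sentence that follows, which keeps one-word snippets from being sent to TTS as separate audio clips. A hedged usage sketch, assuming extractSentences splits on sentence-ending punctuation:

// Sketch (illustrative): a one-word opener is expected to merge into the next sentence.
import { extractSentencesForAudio } from '$lib/utils';

// Expected shape of the result:
// ['Hi. Here is a longer sentence for the audio queue.', 'And another one.']
console.log(
	extractSentencesForAudio('Hi. Here is a longer sentence for the audio queue. And another one.')
);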