feat: voice input refactor

Timothy J. Baek 2024-06-06 20:33:23 -07:00
parent 72e1615fe1
commit f97f6601f7
3 changed files with 842 additions and 603 deletions

src/lib/components/chat/MessageInput.svelte

@@ -11,8 +11,6 @@
} from '$lib/apis/rag';
import { SUPPORTED_FILE_TYPE, SUPPORTED_FILE_EXTENSIONS, WEBUI_BASE_URL } from '$lib/constants';
import { transcribeAudio } from '$lib/apis/audio';
import Prompts from './MessageInput/PromptCommands.svelte';
import Suggestions from './MessageInput/Suggestions.svelte';
import AddFilesPlaceholder from '../AddFilesPlaceholder.svelte';
@@ -21,7 +19,8 @@
import Tooltip from '../common/Tooltip.svelte';
import XMark from '$lib/components/icons/XMark.svelte';
import InputMenu from './MessageInput/InputMenu.svelte';
import { t } from 'i18next';
import Headphone from '../icons/Headphone.svelte';
import VoiceRecording from './MessageInput/VoiceRecording.svelte';
const i18n = getContext('i18n');
@@ -33,6 +32,8 @@
export let atSelectedModel: Model | undefined;
export let selectedModels: [''];
let recording = false;
let chatTextAreaElement: HTMLTextAreaElement;
let filesInputElement;
@@ -48,14 +49,11 @@
export let files = [];
export let speechRecognitionEnabled = true;
export let webSearchEnabled = false;
export let prompt = '';
export let messages = [];
let speechRecognition;
let visionCapableModels = [];
$: visionCapableModels = [...(atSelectedModel ? [atSelectedModel] : selectedModels)].filter(
(model) => $models.find((m) => m.id === model)?.info?.meta?.capabilities?.vision ?? true
@@ -68,176 +66,11 @@
}
}
let mediaRecorder;
let audioChunks = [];
let isRecording = false;
const MIN_DECIBELS = -45;
const scrollToBottom = () => {
const element = document.getElementById('messages-container');
element.scrollTop = element.scrollHeight;
};
const startRecording = async () => {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
mediaRecorder = new MediaRecorder(stream);
mediaRecorder.onstart = () => {
isRecording = true;
console.log('Recording started');
};
mediaRecorder.ondataavailable = (event) => audioChunks.push(event.data);
mediaRecorder.onstop = async () => {
isRecording = false;
console.log('Recording stopped');
// Create a blob from the audio chunks
const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
const file = blobToFile(audioBlob, 'recording.wav');
const res = await transcribeAudio(localStorage.token, file).catch((error) => {
toast.error(error);
return null;
});
if (res) {
prompt = res.text;
await tick();
chatTextAreaElement?.focus();
if (prompt !== '' && $settings?.speechAutoSend === true) {
submitPrompt(prompt, user);
}
}
// saveRecording(audioBlob);
audioChunks = [];
};
// Start recording
mediaRecorder.start();
// Monitor silence
monitorSilence(stream);
};
const monitorSilence = (stream) => {
const audioContext = new AudioContext();
const audioStreamSource = audioContext.createMediaStreamSource(stream);
const analyser = audioContext.createAnalyser();
analyser.minDecibels = MIN_DECIBELS;
audioStreamSource.connect(analyser);
const bufferLength = analyser.frequencyBinCount;
const domainData = new Uint8Array(bufferLength);
let lastSoundTime = Date.now();
const detectSound = () => {
analyser.getByteFrequencyData(domainData);
if (domainData.some((value) => value > 0)) {
lastSoundTime = Date.now();
}
if (isRecording && Date.now() - lastSoundTime > 3000) {
mediaRecorder.stop();
audioContext.close();
return;
}
window.requestAnimationFrame(detectSound);
};
window.requestAnimationFrame(detectSound);
};
const saveRecording = (blob) => {
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
document.body.appendChild(a);
a.style = 'display: none';
a.href = url;
a.download = 'recording.wav';
a.click();
window.URL.revokeObjectURL(url);
};
const speechRecognitionHandler = () => {
// Check if SpeechRecognition is supported
if (isRecording) {
if (speechRecognition) {
speechRecognition.stop();
}
if (mediaRecorder) {
mediaRecorder.stop();
}
} else {
isRecording = true;
if ($settings?.audio?.STTEngine ?? '' !== '') {
startRecording();
} else {
if ('SpeechRecognition' in window || 'webkitSpeechRecognition' in window) {
// Create a SpeechRecognition object
speechRecognition = new (window.SpeechRecognition || window.webkitSpeechRecognition)();
// Set continuous to true for continuous recognition
speechRecognition.continuous = true;
// Set the timeout for turning off the recognition after inactivity (in milliseconds)
const inactivityTimeout = 3000; // 3 seconds
let timeoutId;
// Start recognition
speechRecognition.start();
// Event triggered when speech is recognized
speechRecognition.onresult = async (event) => {
// Clear the inactivity timeout
clearTimeout(timeoutId);
// Handle recognized speech
console.log(event);
const transcript = event.results[Object.keys(event.results).length - 1][0].transcript;
prompt = `${prompt}${transcript}`;
await tick();
chatTextAreaElement?.focus();
// Restart the inactivity timeout
timeoutId = setTimeout(() => {
console.log('Speech recognition turned off due to inactivity.');
speechRecognition.stop();
}, inactivityTimeout);
};
// Event triggered when recognition is ended
speechRecognition.onend = function () {
// Restart recognition after it ends
console.log('recognition ended');
isRecording = false;
if (prompt !== '' && $settings?.speechAutoSend === true) {
submitPrompt(prompt, user);
}
};
// Event triggered when an error occurs
speechRecognition.onerror = function (event) {
console.log(event);
toast.error($i18n.t(`Speech recognition error: {{error}}`, { error: event.error }));
isRecording = false;
};
} else {
toast.error($i18n.t('SpeechRecognition API is not supported in this browser.'));
}
}
}
};
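// Precedence note on the handler above: `$settings?.audio?.STTEngine ?? '' !== ''`
// parses as `$settings?.audio?.STTEngine ?? ('' !== '')`, since `!==` binds
// tighter than `??`. The branch still routes correctly only because a configured
// engine name is a truthy string:
//   undefined ?? ('' !== '')   // -> false: falls through to the Web Speech path
//   'whisper' ?? false         // -> 'whisper' (truthy): server-side STT path
//   ('whisper' ?? '') !== ''   // -> true: the comparison that was intended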
const uploadDoc = async (file) => {
console.log(file);
@@ -601,13 +434,37 @@
}
}}
/>
{#if recording}
<VoiceRecording
bind:recording
on:cancel={async () => {
recording = false;
await tick();
document.getElementById('chat-textarea')?.focus();
}}
on:confirm={async (e) => {
const response = e.detail;
prompt = `${prompt}${response} `;
recording = false;
await tick();
document.getElementById('chat-textarea')?.focus();
}}
/>
{:else}
<form
dir={$settings?.chatDirection ?? 'LTR'}
class=" flex flex-col relative w-full rounded-3xl px-1.5 bg-gray-50 dark:bg-gray-850 dark:text-gray-100"
class="w-full flex gap-1.5"
on:submit|preventDefault={() => {
// check if selectedModels support image input
submitPrompt(prompt, user);
}}
>
<div
class="flex-1 flex flex-col relative w-full rounded-3xl px-1.5 bg-gray-50 dark:bg-gray-850 dark:text-gray-100"
dir={$settings?.chatDirection ?? 'LTR'}
>
{#if files.length > 0}
<div class="mx-2 mt-2 mb-1 flex flex-wrap gap-2">
@@ -615,7 +472,11 @@
<div class=" relative group">
{#if file.type === 'image'}
<div class="relative">
<img src={file.url} alt="input" class=" h-16 w-16 rounded-xl object-cover" />
<img
src={file.url}
alt="input"
class=" h-16 w-16 rounded-xl object-cover"
/>
{#if atSelectedModel ? visionCapableModels.length === 0 : selectedModels.length !== visionCapableModels.length}
<Tooltip
className=" absolute top-1 left-1"
@@ -691,7 +552,12 @@
transform: translate(0);
}
}
</style><circle class="spinner_qM83" cx="4" cy="12" r="2.5" /><circle
</style><circle
class="spinner_qM83"
cx="4"
cy="12"
r="2.5"
/><circle
class="spinner_qM83 spinner_oXPr"
cx="12"
cy="12"
@@ -771,7 +637,7 @@
{/if}
<div class=" flex">
<div class=" ml-1 self-end mb-2 flex space-x-1">
<div class=" ml-0.5 self-end mb-1.5 flex space-x-1">
<InputMenu
bind:webSearchEnabled
uploadFilesHandler={() => {
@@ -783,7 +649,7 @@
}}
>
<button
class="bg-gray-50 hover:bg-gray-100 text-gray-800 dark:bg-gray-850 dark:text-white dark:hover:bg-gray-800 transition rounded-full p-1.5 outline-none focus:outline-none"
class="bg-gray-50 hover:bg-gray-100 text-gray-800 dark:bg-gray-850 dark:text-white dark:hover:bg-gray-800 transition rounded-full p-2 outline-none focus:outline-none"
type="button"
>
<svg
@@ -803,11 +669,9 @@
<textarea
id="chat-textarea"
bind:this={chatTextAreaElement}
class="scrollbar-hidden bg-gray-50 dark:bg-gray-850 dark:text-gray-100 outline-none w-full py-3 px-3 rounded-xl resize-none h-[48px]"
class="scrollbar-hidden bg-gray-50 dark:bg-gray-850 dark:text-gray-100 outline-none w-full py-3 px-2 rounded-xl resize-none h-[48px]"
placeholder={chatInputPlaceholder !== ''
? chatInputPlaceholder
: isRecording
? $i18n.t('Listening...')
: $i18n.t('Send a Message')}
bind:value={prompt}
on:keypress={(e) => {
@@ -974,58 +838,14 @@
<div class="self-end mb-2 flex space-x-1 mr-1">
{#if messages.length == 0 || messages.at(-1).done == true}
<Tooltip content={$i18n.t('Record voice')}>
{#if speechRecognitionEnabled}
<button
id="voice-input-button"
class=" text-gray-600 dark:text-gray-300 hover:bg-gray-50 dark:hover:bg-gray-850 transition rounded-full p-1.5 mr-0.5 self-center"
type="button"
on:click={() => {
speechRecognitionHandler();
recording = true;
}}
>
{#if isRecording}
<svg
class=" w-5 h-5 translate-y-[0.5px]"
fill="currentColor"
viewBox="0 0 24 24"
xmlns="http://www.w3.org/2000/svg"
><style>
.spinner_qM83 {
animation: spinner_8HQG 1.05s infinite;
}
.spinner_oXPr {
animation-delay: 0.1s;
}
.spinner_ZTLf {
animation-delay: 0.2s;
}
@keyframes spinner_8HQG {
0%,
57.14% {
animation-timing-function: cubic-bezier(0.33, 0.66, 0.66, 1);
transform: translate(0);
}
28.57% {
animation-timing-function: cubic-bezier(0.33, 0, 0.66, 0.33);
transform: translateY(-6px);
}
100% {
transform: translate(0);
}
}
</style><circle class="spinner_qM83" cx="4" cy="12" r="2.5" /><circle
class="spinner_qM83 spinner_oXPr"
cx="12"
cy="12"
r="2.5"
/><circle
class="spinner_qM83 spinner_ZTLf"
cx="20"
cy="12"
r="2.5"
/></svg
>
{:else}
<svg
xmlns="http://www.w3.org/2000/svg"
viewBox="0 0 20 20"
@@ -1037,17 +857,32 @@
d="M5.5 9.643a.75.75 0 00-1.5 0V10c0 3.06 2.29 5.585 5.25 5.954V17.5h-1.5a.75.75 0 000 1.5h4.5a.75.75 0 000-1.5h-1.5v-1.546A6.001 6.001 0 0016 10v-.357a.75.75 0 00-1.5 0V10a4.5 4.5 0 01-9 0v-.357z"
/>
</svg>
{/if}
</button>
{/if}
</Tooltip>
{/if}
</div>
</div>
</div>
<div class="flex items-end w-10">
{#if messages.length == 0 || messages.at(-1).done == true}
{#if prompt === ''}
<div class=" flex items-center mb-1">
<Tooltip content={$i18n.t('Call')}>
<button
class=" text-gray-600 dark:text-gray-300 hover:bg-gray-50 dark:hover:bg-gray-850 transition rounded-full p-2 self-center"
>
<Headphone className="size-6" />
</button>
</Tooltip>
</div>
{:else}
<div class=" flex items-center mb-1">
<Tooltip content={$i18n.t('Send message')}>
<button
id="send-message-button"
class="{prompt !== ''
? 'bg-black text-white hover:bg-gray-900 dark:bg-white dark:text-black dark:hover:bg-gray-100 '
: 'text-white bg-gray-200 dark:text-gray-900 dark:bg-gray-700 disabled'} transition rounded-full p-1.5 self-center"
: 'text-white bg-gray-200 dark:text-gray-900 dark:bg-gray-700 disabled'} transition rounded-full p-1.5 m-0.5 self-center"
type="submit"
disabled={prompt === ''}
>
@@ -1055,7 +890,7 @@
xmlns="http://www.w3.org/2000/svg"
viewBox="0 0 16 16"
fill="currentColor"
class="w-5 h-5"
class="size-6"
>
<path
fill-rule="evenodd"
@@ -1065,16 +900,21 @@
</svg>
</button>
</Tooltip>
</div>
{/if}
{:else}
<div class=" flex items-center mb-1.5">
<button
class="bg-white hover:bg-gray-100 text-gray-800 dark:bg-gray-700 dark:text-white dark:hover:bg-gray-800 transition rounded-full p-1.5"
on:click={stopResponse}
on:click={() => {
stopResponse();
}}
>
<svg
xmlns="http://www.w3.org/2000/svg"
viewBox="0 0 24 24"
fill="currentColor"
class="w-5 h-5"
class="size-6"
>
<path
fill-rule="evenodd"
@@ -1083,12 +923,13 @@
/>
</svg>
</button>
</div>
{/if}
</div>
</div>
</form>
{/if}
<div class="mt-1.5 text-xs text-gray-500 text-center">
<div class="mt-1.5 text-xs text-gray-500 text-center line-clamp-1">
{$i18n.t('LLMs can make mistakes. Verify important information.')}
</div>
</div>
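The refactor replaces the removed auto-stop-on-silence recorder with an explicit flow: MessageInput only toggles the recording flag, and the new VoiceRecording component records until the user cancels or confirms. The whole parent/child contract is two events, with the transcript arriving as e.detail. A typed sketch of that contract (the generic on createEventDispatcher is illustrative; the diff uses the untyped form):

// Child (VoiceRecording.svelte) side of the contract:
import { createEventDispatcher } from 'svelte';

const dispatch = createEventDispatcher<{
	cancel: void;    // user discarded the take
	confirm: string; // transcribed text, read by the parent as e.detail
}>();
// ...on transcription success: dispatch('confirm', res.text);

// Parent (MessageInput) side, as wired above:
// <VoiceRecording bind:recording on:confirm={(e) => (prompt = `${prompt}${e.detail} `)} />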

src/lib/components/chat/MessageInput/VoiceRecording.svelte

@@ -0,0 +1,378 @@
<script lang="ts">
import { toast } from 'svelte-sonner';
import { createEventDispatcher, tick, getContext } from 'svelte';
import { settings } from '$lib/stores';
import { blobToFile, calculateSHA256, findWordIndices } from '$lib/utils';
import { transcribeAudio } from '$lib/apis/audio';
const i18n = getContext('i18n');
const dispatch = createEventDispatcher();
export let recording = false;
let loading = false;
let confirmed = false;
let durationSeconds = 0;
let durationCounter = null;
const startDurationCounter = () => {
durationCounter = setInterval(() => {
durationSeconds++;
}, 1000);
};
const stopDurationCounter = () => {
clearInterval(durationCounter);
durationSeconds = 0;
};
$: if (recording) {
startRecording();
} else {
stopRecording();
}
const formatSeconds = (seconds) => {
const minutes = Math.floor(seconds / 60);
const remainingSeconds = seconds % 60;
const formattedSeconds = remainingSeconds < 10 ? `0${remainingSeconds}` : remainingSeconds;
return `${minutes}:${formattedSeconds}`;
};
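// e.g. formatSeconds(5) -> '0:05', formatSeconds(75) -> '1:15'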
let speechRecognition;
let mediaRecorder;
let audioChunks = [];
const MIN_DECIBELS = -45;
const VISUALIZER_BUFFER_LENGTH = 300;
let visualizerData = Array(VISUALIZER_BUFFER_LENGTH).fill(0);
// Function to calculate the RMS level from time domain data
const calculateRMS = (data: Uint8Array) => {
let sumSquares = 0;
for (let i = 0; i < data.length; i++) {
const normalizedValue = (data[i] - 128) / 128; // Normalize the data
sumSquares += normalizedValue * normalizedValue;
}
return Math.sqrt(sumSquares / data.length);
};
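// e.g. a silent frame (every sample at the 128 midpoint) yields 0, while a
// full-scale square wave alternating between 0 and 255 yields roughly 1.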
const normalizeRMS = (rms) => {
rms = rms * 10;
const exp = 1.5; // Adjust exponent value; values greater than 1 expand larger numbers more and compress smaller numbers more
const scaledRMS = Math.pow(rms, exp);
// Scale between 0.01 (1%) and 1.0 (100%)
return Math.min(1.0, Math.max(0.01, scaledRMS));
};
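// Worked example: RMS 0.02 -> (0.2)^1.5 ≈ 0.089, RMS 0.08 -> (0.8)^1.5 ≈ 0.716;
// the exponent stretches the loud end of the range, and the clamp keeps every
// visualizer bar between 1% and 100% height.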
const analyseAudio = (stream) => {
const audioContext = new AudioContext();
const audioStreamSource = audioContext.createMediaStreamSource(stream);
const analyser = audioContext.createAnalyser();
analyser.minDecibels = MIN_DECIBELS;
audioStreamSource.connect(analyser);
const bufferLength = analyser.frequencyBinCount;
const timeDomainData = new Uint8Array(analyser.fftSize);
const detectSound = () => {
const processFrame = () => {
if (recording && !loading) {
analyser.getByteTimeDomainData(timeDomainData);
// Calculate RMS level from time domain data
const rmsLevel = calculateRMS(timeDomainData);
// Push the calculated decibel level to visualizerData
visualizerData.push(normalizeRMS(rmsLevel));
// Ensure visualizerData array stays within the buffer length
if (visualizerData.length >= VISUALIZER_BUFFER_LENGTH) {
visualizerData.shift();
}
visualizerData = visualizerData;
}
window.requestAnimationFrame(processFrame);
};
window.requestAnimationFrame(processFrame);
};
detectSound();
};
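// Note: processFrame re-queues itself unconditionally, so this
// requestAnimationFrame loop keeps running after recording ends and the
// AudioContext is never closed; a stricter version would cancel the frame
// and call audioContext.close() once `recording` goes false.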
const transcribeHandler = async (audioBlob) => {
// Create a blob from the audio chunks
await tick();
const file = blobToFile(audioBlob, 'recording.wav');
const res = await transcribeAudio(localStorage.token, file).catch((error) => {
toast.error(error);
return null;
});
if (res) {
console.log(res.text);
dispatch('confirm', res.text);
}
};
const saveRecording = (blob) => {
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
document.body.appendChild(a);
a.style = 'display: none';
a.href = url;
a.download = 'recording.wav';
a.click();
window.URL.revokeObjectURL(url);
};
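// Note: saveRecording is not called anywhere in this component as committed;
// it mirrors the (also unused) download helper removed from MessageInput.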
const startRecording = async () => {
startDurationCounter();
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
mediaRecorder = new MediaRecorder(stream);
mediaRecorder.onstart = () => {
console.log('Recording started');
audioChunks = [];
analyseAudio(stream);
};
mediaRecorder.ondataavailable = (event) => audioChunks.push(event.data);
mediaRecorder.onstop = async () => {
console.log('Recording stopped');
if (confirmed) {
const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
await transcribeHandler(audioBlob);
confirmed = false;
loading = false;
}
audioChunks = [];
recording = false;
};
mediaRecorder.start();
};
const stopRecording = async () => {
if (recording && mediaRecorder) {
await mediaRecorder.stop();
}
stopDurationCounter();
audioChunks = [];
};
const confirmRecording = async () => {
loading = true;
confirmed = true;
if (recording && mediaRecorder) {
await mediaRecorder.stop();
}
clearInterval(durationCounter);
};
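// Note: MediaRecorder.stop() returns void, so the awaits in stopRecording and
// confirmRecording only yield a microtask; the actual cleanup and the
// transcription kick-off happen later in the async onstop handler.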
</script>
<div
class="{loading
? ' bg-gray-100/50 dark:bg-gray-850/50'
: 'bg-indigo-300/10 dark:bg-indigo-500/10 '} rounded-full flex p-2.5"
>
<div class="flex items-center mr-1">
<button
type="button"
class="p-1.5
{loading
? ' bg-gray-200 dark:bg-gray-700/50'
: 'bg-indigo-400/20 text-indigo-600 dark:text-indigo-300 '}
rounded-full"
on:click={async () => {
dispatch('cancel');
}}
>
<svg
xmlns="http://www.w3.org/2000/svg"
fill="none"
viewBox="0 0 24 24"
stroke-width="3"
stroke="currentColor"
class="size-4"
>
<path stroke-linecap="round" stroke-linejoin="round" d="M6 18 18 6M6 6l12 12" />
</svg>
</button>
</div>
<div
class="flex flex-1 self-center items-center justify-between ml-2 mx-1 overflow-hidden h-6"
dir="rtl"
>
<div class="flex-1 flex items-center gap-0.5 h-6">
{#each visualizerData.slice().reverse() as rms}
<div
class="w-[2px]
{loading
? ' bg-gray-500 dark:bg-gray-400 '
: 'bg-indigo-500 dark:bg-indigo-400 '}
inline-block h-full"
style="height: {Math.min(100, Math.max(14, rms * 100))}%;"
/>
{/each}
</div>
</div>
<div class=" mx-1.5 pr-1 flex justify-center items-center">
<div
class="text-sm
{loading ? ' text-gray-500 dark:text-gray-400 ' : ' text-indigo-400 '}
font-medium flex-1 mx-auto text-center"
>
{formatSeconds(durationSeconds)}
</div>
</div>
<div class="flex items-center mr-1">
{#if loading}
<div class=" text-gray-500 rounded-full cursor-not-allowed">
<svg
width="24"
height="24"
viewBox="0 0 24 24"
xmlns="http://www.w3.org/2000/svg"
fill="currentColor"
><style>
.spinner_OSmW {
transform-origin: center;
animation: spinner_T6mA 0.75s step-end infinite;
}
@keyframes spinner_T6mA {
8.3% {
transform: rotate(30deg);
}
16.6% {
transform: rotate(60deg);
}
25% {
transform: rotate(90deg);
}
33.3% {
transform: rotate(120deg);
}
41.6% {
transform: rotate(150deg);
}
50% {
transform: rotate(180deg);
}
58.3% {
transform: rotate(210deg);
}
66.6% {
transform: rotate(240deg);
}
75% {
transform: rotate(270deg);
}
83.3% {
transform: rotate(300deg);
}
91.6% {
transform: rotate(330deg);
}
100% {
transform: rotate(360deg);
}
}
</style><g class="spinner_OSmW"
><rect x="11" y="1" width="2" height="5" opacity=".14" /><rect
x="11"
y="1"
width="2"
height="5"
transform="rotate(30 12 12)"
opacity=".29"
/><rect
x="11"
y="1"
width="2"
height="5"
transform="rotate(60 12 12)"
opacity=".43"
/><rect
x="11"
y="1"
width="2"
height="5"
transform="rotate(90 12 12)"
opacity=".57"
/><rect
x="11"
y="1"
width="2"
height="5"
transform="rotate(120 12 12)"
opacity=".71"
/><rect
x="11"
y="1"
width="2"
height="5"
transform="rotate(150 12 12)"
opacity=".86"
/><rect x="11" y="1" width="2" height="5" transform="rotate(180 12 12)" /></g
></svg
>
</div>
{:else}
<button
type="button"
class="p-1.5 bg-indigo-500 text-white dark:bg-indigo-500 dark:text-blue-950 rounded-full"
on:click={async () => {
await confirmRecording();
}}
>
<svg
xmlns="http://www.w3.org/2000/svg"
fill="none"
viewBox="0 0 24 24"
stroke-width="2.5"
stroke="currentColor"
class="size-4"
>
<path stroke-linecap="round" stroke-linejoin="round" d="m4.5 12.75 6 6 9-13.5" />
</svg>
</button>
{/if}
</div>
</div>
<style>
.visualizer {
display: flex;
height: 100%;
}
.visualizer-bar {
width: 2px;
background-color: #4a5aba; /* or whatever color you need */
}
</style>
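One caveat that carries over from the old implementation: new Blob(audioChunks, { type: 'audio/wav' }) only labels the container. MediaRecorder encodes with whatever the browser supports (typically audio/webm;codecs=opus in Chromium and Firefox) and does not transcode to WAV, so the upload works only because transcription backends generally detect the real codec from the data rather than from the label. A defensive sketch that keeps the label honest (the mimeType handling here is illustrative, not part of this commit):

// Prefer a container the browser can actually produce, and label the blob with it.
const mimeType = MediaRecorder.isTypeSupported('audio/webm') ? 'audio/webm' : '';
const recorder = new MediaRecorder(stream, mimeType ? { mimeType } : undefined);
// ...collect chunks in ondataavailable as above, then:
const audioBlob = new Blob(audioChunks, { type: recorder.mimeType || 'audio/webm' });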

src/lib/components/icons/Headphone.svelte

@@ -0,0 +1,20 @@
<script lang="ts">
export let className = 'w-4 h-4';
export let strokeWidth = '0';
</script>
<svg
aria-hidden="true"
xmlns="http://www.w3.org/2000/svg"
fill="currentColor"
viewBox="0 0 24 24"
stroke-width={strokeWidth}
stroke="currentColor"
class={className}
>
<path
fill-rule="evenodd"
d="M12 5a7 7 0 0 0-7 7v1.17c.313-.11.65-.17 1-.17h2a1 1 0 0 1 1 1v6a1 1 0 0 1-1 1H6a3 3 0 0 1-3-3v-6a9 9 0 0 1 18 0v6a3 3 0 0 1-3 3h-2a1 1 0 0 1-1-1v-6a1 1 0 0 1 1-1h2c.35 0 .687.06 1 .17V12a7 7 0 0 0-7-7Z"
clip-rule="evenodd"
/>
</svg>
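Usage matches the other icon components: the call site in MessageInput passes className="size-6" and leaves strokeWidth at its '0' default, so the filled path renders without an outline.

<Headphone className="size-6" />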