Merge pull request #2900 from open-webui/voice

feat: voice note
This commit is contained in:
Timothy Jaeryang Baek 2024-06-06 21:01:41 -07:00 committed by GitHub
commit ff4cf16742
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 893 additions and 627 deletions

View File

@ -17,7 +17,7 @@ from fastapi.middleware.cors import CORSMiddleware
from faster_whisper import WhisperModel
from pydantic import BaseModel
import uuid
import requests
import hashlib
from pathlib import Path
@ -181,8 +181,15 @@ def transcribe(
)
try:
filename = file.filename
file_path = f"{UPLOAD_DIR}/{filename}"
ext = file.filename.split(".")[-1]
id = uuid.uuid4()
filename = f"{id}.{ext}"
file_dir = f"{CACHE_DIR}/audio/transcriptions"
os.makedirs(file_dir, exist_ok=True)
file_path = f"{file_dir}/{filename}"
contents = file.file.read()
with open(file_path, "wb") as f:
f.write(contents)
@ -215,6 +222,11 @@ def transcribe(
transcript = "".join([segment.text for segment in list(segments)])
# save the transcript to a json file
transcript_file = f"{file_dir}/{id}.json"
with open(transcript_file, "w") as f:
json.dump({"transcript": transcript}, f)
return {"text": transcript.strip()}
except Exception as e:

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,394 @@
<script lang="ts">
import { toast } from 'svelte-sonner';
import { createEventDispatcher, tick, getContext } from 'svelte';
import { settings } from '$lib/stores';
import { blobToFile, calculateSHA256, findWordIndices } from '$lib/utils';
import { transcribeAudio } from '$lib/apis/audio';
const i18n = getContext('i18n');
const dispatch = createEventDispatcher();
export let recording = false;
let loading = false;
let confirmed = false;
let durationSeconds = 0;
let durationCounter = null;
const startDurationCounter = () => {
durationCounter = setInterval(() => {
durationSeconds++;
}, 1000);
};
const stopDurationCounter = () => {
clearInterval(durationCounter);
durationSeconds = 0;
};
$: if (recording) {
startRecording();
} else {
stopRecording();
}
const formatSeconds = (seconds) => {
const minutes = Math.floor(seconds / 60);
const remainingSeconds = seconds % 60;
const formattedSeconds = remainingSeconds < 10 ? `0${remainingSeconds}` : remainingSeconds;
return `${minutes}:${formattedSeconds}`;
};
let speechRecognition;
let mediaRecorder;
let audioChunks = [];
const MIN_DECIBELS = -45;
const VISUALIZER_BUFFER_LENGTH = 300;
let visualizerData = Array(VISUALIZER_BUFFER_LENGTH).fill(0);
// Function to calculate the RMS level from time domain data
const calculateRMS = (data: Uint8Array) => {
let sumSquares = 0;
for (let i = 0; i < data.length; i++) {
const normalizedValue = (data[i] - 128) / 128; // Normalize the data
sumSquares += normalizedValue * normalizedValue;
}
return Math.sqrt(sumSquares / data.length);
};
const normalizeRMS = (rms) => {
rms = rms * 10;
const exp = 1.5; // Adjust exponent value; values greater than 1 expand larger numbers more and compress smaller numbers more
const scaledRMS = Math.pow(rms, exp);
// Scale between 0.01 (1%) and 1.0 (100%)
return Math.min(1.0, Math.max(0.01, scaledRMS));
};
const analyseAudio = (stream) => {
const audioContext = new AudioContext();
const audioStreamSource = audioContext.createMediaStreamSource(stream);
const analyser = audioContext.createAnalyser();
analyser.minDecibels = MIN_DECIBELS;
audioStreamSource.connect(analyser);
const bufferLength = analyser.frequencyBinCount;
const domainData = new Uint8Array(bufferLength);
const timeDomainData = new Uint8Array(analyser.fftSize);
let lastSoundTime = Date.now();
const detectSound = () => {
const processFrame = () => {
if (recording && !loading) {
analyser.getByteTimeDomainData(timeDomainData);
analyser.getByteFrequencyData(domainData);
// Calculate RMS level from time domain data
const rmsLevel = calculateRMS(timeDomainData);
// Push the calculated decibel level to visualizerData
visualizerData.push(normalizeRMS(rmsLevel));
// Ensure visualizerData array stays within the buffer length
if (visualizerData.length >= VISUALIZER_BUFFER_LENGTH) {
visualizerData.shift();
}
visualizerData = visualizerData;
if (domainData.some((value) => value > 0)) {
lastSoundTime = Date.now();
}
if (recording && Date.now() - lastSoundTime > 3000) {
if ($settings?.speechAutoSend ?? false) {
confirmRecording();
}
}
}
window.requestAnimationFrame(processFrame);
};
window.requestAnimationFrame(processFrame);
};
detectSound();
};
const transcribeHandler = async (audioBlob) => {
// Create a blob from the audio chunks
await tick();
const file = blobToFile(audioBlob, 'recording.wav');
const res = await transcribeAudio(localStorage.token, file).catch((error) => {
toast.error(error);
return null;
});
if (res) {
console.log(res.text);
dispatch('confirm', res.text);
}
};
const saveRecording = (blob) => {
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
document.body.appendChild(a);
a.style = 'display: none';
a.href = url;
a.download = 'recording.wav';
a.click();
window.URL.revokeObjectURL(url);
};
const startRecording = async () => {
startDurationCounter();
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
mediaRecorder = new MediaRecorder(stream);
mediaRecorder.onstart = () => {
console.log('Recording started');
audioChunks = [];
analyseAudio(stream);
};
mediaRecorder.ondataavailable = (event) => audioChunks.push(event.data);
mediaRecorder.onstop = async () => {
console.log('Recording stopped');
if (confirmed) {
const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
await transcribeHandler(audioBlob);
confirmed = false;
loading = false;
}
audioChunks = [];
recording = false;
};
mediaRecorder.start();
};
const stopRecording = async () => {
if (recording && mediaRecorder) {
await mediaRecorder.stop();
}
stopDurationCounter();
audioChunks = [];
};
const confirmRecording = async () => {
loading = true;
confirmed = true;
if (recording && mediaRecorder) {
await mediaRecorder.stop();
}
clearInterval(durationCounter);
};
</script>
<div
class="{loading
? ' bg-gray-100/50 dark:bg-gray-850/50'
: 'bg-indigo-300/10 dark:bg-indigo-500/10 '} rounded-full flex p-2.5"
>
<div class="flex items-center mr-1">
<button
type="button"
class="p-1.5
{loading
? ' bg-gray-200 dark:bg-gray-700/50'
: 'bg-indigo-400/20 text-indigo-600 dark:text-indigo-300 '}
rounded-full"
on:click={async () => {
dispatch('cancel');
stopRecording();
}}
>
<svg
xmlns="http://www.w3.org/2000/svg"
fill="none"
viewBox="0 0 24 24"
stroke-width="3"
stroke="currentColor"
class="size-4"
>
<path stroke-linecap="round" stroke-linejoin="round" d="M6 18 18 6M6 6l12 12" />
</svg>
</button>
</div>
<div
class="flex flex-1 self-center items-center justify-between ml-2 mx-1 overflow-hidden h-6"
dir="rtl"
>
<div class="flex-1 flex items-center gap-0.5 h-6">
{#each visualizerData.slice().reverse() as rms}
<div
class="w-[2px]
{loading
? ' bg-gray-500 dark:bg-gray-400 '
: 'bg-indigo-500 dark:bg-indigo-400 '}
inline-block h-full"
style="height: {Math.min(100, Math.max(14, rms * 100))}%;"
/>
{/each}
</div>
</div>
<div class=" mx-1.5 pr-1 flex justify-center items-center">
<div
class="text-sm
{loading ? ' text-gray-500 dark:text-gray-400 ' : ' text-indigo-400 '}
font-medium flex-1 mx-auto text-center"
>
{formatSeconds(durationSeconds)}
</div>
</div>
<div class="flex items-center mr-1">
{#if loading}
<div class=" text-gray-500 rounded-full cursor-not-allowed">
<svg
width="24"
height="24"
viewBox="0 0 24 24"
xmlns="http://www.w3.org/2000/svg"
fill="currentColor"
><style>
.spinner_OSmW {
transform-origin: center;
animation: spinner_T6mA 0.75s step-end infinite;
}
@keyframes spinner_T6mA {
8.3% {
transform: rotate(30deg);
}
16.6% {
transform: rotate(60deg);
}
25% {
transform: rotate(90deg);
}
33.3% {
transform: rotate(120deg);
}
41.6% {
transform: rotate(150deg);
}
50% {
transform: rotate(180deg);
}
58.3% {
transform: rotate(210deg);
}
66.6% {
transform: rotate(240deg);
}
75% {
transform: rotate(270deg);
}
83.3% {
transform: rotate(300deg);
}
91.6% {
transform: rotate(330deg);
}
100% {
transform: rotate(360deg);
}
}
</style><g class="spinner_OSmW"
><rect x="11" y="1" width="2" height="5" opacity=".14" /><rect
x="11"
y="1"
width="2"
height="5"
transform="rotate(30 12 12)"
opacity=".29"
/><rect
x="11"
y="1"
width="2"
height="5"
transform="rotate(60 12 12)"
opacity=".43"
/><rect
x="11"
y="1"
width="2"
height="5"
transform="rotate(90 12 12)"
opacity=".57"
/><rect
x="11"
y="1"
width="2"
height="5"
transform="rotate(120 12 12)"
opacity=".71"
/><rect
x="11"
y="1"
width="2"
height="5"
transform="rotate(150 12 12)"
opacity=".86"
/><rect x="11" y="1" width="2" height="5" transform="rotate(180 12 12)" /></g
></svg
>
</div>
{:else}
<button
type="button"
class="p-1.5 bg-indigo-500 text-white dark:bg-indigo-500 dark:text-blue-950 rounded-full"
on:click={async () => {
await confirmRecording();
}}
>
<svg
xmlns="http://www.w3.org/2000/svg"
fill="none"
viewBox="0 0 24 24"
stroke-width="2.5"
stroke="currentColor"
class="size-4"
>
<path stroke-linecap="round" stroke-linejoin="round" d="m4.5 12.75 6 6 9-13.5" />
</svg>
</button>
{/if}
</div>
</div>
<style>
.visualizer {
display: flex;
height: 100%;
}
.visualizer-bar {
width: 2px;
background-color: #4a5aba; /* or whatever color you need */
}
</style>

View File

@ -168,7 +168,7 @@
<select
class="dark:bg-gray-900 w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
bind:value={STTEngine}
placeholder="Select a mode"
placeholder="Select an engine"
on:change={(e) => {
if (e.target.value !== '') {
navigator.mediaDevices.getUserMedia({ audio: true }).catch(function (err) {
@ -182,30 +182,12 @@
}
}}
>
<option value="">{$i18n.t('Default (Web API)')}</option>
<option value="whisper-local">{$i18n.t('Whisper (Local)')}</option>
<option value="">{$i18n.t('Default (Whisper)')}</option>
<option value="web">{$i18n.t('Web API')}</option>
</select>
</div>
</div>
<div class=" py-0.5 flex w-full justify-between">
<div class=" self-center text-xs font-medium">{$i18n.t('Conversation Mode')}</div>
<button
class="p-1 px-3 text-xs flex rounded transition"
on:click={() => {
toggleConversationMode();
}}
type="button"
>
{#if conversationMode === true}
<span class="ml-2 self-center">{$i18n.t('On')}</span>
{:else}
<span class="ml-2 self-center">{$i18n.t('Off')}</span>
{/if}
</button>
</div>
<div class=" py-0.5 flex w-full justify-between">
<div class=" self-center text-xs font-medium">
{$i18n.t('Auto-send input after 3 sec.')}

View File

@ -0,0 +1,20 @@
<script lang="ts">
export let className = 'w-4 h-4';
export let strokeWidth = '0';
</script>
<svg
aria-hidden="true"
xmlns="http://www.w3.org/2000/svg"
fill="currentColor"
viewBox="0 0 24 24"
stroke-width={strokeWidth}
stroke="currentColor"
class={className}
>
<path
fill-rule="evenodd"
d="M12 5a7 7 0 0 0-7 7v1.17c.313-.11.65-.17 1-.17h2a1 1 0 0 1 1 1v6a1 1 0 0 1-1 1H6a3 3 0 0 1-3-3v-6a9 9 0 0 1 18 0v6a3 3 0 0 1-3 3h-2a1 1 0 0 1-1-1v-6a1 1 0 0 1 1-1h2c.35 0 .687.06 1 .17V12a7 7 0 0 0-7-7Z"
clip-rule="evenodd"
/>
</svg>