From 3967c34261d2f8cabbe000403327d4fc6648eaba Mon Sep 17 00:00:00 2001 From: kiosion Date: Sat, 24 Aug 2024 20:35:42 -0400 Subject: [PATCH] feat: Add control for how message content is split for TTS generation reqs --- backend/apps/audio/main.py | 6 + backend/config.py | 6 + backend/main.py | 1 + src/lib/apis/audio/index.ts | 6 +- .../components/admin/Settings/Audio.svelte | 61 +++- .../chat/Messages/ResponseMessage.svelte | 287 ++++++++++-------- src/lib/types/index.ts | 6 + src/lib/utils/index.ts | 61 +++- 8 files changed, 277 insertions(+), 157 deletions(-) diff --git a/backend/apps/audio/main.py b/backend/apps/audio/main.py index d66a9fa11..46be15364 100644 --- a/backend/apps/audio/main.py +++ b/backend/apps/audio/main.py @@ -37,6 +37,7 @@ from config import ( AUDIO_TTS_ENGINE, AUDIO_TTS_MODEL, AUDIO_TTS_VOICE, + AUDIO_TTS_SPLIT_ON, AppConfig, CORS_ALLOW_ORIGIN, ) @@ -72,6 +73,7 @@ app.state.config.TTS_ENGINE = AUDIO_TTS_ENGINE app.state.config.TTS_MODEL = AUDIO_TTS_MODEL app.state.config.TTS_VOICE = AUDIO_TTS_VOICE app.state.config.TTS_API_KEY = AUDIO_TTS_API_KEY +app.state.config.TTS_SPLIT_ON = AUDIO_TTS_SPLIT_ON # setting device type for whisper model whisper_device_type = DEVICE_TYPE if DEVICE_TYPE and DEVICE_TYPE == "cuda" else "cpu" @@ -88,6 +90,7 @@ class TTSConfigForm(BaseModel): ENGINE: str MODEL: str VOICE: str + SPLIT_ON: str class STTConfigForm(BaseModel): @@ -139,6 +142,7 @@ async def get_audio_config(user=Depends(get_admin_user)): "ENGINE": app.state.config.TTS_ENGINE, "MODEL": app.state.config.TTS_MODEL, "VOICE": app.state.config.TTS_VOICE, + "SPLIT_ON": app.state.config.TTS_SPLIT_ON, }, "stt": { "OPENAI_API_BASE_URL": app.state.config.STT_OPENAI_API_BASE_URL, @@ -159,6 +163,7 @@ async def update_audio_config( app.state.config.TTS_ENGINE = form_data.tts.ENGINE app.state.config.TTS_MODEL = form_data.tts.MODEL app.state.config.TTS_VOICE = form_data.tts.VOICE + app.state.config.TTS_SPLIT_ON = form_data.tts.SPLIT_ON app.state.config.STT_OPENAI_API_BASE_URL = form_data.stt.OPENAI_API_BASE_URL app.state.config.STT_OPENAI_API_KEY = form_data.stt.OPENAI_API_KEY @@ -173,6 +178,7 @@ async def update_audio_config( "ENGINE": app.state.config.TTS_ENGINE, "MODEL": app.state.config.TTS_MODEL, "VOICE": app.state.config.TTS_VOICE, + "SPLIT_ON": app.state.config.TTS_SPLIT_ON, }, "stt": { "OPENAI_API_BASE_URL": app.state.config.STT_OPENAI_API_BASE_URL, diff --git a/backend/config.py b/backend/config.py index adb2e1fd5..5cf8ba21a 100644 --- a/backend/config.py +++ b/backend/config.py @@ -1484,3 +1484,9 @@ AUDIO_TTS_VOICE = PersistentConfig( "audio.tts.voice", os.getenv("AUDIO_TTS_VOICE", "alloy"), # OpenAI default voice ) + +AUDIO_TTS_SPLIT_ON = PersistentConfig( + "AUDIO_TTS_SPLIT_ON", + "audio.tts.split_on", + os.getenv("AUDIO_TTS_SPLIT_ON", "punctuation"), +) diff --git a/backend/main.py b/backend/main.py index b8ed68111..349afe25f 100644 --- a/backend/main.py +++ b/backend/main.py @@ -1924,6 +1924,7 @@ async def get_app_config(request: Request): "tts": { "engine": audio_app.state.config.TTS_ENGINE, "voice": audio_app.state.config.TTS_VOICE, + "split_on": audio_app.state.config.TTS_SPLIT_ON, }, "stt": { "engine": audio_app.state.config.STT_ENGINE, diff --git a/src/lib/apis/audio/index.ts b/src/lib/apis/audio/index.ts index af09af990..5cd6ab949 100644 --- a/src/lib/apis/audio/index.ts +++ b/src/lib/apis/audio/index.ts @@ -132,7 +132,11 @@ export const synthesizeOpenAISpeech = async ( return res; }; -export const getModels = async (token: string = '') => { +interface 
AvailableModelsResponse {
+	models: { name: string; id: string }[] | { id: string }[];
+}
+
+export const getModels = async (token: string = ''): Promise<AvailableModelsResponse> => {
 	let error = null;
 
 	const res = await fetch(`${AUDIO_API_BASE_URL}/models`, {
diff --git a/src/lib/components/admin/Settings/Audio.svelte b/src/lib/components/admin/Settings/Audio.svelte
index 7c3300568..7302558be 100644
--- a/src/lib/components/admin/Settings/Audio.svelte
+++ b/src/lib/components/admin/Settings/Audio.svelte
@@ -10,31 +10,36 @@
 		getModels as _getModels,
 		getVoices as _getVoices
 	} from '$lib/apis/audio';
-	import { user, settings, config } from '$lib/stores';
+	import { config } from '$lib/stores';
 
 	import SensitiveInput from '$lib/components/common/SensitiveInput.svelte';
 
-	const i18n = getContext('i18n');
+	import { TTS_RESPONSE_SPLIT } from '$lib/types';
 
-	export let saveHandler: Function;
+	import type { Writable } from 'svelte/store';
+	import type { i18n as i18nType } from 'i18next';
+
+	const i18n = getContext<Writable<i18nType>>('i18n');
+
+	export let saveHandler: () => void;
 
 	// Audio
-
 	let TTS_OPENAI_API_BASE_URL = '';
 	let TTS_OPENAI_API_KEY = '';
 	let TTS_API_KEY = '';
 	let TTS_ENGINE = '';
 	let TTS_MODEL = '';
 	let TTS_VOICE = '';
+	let TTS_SPLIT_ON: TTS_RESPONSE_SPLIT = TTS_RESPONSE_SPLIT.PUNCTUATION;
 
 	let STT_OPENAI_API_BASE_URL = '';
 	let STT_OPENAI_API_KEY = '';
 	let STT_ENGINE = '';
 	let STT_MODEL = '';
 
-	let voices = [];
-	let models = [];
-	let nonLocalVoices = false;
+	// eslint-disable-next-line no-undef
+	let voices: SpeechSynthesisVoice[] = [];
+	let models: Awaited<ReturnType<typeof _getModels>>['models'] = [];
 
 	const getModels = async () => {
 		if (TTS_ENGINE === '') {
@@ -53,8 +58,8 @@
 
 	const getVoices = async () => {
 		if (TTS_ENGINE === '') {
-			const getVoicesLoop = setInterval(async () => {
-				voices = await speechSynthesis.getVoices();
+			const getVoicesLoop = setInterval(() => {
+				voices = speechSynthesis.getVoices();
 
 				// do your loop
 				if (voices.length > 0) {
@@ -81,7 +86,8 @@
 					API_KEY: TTS_API_KEY,
 					ENGINE: TTS_ENGINE,
 					MODEL: TTS_MODEL,
-					VOICE: TTS_VOICE
+					VOICE: TTS_VOICE,
+					SPLIT_ON: TTS_SPLIT_ON
 				},
 				stt: {
 					OPENAI_API_BASE_URL: STT_OPENAI_API_BASE_URL,
@@ -92,9 +98,8 @@
 		});
 
 		if (res) {
-			toast.success($i18n.t('Audio settings updated successfully'));
-
-			config.set(await getBackendConfig());
+			saveHandler();
+			getBackendConfig().then(config.set).catch(() => {});
 		}
 	};
 
@@ -111,6 +116,8 @@
 			TTS_MODEL = res.tts.MODEL;
 			TTS_VOICE = res.tts.VOICE;
 
+			TTS_SPLIT_ON = res.tts.SPLIT_ON || TTS_RESPONSE_SPLIT.PUNCTUATION;
+
 			STT_OPENAI_API_BASE_URL = res.stt.OPENAI_API_BASE_URL;
 			STT_OPENAI_API_KEY = res.stt.OPENAI_API_KEY;
 
@@ -139,7 +146,7 @@
 							<div class=" mb-1 text-sm font-medium">{$i18n.t('Speech-to-Text Engine')}</div>
 							<div class="flex items-center relative">
 								<select
 									class="dark:bg-gray-900 w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
 									bind:value={STT_ENGINE}
 									on:change={(e) => {
@@ -203,7 +210,7 @@
 								await getVoices();
 								await getModels();
 
-								if (e.target.value === 'openai') {
+								if (e.target?.value === 'openai') {
 									TTS_VOICE = 'alloy';
 									TTS_MODEL = 'tts-1';
 								} else {
@@ -351,6 +358,28 @@
 				{/if}
 
+
+				<hr class=" dark:border-gray-850" />
+
+				<div>
+					<div class=" mb-1 text-sm font-medium">{$i18n.t('Response splitting')}</div>
+					<div class="flex items-center relative">
+						<select
+							class="dark:bg-gray-900 w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
+							bind:value={TTS_SPLIT_ON}
+						>
+							{#each Object.values(TTS_RESPONSE_SPLIT) as split}
+								<option value={split}>{$i18n.t(split)}</option>
+							{/each}
+						</select>
+					</div>
+
+					<div class="mt-1 mb-1 text-xs text-gray-400 dark:text-gray-500">
+						{$i18n.t(
+							"Choose how to split response text for speech synthesis. 'Punctuation' splits by sentences, 'paragraphs' splits by paragraphs, and 'none' sends the response as a single string."
+						)}
+					</div>
+				</div>
 			</div>
 		</div>
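Note: saving the form above round-trips the whole TTS/STT config, with the new `SPLIT_ON` field riding along. A minimal sketch of that call, assuming the update route is mounted at `${AUDIO_API_BASE_URL}/config/update` and using the field names from the `TTSConfigForm`/`STTConfigForm` models in this patch; `setResponseSplitting` and the empty-string values are illustrative placeholders, not code from this PR (the component itself goes through the existing update helper in `$lib/apis/audio`):

```ts
import { AUDIO_API_BASE_URL } from '$lib/constants';
import { TTS_RESPONSE_SPLIT } from '$lib/types';

// Hypothetical helper: update only the response-splitting mode.
// A real caller would echo back the currently configured values
// instead of the '' placeholders below.
export const setResponseSplitting = async (token: string, splitOn: TTS_RESPONSE_SPLIT) => {
	const res = await fetch(`${AUDIO_API_BASE_URL}/config/update`, {
		method: 'POST',
		headers: {
			'Content-Type': 'application/json',
			Authorization: `Bearer ${token}`
		},
		body: JSON.stringify({
			tts: {
				OPENAI_API_BASE_URL: '',
				OPENAI_API_KEY: '',
				API_KEY: '',
				ENGINE: '',
				MODEL: '',
				VOICE: '',
				SPLIT_ON: splitOn // e.g. TTS_RESPONSE_SPLIT.PARAGRAPHS
			},
			stt: {
				OPENAI_API_BASE_URL: '',
				OPENAI_API_KEY: '',
				ENGINE: '',
				MODEL: ''
			}
		})
	});

	if (!res.ok) {
		throw new Error(`Audio config update failed: ${res.status}`);
	}

	return res.json();
};
```

The chat client then reads the saved value back from `/api/config` as `$config.audio.tts.split_on` (exposed in `backend/main.py` above) and switches on it in `ResponseMessage.svelte` below.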
diff --git a/src/lib/components/chat/Messages/ResponseMessage.svelte b/src/lib/components/chat/Messages/ResponseMessage.svelte
index eac388eb0..21daadb2c 100644
--- a/src/lib/components/chat/Messages/ResponseMessage.svelte
+++ b/src/lib/components/chat/Messages/ResponseMessage.svelte
@@ -2,11 +2,10 @@
 	import { toast } from 'svelte-sonner';
 	import dayjs from 'dayjs';
-	import { fade } from 'svelte/transition';
 	import { createEventDispatcher } from 'svelte';
 	import { onMount, tick, getContext } from 'svelte';
 
-	const i18n = getContext('i18n');
+	const i18n = getContext<Writable<i18nType>>('i18n');
 
 	const dispatch = createEventDispatcher();
 
@@ -15,20 +14,18 @@
 	import { imageGenerations } from '$lib/apis/images';
 	import {
 		approximateToHumanReadable,
-		extractSentences,
-		replaceTokens,
-		processResponseContent
+		extractParagraphsForAudio,
+		extractSentencesForAudio,
+		prepareTextForTTS
 	} from '$lib/utils';
 	import { WEBUI_BASE_URL } from '$lib/constants';
 
 	import Name from './Name.svelte';
 	import ProfileImage from './ProfileImage.svelte';
 	import Skeleton from './Skeleton.svelte';
-	import CodeBlock from './CodeBlock.svelte';
 	import Image from '$lib/components/common/Image.svelte';
 	import Tooltip from '$lib/components/common/Tooltip.svelte';
 	import RateComment from './RateComment.svelte';
-	import CitationsModal from '$lib/components/chat/Messages/CitationsModal.svelte';
 	import Spinner from '$lib/components/common/Spinner.svelte';
 	import WebSearchResults from './ResponseMessage/WebSearchResults.svelte';
 	import Sparkles from '$lib/components/icons/Sparkles.svelte';
@@ -36,7 +33,38 @@
 	import Error from './Error.svelte';
 	import Citations from './Citations.svelte';
 
-	export let message;
+	import type { Writable } from 'svelte/store';
+	import type { i18n as i18nType } from 'i18next';
+	import { TTS_RESPONSE_SPLIT } from '$lib/types';
+
+	interface MessageType {
+		id: string;
+		model: string;
+		content: string;
+		files?: { type: string; url: string }[];
+		timestamp: number;
+		role: string;
+		statusHistory?: { done: boolean; action: string; description: string; urls?: string[]; query?: string }[];
+		status?: { done: boolean; action: string; description: string; urls?: string[]; query?: string };
+		done: boolean;
+		error?: boolean | { content: string };
+		citations?: string[];
+		info?: {
+			openai?: boolean;
+			prompt_tokens?: number;
+			completion_tokens?: number;
+			total_tokens?: number;
+			eval_count?: number;
+			eval_duration?: number;
+			prompt_eval_count?: number;
+			prompt_eval_duration?: number;
+			total_duration?: number;
+			load_duration?: number;
+		};
+		annotation?: { type: string; rating: number };
+	}
+
+	export let message: MessageType;
 	export let siblings;
 
 	export let isLastMessage = true;
@@ -60,28 +88,33 @@
 	let editedContent = '';
 	let editTextAreaElement: HTMLTextAreaElement;
 
-	let sentencesAudio = {};
-	let speaking = null;
-	let speakingIdx = null;
+	let audioParts: Record<number, HTMLAudioElement | null> = {};
+	let speaking = false;
+	let speakingIdx: number | undefined;
 
 	let loadingSpeech = false;
 	let generatingImage = false;
 
 	let showRateComment = false;
 
-	const playAudio = (idx) => {
-		return new Promise((res) => {
+	const playAudio = (idx: number) => {
+		return new Promise<void>((res) => {
 			speakingIdx = idx;
-			const audio = sentencesAudio[idx];
+			const audio = audioParts[idx];
+
+			if (!audio) {
+				return res();
+			}
+
 			audio.play();
-			audio.onended = async (e) => {
+			audio.onended = async () => {
 				await new Promise((r) => setTimeout(r, 300));
 
-				if (Object.keys(sentencesAudio).length - 1 === idx) {
-					speaking = null;
+				if
(Object.keys(audioParts).length - 1 === idx) { + speaking = false; } - res(e); + res(); }; }); }; @@ -91,113 +124,119 @@ try { speechSynthesis.cancel(); - sentencesAudio[speakingIdx].pause(); - sentencesAudio[speakingIdx].currentTime = 0; + if (speakingIdx !== undefined && audioParts[speakingIdx]) { + audioParts[speakingIdx]!.pause(); + audioParts[speakingIdx]!.currentTime = 0; + } } catch {} - speaking = null; - speakingIdx = null; - } else { - if ((message?.content ?? '').trim() !== '') { - speaking = true; + speaking = false; + speakingIdx = undefined; + return; + } - if ($config.audio.tts.engine !== '') { - loadingSpeech = true; + if (!(message?.content ?? '').trim().length) { + toast.info($i18n.t('No content to speak')); + return; + } - const sentences = extractSentences(message.content).reduce((mergedTexts, currentText) => { - const lastIndex = mergedTexts.length - 1; - if (lastIndex >= 0) { - const previousText = mergedTexts[lastIndex]; - const wordCount = previousText.split(/\s+/).length; - if (wordCount < 2) { - mergedTexts[lastIndex] = previousText + ' ' + currentText; - } else { - mergedTexts.push(currentText); - } - } else { - mergedTexts.push(currentText); - } - return mergedTexts; - }, []); + speaking = true; - console.log(sentences); + if ($config.audio.tts.engine !== '') { + loadingSpeech = true; - if (sentences.length > 0) { - sentencesAudio = sentences.reduce((a, e, i, arr) => { - a[i] = null; - return a; - }, {}); + const preparedMessageContent: string[] = []; - let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately - - for (const [idx, sentence] of sentences.entries()) { - const res = await synthesizeOpenAISpeech( - localStorage.token, - $settings?.audio?.tts?.defaultVoice === $config.audio.tts.voice - ? ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice) - : $config?.audio?.tts?.voice, - sentence - ).catch((error) => { - toast.error(error); - - speaking = null; - loadingSpeech = false; - - return null; - }); - - if (res) { - const blob = await res.blob(); - const blobUrl = URL.createObjectURL(blob); - const audio = new Audio(blobUrl); - sentencesAudio[idx] = audio; - loadingSpeech = false; - lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx)); - } - } - } else { - speaking = null; - loadingSpeech = false; - } - } else { - let voices = []; - const getVoicesLoop = setInterval(async () => { - voices = await speechSynthesis.getVoices(); - if (voices.length > 0) { - clearInterval(getVoicesLoop); - - const voice = - voices - ?.filter( - (v) => - v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice) - ) - ?.at(0) ?? 
undefined; - - console.log(voice); - - const speak = new SpeechSynthesisUtterance(message.content); - - console.log(speak); - - speak.onend = () => { - speaking = null; - if ($settings.conversationMode) { - document.getElementById('voice-input-button')?.click(); - } - }; - - if (voice) { - speak.voice = voice; - } - - speechSynthesis.speak(speak); - } - }, 100); - } - } else { - toast.error($i18n.t('No content to speak')); + switch ($config.audio.tts.split_on) { + default: + case TTS_RESPONSE_SPLIT.PUNCTUATION: + preparedMessageContent.push(...extractSentencesForAudio(message.content)); + break; + case TTS_RESPONSE_SPLIT.PARAGRAPHS: + preparedMessageContent.push(...extractParagraphsForAudio(message.content)); + break; + case TTS_RESPONSE_SPLIT.NONE: + preparedMessageContent.push(prepareTextForTTS(message.content)); + break; } + + if (!preparedMessageContent.length) { + console.log('No content to speak'); + toast.info($i18n.t('No content to speak')); + + speaking = false; + loadingSpeech = false; + return; + } + + console.debug('Prepared message content for TTS', preparedMessageContent); + + audioParts = preparedMessageContent.reduce((acc, _sentence, idx) => { + acc[idx] = null; + return acc; + }, {} as typeof audioParts); + + let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately + + for (const [idx, sentence] of preparedMessageContent.entries()) { + const res = await synthesizeOpenAISpeech( + localStorage.token, + $settings?.audio?.tts?.defaultVoice === $config.audio.tts.voice + ? ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice) + : $config?.audio?.tts?.voice, + sentence + ).catch((error) => { + console.error(error); + toast.error(error); + + speaking = false; + loadingSpeech = false; + }); + + if (res) { + const blob = await res.blob(); + const blobUrl = URL.createObjectURL(blob); + const audio = new Audio(blobUrl); + audioParts[idx] = audio; + loadingSpeech = false; + lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx)); + } + } + } else { + let voices = []; + const getVoicesLoop = setInterval(() => { + voices = speechSynthesis.getVoices(); + if (voices.length > 0) { + clearInterval(getVoicesLoop); + + const voice = + voices + ?.filter( + (v) => + v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice) + ) + ?.at(0) ?? undefined; + + console.log(voice); + + const speak = new SpeechSynthesisUtterance(message.content); + + console.log(speak); + + speak.onend = () => { + speaking = false; + if ($settings.conversationMode) { + document.getElementById('voice-input-button')?.click(); + } + }; + + if (voice) { + speak.voice = voice; + } + + speechSynthesis.speak(speak); + } + }, 100); } }; @@ -230,7 +269,7 @@ await tick(); }; - const generateImage = async (message) => { + const generateImage = async (message: MessageType) => { generatingImage = true; const res = await imageGenerations(localStorage.token, message.content).catch((error) => { toast.error(error); @@ -285,7 +324,7 @@
- {#if (message?.files ?? []).filter((f) => f.type === 'image').length > 0} + {#if message?.files && message.files?.filter((f) => f.type === 'image').length > 0}
{#each message.files as file}
@@ -304,7 +343,7 @@ message?.statusHistory ?? [...(message?.status ? [message?.status] : [])] ).at(-1)}
- {#if status.done === false} + {#if status?.done === false}
@@ -521,7 +560,7 @@ : 'invisible group-hover:visible'} p-1.5 hover:bg-black/5 dark:hover:bg-white/5 rounded-lg dark:hover:text-white hover:text-black transition" on:click={() => { if (!loadingSpeech) { - toggleSpeakMessage(message); + toggleSpeakMessage(); } }} > @@ -661,7 +700,7 @@ `${ Math.round( ((message.info.eval_count ?? 0) / - (message.info.eval_duration / 1000000000)) * + ((message.info.eval_duration ?? 0) / 1000000000)) * 100 ) / 100 } tokens` ?? 'N/A' @@ -669,7 +708,7 @@ prompt_token/s: ${ Math.round( ((message.info.prompt_eval_count ?? 0) / - (message.info.prompt_eval_duration / 1000000000)) * + ((message.info.prompt_eval_duration ?? 0) / 1000000000)) * 100 ) / 100 ?? 'N/A' } tokens
@@ -688,7 +727,7 @@ eval_duration: ${ Math.round(((message.info.eval_duration ?? 0) / 1000000) * 100) / 100 ?? 'N/A' }ms
-							approximate_total: ${approximateToHumanReadable(message.info.total_duration)}`}
+							approximate_total: ${approximateToHumanReadable(message.info.total_duration ?? 0)}`}
 							placement="top"
 						>
diff --git a/src/lib/types/index.ts b/src/lib/types/index.ts
index 2d9156c8d..5b20e4e8b 100644
--- a/src/lib/types/index.ts
+++ b/src/lib/types/index.ts
@@ -7,3 +7,9 @@ export type Banner = {
 	dismissible?: boolean;
 	timestamp: number;
 };
+
+export enum TTS_RESPONSE_SPLIT {
+	PUNCTUATION = 'punctuation',
+	PARAGRAPHS = 'paragraphs',
+	NONE = 'none'
+}
diff --git a/src/lib/utils/index.ts b/src/lib/utils/index.ts
index 995712dfa..3acedd2ba 100644
--- a/src/lib/utils/index.ts
+++ b/src/lib/utils/index.ts
@@ -408,7 +408,7 @@ const convertOpenAIMessages = (convo) => {
 	let currentId = '';
 	let lastId = null;
 
-	for (let message_id in mapping) {
+	for (const message_id in mapping) {
 		const message = mapping[message_id];
 		currentId = message_id;
 		try {
@@ -442,7 +442,7 @@
 		}
 	}
 
-	let history = {};
+	const history: Record<PropertyKey, (typeof messages)[number]> = {};
 	messages.forEach((obj) => (history[obj.id] = obj));
 
 	const chat = {
@@ -481,7 +481,7 @@ const validateChat = (chat) => {
 	}
 
 	// Every message's content should be a string
-	for (let message of messages) {
+	for (const message of messages) {
 		if (typeof message.content !== 'string') {
 			return false;
 		}
@@ -494,7 +494,7 @@ export const convertOpenAIChats = (_chats) => {
 	// Create a list of dictionaries with each conversation from import
 	const chats = [];
 	let failed = 0;
-	for (let convo of _chats) {
+	for (const convo of _chats) {
 		const chat = convertOpenAIMessages(convo);
 
 		if (validateChat(chat)) {
@@ -513,7 +513,7 @@
 	return chats;
 };
 
-export const isValidHttpUrl = (string) => {
+export const isValidHttpUrl = (string: string) => {
 	let url;
 
 	try {
@@ -525,7 +525,7 @@
 	return url.protocol === 'http:' || url.protocol === 'https:';
 };
 
-export const removeEmojis = (str) => {
+export const removeEmojis = (str: string) => {
 	// Regular expression to match emojis
 	const emojiRegex = /[\uD800-\uDBFF][\uDC00-\uDFFF]|\uD83C[\uDC00-\uDFFF]|\uD83D[\uDC00-\uDE4F]/g;
 
@@ -533,20 +533,24 @@
 	return str.replace(emojiRegex, '');
 };
 
-export const removeFormattings = (str) => {
+export const removeFormattings = (str: string) => {
 	return str.replace(/(\*)(.*?)\1/g, '').replace(/(```)(.*?)\1/gs, '');
 };
 
-export const extractSentences = (text) => {
-	// This regular expression matches code blocks marked by triple backticks
-	const codeBlockRegex = /```[\s\S]*?```/g;
+export const prepareTextForTTS = (content: string) => {
+	return removeFormattings(removeEmojis(content.trim()));
+};
 
-	let codeBlocks = [];
+// This regular expression matches code blocks marked by triple backticks
+const codeBlockRegex = /```[\s\S]*?```/g;
+
+export const extractSentences = (text: string) => {
+	const codeBlocks: string[] = [];
 	let index = 0;
 
 	// Temporarily replace code blocks with placeholders and store the blocks separately
 	text = text.replace(codeBlockRegex, (match) => {
-		let placeholder = `\u0000${index}\u0000`; // Use a unique placeholder
+		const placeholder = `\u0000${index}\u0000`; // Use a unique placeholder
 		codeBlocks[index++] = match;
 		return placeholder;
 	});
@@ -561,11 +565,36 @@
 	});
 
 	return sentences
-		.map((sentence) => removeFormattings(removeEmojis(sentence.trim())))
-		.filter((sentence) => sentence);
+		.map(prepareTextForTTS)
+		.filter(Boolean);
 };
 
-export const extractSentencesForAudio = (text) => {
+export const extractParagraphsForAudio = (text: string) => {
+	const codeBlocks: string[] = [];
+	let index = 0;
+
+	// Temporarily replace code blocks with placeholders and store the blocks separately
+	text = text.replace(codeBlockRegex, (match) => {
+		const placeholder = `\u0000${index}\u0000`; // Use a unique placeholder
+		codeBlocks[index++] = match;
+		return placeholder;
+	});
+
+	// Split the modified text into paragraphs based on newlines, avoiding these blocks
+	let paragraphs = text.split(/\n+/);
+
+	// Restore code blocks and process paragraphs
+	paragraphs = paragraphs.map((paragraph) => {
+		// Check if the paragraph includes a placeholder for a code block
+		return paragraph.replace(/\u0000(\d+)\u0000/g, (_, idx) => codeBlocks[idx]);
+	});
+
+	return paragraphs
+		.map(prepareTextForTTS)
+		.filter(Boolean);
+};
+
+export const extractSentencesForAudio = (text: string) => {
 	return extractSentences(text).reduce((mergedTexts, currentText) => {
 		const lastIndex = mergedTexts.length - 1;
 		if (lastIndex >= 0) {
 			const previousText = mergedTexts[lastIndex];
 			const wordCount = previousText.split(/\s+/).length;
 			if (wordCount < 2) {
 				mergedTexts[lastIndex] = previousText + ' ' + currentText;
 			} else {
 				mergedTexts.push(currentText);
 			}
 		} else {
 			mergedTexts.push(currentText);
 		}
 		return mergedTexts;
-	}, []);
+	}, [] as string[]);
 };
 
 export const blobToFile = (blob, fileName) => {
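Taken together, the three `split_on` modes map onto the helpers above. A quick sketch of how they differ on the same response; the sample text and the logged results are illustrative, and the exact sentence boundaries assume the punctuation-based `text.split` inside `extractSentences`, which this patch leaves unchanged (it sits between the hunks shown):

```ts
import {
	extractParagraphsForAudio,
	extractSentencesForAudio,
	prepareTextForTTS
} from '$lib/utils';

const response = 'Here is the plan. First, install it.\n\nThen run it. Done!';

// 'punctuation': sentence-sized requests; extractSentencesForAudio merges a
// chunk into its predecessor when the predecessor is a single word, so the
// TTS engine is never sent one-word fragments.
console.log(extractSentencesForAudio(response));
// roughly: ['Here is the plan.', 'First, install it.', 'Then run it.', 'Done!']

// 'paragraphs': one request per newline-separated block.
console.log(extractParagraphsForAudio(response));
// roughly: ['Here is the plan. First, install it.', 'Then run it. Done!']

// 'none': the whole message as one cleaned string (trimmed, emojis and
// markdown formatting stripped), sent in a single request.
console.log(prepareTextForTTS(response));
```

In the two splitting modes, fenced code blocks are swapped out for `\u0000${index}\u0000` placeholders before splitting, so a block is never cut in half; in all three modes `removeFormattings` ultimately strips backtick-fenced content from the spoken text.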