From cbd18ec63c3a540ae4d7dfb63d216ad1e26ba12f Mon Sep 17 00:00:00 2001
From: "Timothy J. Baek" <timothyjrbeck@gmail.com>
Date: Sat, 20 Apr 2024 16:00:24 -0500
Subject: [PATCH] feat: external openai tts support

---
 backend/apps/audio/main.py                    | 92 +++++++++----------
 src/lib/apis/audio/index.ts                   | 78 +++++++++++++++-
 .../chat/Messages/ResponseMessage.svelte      | 13 ++-
 src/lib/components/chat/Settings/Audio.svelte | 60 ++++++++++--
 .../documents/Settings/General.svelte         | 18 ++--
 5 files changed, 187 insertions(+), 74 deletions(-)

diff --git a/backend/apps/audio/main.py b/backend/apps/audio/main.py
index f7ce6fecd..2bee38c5a 100644
--- a/backend/apps/audio/main.py
+++ b/backend/apps/audio/main.py
@@ -101,61 +101,57 @@ async def update_openai_config(
 
 @app.post("/speech")
 async def speech(request: Request, user=Depends(get_verified_user)):
-    idx = None
+    body = await request.body()
+    name = hashlib.sha256(body).hexdigest()
+
+    file_path = SPEECH_CACHE_DIR.joinpath(f"{name}.mp3")
+    file_body_path = SPEECH_CACHE_DIR.joinpath(f"{name}.json")
+
+    # Check if the file already exists in the cache
+    if file_path.is_file():
+        return FileResponse(file_path)
+
+    headers = {}
+    headers["Authorization"] = f"Bearer {app.state.OPENAI_API_KEY}"
+    headers["Content-Type"] = "application/json"
+
+    r = None
     try:
-        body = await request.body()
-        name = hashlib.sha256(body).hexdigest()
+        r = requests.post(
+            url=f"{app.state.OPENAI_API_BASE_URL}/audio/speech",
+            data=body,
+            headers=headers,
+            stream=True,
+        )
 
-        file_path = SPEECH_CACHE_DIR.joinpath(f"{name}.mp3")
-        file_body_path = SPEECH_CACHE_DIR.joinpath(f"{name}.json")
+        r.raise_for_status()
 
-        # Check if the file already exists in the cache
-        if file_path.is_file():
-            return FileResponse(file_path)
+        # Save the streaming content to a file
+        with open(file_path, "wb") as f:
+            for chunk in r.iter_content(chunk_size=8192):
+                f.write(chunk)
 
-        headers = {}
-        headers["Authorization"] = f"Bearer {app.state.OPENAI_API_KEY}"
-        headers["Content-Type"] = "application/json"
+        with open(file_body_path, "w") as f:
+            json.dump(json.loads(body.decode("utf-8")), f)
 
-        r = None
-        try:
-            r = requests.post(
-                url=f"{app.state.OPENAI_API_BASE_URL}/audio/speech",
-                data=body,
-                headers=headers,
-                stream=True,
-            )
+        # Return the saved file
+        return FileResponse(file_path)
 
-            r.raise_for_status()
+    except Exception as e:
+        log.exception(e)
+        error_detail = "Open WebUI: Server Connection Error"
+        if r is not None:  # a response object exists, so prefer the upstream's own error text
+            try:
+                res = r.json()
+                if "error" in res:
+                    error_detail = f"External: {res['error']['message']}"
+            except Exception:  # body is not JSON or lacks error.message; never trap BaseException here
+                error_detail = f"External: {e}"
 
-            # Save the streaming content to a file
-            with open(file_path, "wb") as f:
-                for chunk in r.iter_content(chunk_size=8192):
-                    f.write(chunk)
-
-            with open(file_body_path, "w") as f:
-                json.dump(json.loads(body.decode("utf-8")), f)
-
-            # Return the saved file
-            return FileResponse(file_path)
-
-        except Exception as e:
-            log.exception(e)
-            error_detail = "Open WebUI: Server Connection Error"
-            if r is not None:
-                try:
-                    res = r.json()
-                    if "error" in res:
-                        error_detail = f"External: {res['error']}"
-                except:
-                    error_detail = f"External: {e}"
-
-            raise HTTPException(
-                status_code=r.status_code if r else 500, detail=error_detail
-            )
-
-    except ValueError:
-        raise HTTPException(status_code=401, detail=ERROR_MESSAGES.OPENAI_NOT_FOUND)
+        raise HTTPException(
+            status_code=r.status_code if r is not None else 500,  # no response at all -> 500
+            detail=error_detail,
+        )
 
 
 @app.post("/transcriptions")
diff --git a/src/lib/apis/audio/index.ts b/src/lib/apis/audio/index.ts
index 1919d0ee7..6679420d9 100644
--- a/src/lib/apis/audio/index.ts
+++ b/src/lib/apis/audio/index.ts
@@ -1,5 +1,67 @@
 import { AUDIO_API_BASE_URL } from '$lib/constants';
 
+export const getAudioConfig = async (token: string) => {
+	let error = null;
+	// Fetch the audio config: GET `${AUDIO_API_BASE_URL}/config` with `token` sent as a Bearer credential.
+	const res = await fetch(`${AUDIO_API_BASE_URL}/config`, {
+		method: 'GET',
+		headers: {
+			'Content-Type': 'application/json',
+			Authorization: `Bearer ${token}`
+		}
+	})
+		.then(async (res) => {
+			if (!res.ok) throw await res.json();
+			return res.json();
+		})
+		.catch((err) => {
+			console.log(err);
+			error = err.detail;
+			return null;
+		});
+	// Re-throw outside the promise chain so the caller receives the `detail` value itself.
+	if (error) {
+		throw error;
+	}
+	// NOTE(review): a rejection without a `detail` field leaves `error` unset, so this resolves to null instead of throwing.
+	return res;
+};
+
+type OpenAIConfigForm = {
+	url: string;
+	key: string;
+};
+
+export const updateAudioConfig = async (token: string, payload: OpenAIConfigForm) => {
+	let error = null;
+	// Save the audio config: POST `payload` as JSON to `${AUDIO_API_BASE_URL}/config/update` with `token` as a Bearer credential.
+	const res = await fetch(`${AUDIO_API_BASE_URL}/config/update`, {
+		method: 'POST',
+		headers: {
+			'Content-Type': 'application/json',
+			Authorization: `Bearer ${token}`
+		},
+		body: JSON.stringify({
+			...payload
+		})
+	})
+		.then(async (res) => {
+			if (!res.ok) throw await res.json();
+			return res.json();
+		})
+		.catch((err) => {
+			console.log(err);
+			error = err.detail;
+			return null;
+		});
+	// Re-throw outside the promise chain so the caller receives the `detail` value itself.
+	if (error) {
+		throw error;
+	}
+	// NOTE(review): a rejection without a `detail` field leaves `error` unset, so this resolves to null instead of throwing.
+	return res;
+};
+
 export const transcribeAudio = async (token: string, file: File) => {
 	const data = new FormData();
 	data.append('file', file);
@@ -48,11 +110,17 @@ export const synthesizeOpenAISpeech = async (
 			input: text,
 			voice: speaker
 		})
-	}).catch((err) => {
-		console.log(err);
-		error = err;
-		return null;
-	});
+	})
+		.then(async (res) => {
+			if (!res.ok) throw await res.json();
+			return res;
+		})
+		.catch((err) => {
+			error = err.detail ?? err;
+			console.log(err);
+			// `detail` exists only on the API's JSON error body; keep the raw error otherwise so the throw below still fires.
+			return null;
+		});
 
 	if (error) {
 		throw error;
diff --git a/src/lib/components/chat/Messages/ResponseMessage.svelte b/src/lib/components/chat/Messages/ResponseMessage.svelte
index fd2de7273..db8eba169 100644
--- a/src/lib/components/chat/Messages/ResponseMessage.svelte
+++ b/src/lib/components/chat/Messages/ResponseMessage.svelte
@@ -176,10 +176,12 @@
 
 	const toggleSpeakMessage = async () => {
 		if (speaking) {
-			speechSynthesis.cancel();
+			try {
+				speechSynthesis.cancel();
 
-			sentencesAudio[speakingIdx].pause();
-			sentencesAudio[speakingIdx].currentTime = 0;
+				sentencesAudio[speakingIdx].pause();
+				sentencesAudio[speakingIdx].currentTime = 0;
+			} catch {}
 
 			speaking = null;
 			speakingIdx = null;
@@ -221,6 +223,10 @@
 						sentence
 					).catch((error) => {
 						toast.error(error);
+
+						speaking = null;
+						loadingSpeech = false;
+
 						return null;
 					});
 
@@ -230,7 +236,6 @@
 						const audio = new Audio(blobUrl);
 						sentencesAudio[idx] = audio;
 						loadingSpeech = false;
-
 						lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx));
 					}
 				}
diff --git a/src/lib/components/chat/Settings/Audio.svelte b/src/lib/components/chat/Settings/Audio.svelte
index b6a8a859a..9a38d7ab0 100644
--- a/src/lib/components/chat/Settings/Audio.svelte
+++ b/src/lib/components/chat/Settings/Audio.svelte
@@ -1,4 +1,5 @@
 <script lang="ts">
+	import { getAudioConfig, updateAudioConfig } from '$lib/apis/audio';
 	import { createEventDispatcher, onMount, getContext } from 'svelte';
 	import { toast } from 'svelte-sonner';
 	const dispatch = createEventDispatcher();
@@ -9,6 +10,9 @@
 
 	// Audio
 
+	let OpenAIUrl = '';
+	let OpenAIKey = '';
+
 	let STTEngines = ['', 'openai'];
 	let STTEngine = '';
 
@@ -69,6 +73,18 @@
 		saveSettings({ speechAutoSend: speechAutoSend });
 	};
 
+	const updateConfigHandler = async () => {
+		const payload = { url: OpenAIUrl, key: OpenAIKey };
+		const res = await updateAudioConfig(localStorage.token, payload).catch((error) => {
+			toast.error(error); // report the failure instead of leaving the rejection unhandled
+			return null;
+		});
+		if (res) {
+			OpenAIUrl = res.OPENAI_API_BASE_URL;
+			OpenAIKey = res.OPENAI_API_KEY;
+		}
+	};
+
 	onMount(async () => {
 		let settings = JSON.parse(localStorage.getItem('settings') ?? '{}');
 
@@ -85,12 +101,20 @@
 		} else {
 			getWebAPIVoices();
 		}
+
+		const res = await getAudioConfig(localStorage.token);
+
+		if (res) {
+			OpenAIUrl = res.OPENAI_API_BASE_URL;
+			OpenAIKey = res.OPENAI_API_KEY;
+		}
 	});
 </script>
 
 <form
 	class="flex flex-col h-full justify-between space-y-3 text-sm"
-	on:submit|preventDefault={() => {
+	on:submit|preventDefault={async () => {
+		await updateConfigHandler();
 		saveSettings({
 			audio: {
 				STTEngine: STTEngine !== '' ? STTEngine : undefined,
@@ -101,7 +125,7 @@
 		dispatch('save');
 	}}
 >
-	<div class=" space-y-3 pr-1.5 overflow-y-scroll max-h-80">
+	<div class=" space-y-3 pr-1.5 overflow-y-scroll max-h-[22rem]">
 		<div>
 			<div class=" mb-1 text-sm font-medium">{$i18n.t('STT Settings')}</div>
 
@@ -196,6 +220,24 @@
 				</div>
 			</div>
 
+			{#if TTSEngine === 'openai'}
+				<div class="mt-1 flex gap-2 mb-1">
+					<input
+						class="w-full rounded-lg py-2 px-4 text-sm dark:text-gray-300 dark:bg-gray-850 outline-none"
+						placeholder={$i18n.t('API Base URL')}
+						bind:value={OpenAIUrl}
+						required
+					/>
+
+					<input
+						class="w-full rounded-lg py-2 px-4 text-sm dark:text-gray-300 dark:bg-gray-850 outline-none"
+						placeholder={$i18n.t('API Key')}
+						bind:value={OpenAIKey}
+						required
+					/>
+				</div>
+			{/if}
+
 			<div class=" py-0.5 flex w-full justify-between">
 				<div class=" self-center text-xs font-medium">{$i18n.t('Auto-playback response')}</div>
 
@@ -241,16 +283,18 @@
 				<div class=" mb-2.5 text-sm font-medium">{$i18n.t('Set Voice')}</div>
 				<div class="flex w-full">
 					<div class="flex-1">
-						<select
-							class="w-full rounded py-2 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none"
+						<input
+							list="voice-list"
+							class="w-full rounded-lg py-2 px-4 text-sm dark:text-gray-300 dark:bg-gray-850 outline-none"
 							bind:value={speaker}
 							placeholder="Select a voice"
-						>
+						/>
+
+						<datalist id="voice-list">
 							{#each voices as voice}
-								<option value={voice.name} class="bg-gray-100 dark:bg-gray-700">{voice.name}</option
-								>
+								<option value={voice.name} />
 							{/each}
-						</select>
+						</datalist>
 					</div>
 				</div>
 			</div>
diff --git a/src/lib/components/documents/Settings/General.svelte b/src/lib/components/documents/Settings/General.svelte
index 18c501340..a2bbec852 100644
--- a/src/lib/components/documents/Settings/General.svelte
+++ b/src/lib/components/documents/Settings/General.svelte
@@ -29,8 +29,8 @@
 	let embeddingEngine = '';
 	let embeddingModel = '';
 
-	let openAIKey = '';
-	let openAIUrl = '';
+	let OpenAIKey = '';
+	let OpenAIUrl = '';
 
 	let chunkSize = 0;
 	let chunkOverlap = 0;
@@ -79,7 +79,7 @@
 			return;
 		}
 
-		if ((embeddingEngine === 'openai' && openAIKey === '') || openAIUrl === '') {
+		if (embeddingEngine === 'openai' && (OpenAIKey === '' || OpenAIUrl === '')) { // URL/key matter only for the openai engine
 			toast.error($i18n.t('OpenAI URL/Key required.'));
 			return;
 		}
@@ -93,8 +93,8 @@
 			...(embeddingEngine === 'openai'
 				? {
 						openai_config: {
-							key: openAIKey,
-							url: openAIUrl
+							key: OpenAIKey,
+							url: OpenAIUrl
 						}
 				  }
 				: {})
@@ -133,8 +133,8 @@
 			embeddingEngine = embeddingConfig.embedding_engine;
 			embeddingModel = embeddingConfig.embedding_model;
 
-			openAIKey = embeddingConfig.openai_config.key;
-			openAIUrl = embeddingConfig.openai_config.url;
+			OpenAIKey = embeddingConfig.openai_config.key;
+			OpenAIUrl = embeddingConfig.openai_config.url;
 		}
 	};
 
@@ -192,14 +192,14 @@
 					<input
 						class="w-full rounded-lg py-2 px-4 text-sm dark:text-gray-300 dark:bg-gray-850 outline-none"
 						placeholder={$i18n.t('API Base URL')}
-						bind:value={openAIUrl}
+						bind:value={OpenAIUrl}
 						required
 					/>
 
 					<input
 						class="w-full rounded-lg py-2 px-4 text-sm dark:text-gray-300 dark:bg-gray-850 outline-none"
 						placeholder={$i18n.t('API Key')}
-						bind:value={openAIKey}
+						bind:value={OpenAIKey}
 						required
 					/>
 				</div>