From ba3fe33ef85166c12db4d1000722cdf34ea03c20 Mon Sep 17 00:00:00 2001
From: Juanan Pereira
Date: Mon, 30 Dec 2024 13:53:08 +0100
Subject: [PATCH] feat: Add YouTube Video Ingestion Support in Knowledge Base
 subsystem

---
 .../open_webui/retrieval/loaders/youtube.py   |  84 ++++++-
 backend/open_webui/routers/retrieval.py      | 226 +++++++++++++++---
 src/lib/apis/retrieval/index.ts              |   5 +-
 .../workspace/Knowledge/KnowledgeBase.svelte |  54 ++++-
 .../KnowledgeBase/AddContentMenu.svelte      |  10 +
 .../KnowledgeBase/AddYoutubeModal.svelte     |  83 +++++++
 src/lib/i18n/locales/en-US/translation.json  |   4 +-
 7 files changed, 427 insertions(+), 39 deletions(-)
 create mode 100644 src/lib/components/workspace/Knowledge/KnowledgeBase/AddYoutubeModal.svelte

diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py
index 8eb48488b..6fb4d2b43 100644
--- a/backend/open_webui/retrieval/loaders/youtube.py
+++ b/backend/open_webui/retrieval/loaders/youtube.py
@@ -69,6 +69,39 @@ class YoutubeLoader:
         else:
             self.language = language
 
+    def _get_video_title(self) -> Optional[str]:
+        """Get the video title using the YouTube Data API or page scraping."""
+        try:
+            import requests
+
+            # First try using the YouTube Data API v3 if a key is configured
+            try:
+                from open_webui.config import YOUTUBE_API_KEY
+
+                if YOUTUBE_API_KEY:
+                    url = f"https://www.googleapis.com/youtube/v3/videos?id={self.video_id}&key={YOUTUBE_API_KEY}&part=snippet"
+                    response = requests.get(url)
+                    if response.status_code == 200:
+                        data = response.json()
+                        if data.get("items"):
+                            return data["items"][0]["snippet"]["title"]
+            except ImportError:
+                pass
+
+            # Fallback: scrape the title from the YouTube watch page
+            url = f"https://www.youtube.com/watch?v={self.video_id}"
+            response = requests.get(url)
+            if response.status_code == 200:
+                import re
+
+                title_match = re.search(r"<title>(.+?)</title>", response.text)
+                if title_match:
+                    return title_match.group(1)
+            return None
+        except Exception as e:
+            log.error(f"Error getting video title: {e}")
+            return None
+
     def load(self) -> List[Document]:
         """Load YouTube transcripts into `Document` objects."""
         try:
@@ -102,16 +135,53 @@ class YoutubeLoader:
             return []
 
         try:
+            # First try to get a transcript in the requested language
            transcript = transcript_list.find_transcript(self.language)
         except NoTranscriptFound:
-            transcript = transcript_list.find_transcript(["en"])
+            # Fallback: use any available (auto-generated) transcript
+            available_transcripts = list(transcript_list._generated_transcripts.values())
+            if available_transcripts:
+                transcript = available_transcripts[0]
+                log.info(f"Using first available transcript in language: {transcript.language_code}")
+            else:
+                log.error("No transcripts found for video")
+                return []
 
         transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
 
-        transcript = " ".join(
-            map(
-                lambda transcript_piece: transcript_piece["text"].strip(" "),
-                transcript_pieces,
-            )
+        # Get the video title and add it to the base metadata
+        title = self._get_video_title()
+        if title:
+            self._metadata["title"] = title
+
+        # Add the base video URL to the metadata
+        base_url = f"https://www.youtube.com/watch?v={self.video_id}"
+        self._metadata["source_url"] = base_url
+
+        # Combine pieces into a single text while tracking timestamp positions
+        full_text = ""
+        timestamp_map = []
+
+        for piece in transcript_pieces:
+            start_char = len(full_text)
+            text = piece["text"].strip()
+            full_text += text + " "
+            end_char = len(full_text)
+
+            timestamp_map.append({
+                "start": start_char,
+                "end": end_char,
+                "time": piece["start"],
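+                # "start"/"duration" come from youtube-transcript-api and give
+                # the caption's offset and length in seconds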
piece["start"], + "duration": piece["duration"] + }) + + # Create a single document that will be split by Langchain's text splitter + doc = Document( + page_content=full_text.strip(), + metadata={ + **self._metadata, + "timestamp_map": timestamp_map # Store timestamp mapping in metadata + } ) - return [Document(page_content=transcript, metadata=self._metadata)] + + return [doc] diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index 2cffd9ead..721592f55 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -7,7 +7,7 @@ import shutil import uuid from datetime import datetime from pathlib import Path -from typing import Iterator, List, Optional, Sequence, Union +from typing import Iterator, Optional, Sequence, Union, List, Dict, Any, Tuple from fastapi import ( Depends, @@ -28,7 +28,9 @@ import tiktoken from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter from langchain_core.documents import Document -from open_webui.models.files import FileModel, Files +from urllib.parse import urlparse, parse_qs, urlencode, urlunparse + +from open_webui.models.files import FileModel, Files, FileForm from open_webui.models.knowledge import Knowledges from open_webui.storage.provider import Storage @@ -150,6 +152,14 @@ def get_rf( raise Exception(ERROR_MESSAGES.DEFAULT("CrossEncoder error")) return rf +def add_timestamp_to_youtube_url(url: str, timestamp: int) -> str: + parsed = urlparse(url) + query_dict = parse_qs(parsed.query) + query_dict['t'] = [str(timestamp)] + new_query = urlencode(query_dict, doseq=True) + return urlunparse(parsed._replace(query=new_query)) + + ########################################## # @@ -652,6 +662,33 @@ async def update_query_settings( #################################### +def interpolate_timestamp(chunk_start: int, chunk_end: int, timestamp_map: List[dict]) -> Tuple[float, float]: + """ + Find the appropriate timestamp for a chunk based on its character position + Returns (start_time, end_time) as floats in seconds + """ + # Find the timestamp entry that contains the start of our chunk + for entry in timestamp_map: + if entry["start"] <= chunk_start <= entry["end"]: + start_time = entry["time"] + break + else: + # If not found, use the closest previous timestamp + start_time = min( + [e["time"] for e in timestamp_map if e["start"] <= chunk_start], default=0) + + # Find the timestamp entry that contains the end of our chunk + for entry in reversed(timestamp_map): + if entry["start"] <= chunk_end <= entry["end"]: + end_time = entry["time"] + entry["duration"] + break + else: + # If not found, use the closest next timestamp + end_time = max([e["time"] + e["duration"] + for e in timestamp_map if e["end"] >= chunk_end], default=start_time) + + return start_time, end_time + def save_docs_to_vector_db( request: Request, docs, @@ -695,6 +732,14 @@ def save_docs_to_vector_db( raise ValueError(ERROR_MESSAGES.DUPLICATE_CONTENT) if split: + # Check if this is a YouTube document by looking at the first doc's metadata + is_youtube = (len(docs) == 1 and + docs[0].metadata.get("type") == "youtube") + + # Store timestamp_map before splitting if it's a YouTube document + original_timestamp_map = docs[0].metadata.get( + "timestamp_map") if is_youtube else None + if request.app.state.config.TEXT_SPLITTER in ["", "character"]: text_splitter = RecursiveCharacterTextSplitter( chunk_size=request.app.state.config.CHUNK_SIZE, @@ -718,27 +763,64 @@ def save_docs_to_vector_db( 
         docs = text_splitter.split_documents(docs)
 
+        # Only process timestamps for YouTube documents
+        if is_youtube and original_timestamp_map:
+            for doc in docs:
+                start_index = doc.metadata.get("start_index", 0)
+                end_index = start_index + len(doc.page_content)
+
+                start_time, end_time = interpolate_timestamp(
+                    start_index,
+                    end_index,
+                    original_timestamp_map
+                )
+
+                doc.metadata.update({
+                    "start_time": start_time,
+                    "source_url": add_timestamp_to_youtube_url(
+                        doc.metadata["source_url"], int(start_time)
+                    ),
+                })
+
+                # Remove the timestamp_map from individual chunks
+                doc.metadata.pop("timestamp_map", None)
+
     if len(docs) == 0:
         raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT)
 
     texts = [doc.page_content for doc in docs]
-    metadatas = [
-        {
-            **doc.metadata,
-            **(metadata if metadata else {}),
-            "embedding_config": json.dumps(
-                {
-                    "engine": request.app.state.config.RAG_EMBEDDING_ENGINE,
-                    "model": request.app.state.config.RAG_EMBEDDING_MODEL,
-                }
-            ),
-        }
-        for doc in docs
-    ]
+    metadatas = []
+
+    for doc in docs:
+        # Preserve the original metadata
+        doc_metadata = doc.metadata.copy()
+
+        # Add any additional metadata
+        if metadata:
+            doc_metadata.update(metadata)
+
+        # Ensure source follows source_url when one is present
+        if "source_url" in doc_metadata:
+            doc_metadata["source"] = doc_metadata["source_url"]
+
+        # Add embedding config
+        doc_metadata["embedding_config"] = json.dumps(
+            {
+                "engine": request.app.state.config.RAG_EMBEDDING_ENGINE,
+                "model": request.app.state.config.RAG_EMBEDDING_MODEL,
+            }
+        )
+
+        log.debug(f"Final document metadata: {doc_metadata}")
+        metadatas.append(doc_metadata)
 
-    # ChromaDB does not like datetime formats
-    # for meta-data so convert them to string.
     for metadata in metadatas:
+        # ChromaDB does not like datetime formats
+        # for meta-data so convert them to string.
         for key, value in metadata.items():
             if isinstance(value, datetime):
                 metadata[key] = str(value)
 
@@ -803,6 +885,8 @@ class ProcessFileForm(BaseModel):
     file_id: str
     content: Optional[str] = None
     collection_name: Optional[str] = None
+    type: Optional[str] = "file"  # Default to 'file' if not specified
+    url: Optional[str] = None  # URL for web content
 
 
 @router.post("/process/file")
@@ -813,12 +897,41 @@ def process_file(
 ):
     try:
         file = Files.get_file_by_id(form_data.file_id)
+        content = file.data.get("content", "")
+
+        # Create base metadata
+        metadata = {
+            **file.meta,  # Original file metadata
+            "name": file.filename,
+            "created_by": file.user_id,
+            "file_id": file.id,
+            "source": file.filename,
+        }
+
+        # For YouTube content, skip embedding but still process the file association
+        if "type" in metadata and metadata["type"] == "youtube":
+            log.info("Processing YouTube content - skipping embedding")
+            return {
+                "status": True,
+                "collection_name": form_data.collection_name,
+                "content": content,
+                "file": {
+                    "id": file.id,
+                    "meta": metadata,
+                },
+            }
 
         collection_name = form_data.collection_name
         if collection_name is None:
             collection_name = f"file-{file.id}"
 
+        # Get the document type, default to 'file' if not specified
+        doc_type = form_data.type if form_data.type else "file"
+
+        # Get the source URL if available
+        source = form_data.url if form_data.url else file.filename
+
         if form_data.content:
             # Update the content in the file
             # Usage: /files/{file_id}/data/content/update
@@ -833,11 +946,11 @@ def process_file(
                         "name": file.filename,
                         "created_by": file.user_id,
                         "file_id": file.id,
-                        "source": file.filename,
+                        "source": source,
+                        "type": doc_type,
                     },
                 )
             ]
-
             text_content = form_data.content
         elif form_data.collection_name:
             # Check if the file has already been processed and save the content
@@ -851,7 +964,11 @@ def process_file(
             docs = [
                 Document(
                     page_content=result.documents[0][idx],
-                    metadata=result.metadatas[0][idx],
+                    metadata={
+                        **result.metadatas[0][idx],
+                        "type": doc_type,
+                        "source": source,
+                    },
                 )
                 for idx, id in enumerate(result.ids[0])
             ]
@@ -864,7 +981,8 @@ def process_file(
                         "name": file.filename,
                         "created_by": file.user_id,
                         "file_id": file.id,
-                        "source": file.filename,
+                        "source": source,
+                        "type": doc_type,
                     },
                 )
             ]
@@ -893,7 +1011,8 @@ def process_file(
                         "name": file.filename,
                         "created_by": file.user_id,
                         "file_id": file.id,
-                        "source": file.filename,
+                        "source": source,
+                        "type": doc_type,
                     },
                 )
                 for doc in docs
@@ -907,7 +1026,8 @@ def process_file(
                         "name": file.filename,
                         "created_by": file.user_id,
                         "file_id": file.id,
-                        "source": file.filename,
+                        "source": source,
+                        "type": doc_type,
                     },
                 )
             ]
@@ -1023,19 +1148,64 @@ def process_youtube_video(
         content = " ".join([doc.page_content for doc in docs])
         log.debug(f"text_content: {content}")
 
-        save_docs_to_vector_db(request, docs, collection_name, overwrite=True)
+        # Get the video title from metadata or fall back to the URL
+        video_title = docs[0].metadata.get("title", form_data.url)
+
+        # Create a unique file ID for this video
+        file_id = str(uuid.uuid4())
+
+        # Create a file record
+        file_item = Files.insert_new_file(
+            user.id if user else None,
+            FileForm(
+                **{
+                    "id": file_id,
+                    "filename": video_title,
+                    "path": form_data.url,  # Use the video URL as the path
+                    "meta": {
+                        "name": video_title,
+                        "content_type": "text/plain",
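+                        # Note: "size" below is the transcript length in
+                        # characters; no binary upload backs this file record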
"text/plain", + "size": len(content), + "source": form_data.url, + "source_url": add_timestamp_to_youtube_url(form_data.url, 0), + "type": "youtube" + }, + "data": { + "content": content + } + } + ), + ) + + # Add file-specific metadata + file_metadata = { + "source": form_data.url, + "source_url": add_timestamp_to_youtube_url(form_data.url, 0), + "title": video_title, + "type": "youtube", + "name": video_title, + "file_id": file_id, + "created_by": user.id if user else None + } + + # Update all docs with the file metadata + for doc in docs: + doc.metadata.update(file_metadata) + # Debug log + log.info(f"Document metadata before saving: {doc.metadata}") + + save_docs_to_vector_db(request, docs, collection_name, overwrite=False, add=True) return { "status": True, "collection_name": collection_name, - "filename": form_data.url, + "id": file_id, # Return the file ID directly + "filename": video_title, "file": { "data": { "content": content, }, - "meta": { - "name": form_data.url, - }, + "meta": file_metadata }, } except Exception as e: diff --git a/src/lib/apis/retrieval/index.ts b/src/lib/apis/retrieval/index.ts index c35c37847..07584d3ff 100644 --- a/src/lib/apis/retrieval/index.ts +++ b/src/lib/apis/retrieval/index.ts @@ -344,7 +344,7 @@ export const processFile = async ( return res; }; -export const processYoutubeVideo = async (token: string, url: string) => { +export const processYoutubeVideo = async (token: string, url: string, collection_name: string) => { let error = null; const res = await fetch(`${RETRIEVAL_API_BASE_URL}/process/youtube`, { @@ -355,7 +355,8 @@ export const processYoutubeVideo = async (token: string, url: string) => { authorization: `Bearer ${token}` }, body: JSON.stringify({ - url: url + url: url, + collection_name: collection_name }) }) .then(async (res) => { diff --git a/src/lib/components/workspace/Knowledge/KnowledgeBase.svelte b/src/lib/components/workspace/Knowledge/KnowledgeBase.svelte index cabcff571..614ab7366 100644 --- a/src/lib/components/workspace/Knowledge/KnowledgeBase.svelte +++ b/src/lib/components/workspace/Knowledge/KnowledgeBase.svelte @@ -24,7 +24,7 @@ import { transcribeAudio } from '$lib/apis/audio'; import { blobToFile } from '$lib/utils'; - import { processFile } from '$lib/apis/retrieval'; + import { processFile, processYoutubeVideo } from '$lib/apis/retrieval'; import Spinner from '$lib/components/common/Spinner.svelte'; import Files from './KnowledgeBase/Files.svelte'; @@ -32,6 +32,8 @@ import AddContentMenu from './KnowledgeBase/AddContentMenu.svelte'; import AddTextContentModal from './KnowledgeBase/AddTextContentModal.svelte'; + import AddYoutubeModal from './KnowledgeBase/AddYoutubeModal.svelte'; + import SyncConfirmDialog from '../../common/ConfirmDialog.svelte'; import RichTextInput from '$lib/components/common/RichTextInput.svelte'; @@ -64,6 +66,7 @@ let showAddTextContentModal = false; let showSyncConfirmModal = false; let showAccessControlModal = false; + let showAddYoutubeModal = false; let inputFiles = null; @@ -584,6 +587,53 @@ }} /> + { + const url = e.detail.url; + + // Create a temporary file entry + const tempItemId = uuidv4(); + const fileItem = { + type: 'youtube', + file: '', + id: null, + url: url, + name: url, // We'll update this with video title later + size: 0, + status: 'uploading', + error: '', + itemId: tempItemId + }; + + knowledge.files = [...(knowledge.files ?? 
+
+		// Process the YouTube video with knowledge base ID as collection
+		const res = await processYoutubeVideo(localStorage.token, url, id).catch((err) => {
+			toast.error(err);
+			return null;
+		});
+
+		if (res) {
+			// Add file to knowledge base using the ID from the response
+			const updatedKnowledge = await addFileToKnowledgeById(localStorage.token, id, res.id).catch((e) => {
+				toast.error(e);
+				return null;
+			});
+
+			if (updatedKnowledge) {
+				knowledge = updatedKnowledge;
+				toast.success($i18n.t('YouTube video processed successfully.'));
+			} else {
+				toast.error($i18n.t('Failed to add video to knowledge base.'));
+				knowledge.files = knowledge.files.filter((f) => f.itemId !== tempItemId);
+			}
+		} else {
+			knowledge.files = knowledge.files.filter((f) => f.itemId !== tempItemId);
+		}
+	}}
+/>
+
diff --git a/src/lib/components/workspace/Knowledge/KnowledgeBase/AddContentMenu.svelte b/src/lib/components/workspace/Knowledge/KnowledgeBase/AddContentMenu.svelte
--- a/src/lib/components/workspace/Knowledge/KnowledgeBase/AddContentMenu.svelte
+++ b/src/lib/components/workspace/Knowledge/KnowledgeBase/AddContentMenu.svelte
@@ ... @@
 				<div class="flex items-center">{$i18n.t('Add text content')}</div>
 			</DropdownMenu.Item>
+
+			<DropdownMenu.Item
+				class="flex gap-2 items-center px-3 py-2 text-sm font-medium cursor-pointer hover:bg-gray-50 dark:hover:bg-gray-800 rounded-xl"
+				on:click={() => {
+					dispatch('upload', { type: 'youtube' });
+				}}
+			>
+				<div class="flex items-center">{$i18n.t('Add YouTube URL')}</div>
+			</DropdownMenu.Item>
diff --git a/src/lib/components/workspace/Knowledge/KnowledgeBase/AddYoutubeModal.svelte b/src/lib/components/workspace/Knowledge/KnowledgeBase/AddYoutubeModal.svelte
new file mode 100644
index 000000000..3244e1eb0
--- /dev/null
+++ b/src/lib/components/workspace/Knowledge/KnowledgeBase/AddYoutubeModal.svelte
@@ -0,0 +1,72 @@
+<script lang="ts">
+	import { toast } from 'svelte-sonner';
+	import { createEventDispatcher, getContext } from 'svelte';
+
+	import Modal from '$lib/components/common/Modal.svelte';
+	import XMark from '$lib/components/icons/XMark.svelte';
+
+	const i18n = getContext('i18n');
+	const dispatch = createEventDispatcher();
+
+	export let show = false;
+
+	let url = '';
+
+	const isValidYoutubeUrl = (value: string) => {
+		return /^(https?:\/\/)?(www\.)?(youtube\.com\/watch\?v=|youtu\.be\/)[\w-]+/.test(value);
+	};
+</script>
+
+<Modal size="sm" bind:show>
+	<div>
+		<div class="flex justify-between dark:text-gray-100 px-5 pt-4 pb-1.5">
+			<div class="text-lg font-medium self-center">{$i18n.t('Add YouTube Video')}</div>
+			<button
+				class="self-center"
+				on:click={() => {
+					show = false;
+				}}
+			>
+				<XMark className={'size-5'} />
+			</button>
+		</div>
+
+		<div class="flex flex-col w-full px-5 pb-4 dark:text-gray-200">
+			<form
+				class="flex flex-col w-full"
+				on:submit|preventDefault={() => {
+					if (!url.trim()) {
+						toast.error($i18n.t('Please enter a YouTube URL.'));
+						return;
+					}
+
+					if (!isValidYoutubeUrl(url.trim())) {
+						toast.error($i18n.t('Please enter a valid YouTube URL.'));
+						return;
+					}
+
+					dispatch('submit', { url: url.trim() });
+					show = false;
+					url = '';
+				}}
+			>
+				<input
+					class="w-full rounded-lg py-2 px-4 text-sm dark:text-gray-300 dark:bg-gray-850 outline-none"
+					type="text"
+					bind:value={url}
+					placeholder="https://www.youtube.com/watch?v=..."
+					required
+				/>
+
+				<div class="flex justify-end pt-3 text-sm font-medium">
+					<button
+						class="px-3.5 py-1.5 text-sm font-medium bg-black hover:bg-gray-900 text-white dark:bg-white dark:text-black dark:hover:bg-gray-100 transition rounded-full"
+						type="submit"
+					>
+						{$i18n.t('Add')}
+					</button>
+				</div>
+			</form>
+		</div>
+	</div>
+</Modal>
\ No newline at end of file
diff --git a/src/lib/i18n/locales/en-US/translation.json b/src/lib/i18n/locales/en-US/translation.json
index 6afefe5d7..dedaa7ec4 100644
--- a/src/lib/i18n/locales/en-US/translation.json
+++ b/src/lib/i18n/locales/en-US/translation.json
@@ -411,6 +411,7 @@
 	"Export Tools": "",
 	"External Models": "",
 	"Failed to add file.": "",
+	"Failed to add video to knowledge base.": "",
 	"Failed to create API Key.": "",
 	"Failed to read clipboard contents": "",
 	"Failed to save models configuration": "",
@@ -1061,5 +1062,6 @@
 	"Your account status is currently pending activation.": "",
 	"Your entire contribution will go directly to the plugin developer; Open WebUI does not take any percentage. However, the chosen funding platform might have its own fees.": "",
 	"Youtube": "",
-	"Youtube Loader Settings": ""
+	"Youtube Loader Settings": "",
+	"YouTube video processed successfully.": ""
 }
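
--
Usage sketch (supplementary note, not part of the commit): the flow that
KnowledgeBase.svelte wires up with the new client API, shown standalone in
TypeScript. `token` and the knowledge base `id` are placeholders, the
`addFileToKnowledgeById` helper is assumed to come from '$lib/apis/knowledge'
as in the component above, and error handling is elided:

	import { processYoutubeVideo } from '$lib/apis/retrieval';
	import { addFileToKnowledgeById } from '$lib/apis/knowledge';

	// Fetch the transcript, create the file record, and index the chunks
	// into the knowledge base's collection (collection_name = knowledge id)
	const res = await processYoutubeVideo(token, 'https://www.youtube.com/watch?v=VIDEO_ID', id);

	// res.id is the file record created for the video; attach it to the
	// knowledge base so it shows up next to regular uploads
	const updatedKnowledge = await addFileToKnowledgeById(token, id, res.id);

Each stored chunk's "source_url" metadata carries a t=<seconds> parameter,
so citations can deep-link to the matching position in the video.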