feat: Add YouTube Video Ingestion Support in Knowledge Base subsystem

This commit is contained in:
Juanan Pereira 2024-12-30 13:53:08 +01:00
parent f6a54c96bc
commit ba3fe33ef8
7 changed files with 427 additions and 39 deletions

View File

@ -69,6 +69,39 @@ class YoutubeLoader:
else:
self.language = language
def _get_video_title(self) -> Optional[str]:
    """Get the video title via the YouTube Data API, falling back to scraping.

    Tries the YouTube Data API v3 first (when ``YOUTUBE_API_KEY`` is
    configured), then falls back to scraping the ``<title>`` tag of the
    watch page.

    Returns:
        The video title, or ``None`` if neither method succeeds.
    """
    try:
        import requests

        # First try using YouTube Data API v3 if available.
        try:
            from open_webui.config import YOUTUBE_API_KEY

            if YOUTUBE_API_KEY:
                url = (
                    "https://www.googleapis.com/youtube/v3/videos"
                    f"?id={self.video_id}&key={YOUTUBE_API_KEY}&part=snippet"
                )
                # Timeout so a hung API call cannot block ingestion forever.
                response = requests.get(url, timeout=10)
                if response.status_code == 200:
                    data = response.json()
                    if data.get("items"):
                        return data["items"][0]["snippet"]["title"]
        except ImportError:
            # Config module (and thus the API key) unavailable; fall through.
            pass

        # Fallback to scraping the title from the YouTube page.
        url = f"https://www.youtube.com/watch?v={self.video_id}"
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            import html
            import re

            title_match = re.search(r'<title>(.+?)</title>', response.text)
            if title_match:
                # Decode HTML entities (e.g. "&amp;") in the raw page title.
                title = html.unescape(title_match.group(1))
                # The page title is "<video title> - YouTube"; drop the suffix
                # so we store the actual video title.
                if title.endswith(" - YouTube"):
                    title = title[: -len(" - YouTube")]
                return title
        return None
    except Exception as e:
        print(f"Error getting video title: {e}")
        return None
def load(self) -> List[Document]:
"""Load YouTube transcripts into `Document` objects."""
try:
@ -102,16 +135,53 @@ class YoutubeLoader:
return []
try:
# First try to get transcript in requested language
transcript = transcript_list.find_transcript(self.language)
except NoTranscriptFound:
transcript = transcript_list.find_transcript(["en"])
# Fallback: try to get any available transcript
available_transcripts = list(transcript_list._generated_transcripts.values())
if available_transcripts:
transcript = available_transcripts[0]
log.info(f"Using first available transcript in language: {transcript.language_code}")
else:
log.error("No transcripts found for video")
return []
transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
transcript = " ".join(
map(
lambda transcript_piece: transcript_piece["text"].strip(" "),
transcript_pieces,
)
# Get video title and add it to base metadata
title = self._get_video_title()
if title:
self._metadata["title"] = title
# Add the base video URL to metadata
base_url = f"https://www.youtube.com/watch?v={self.video_id}"
self._metadata["source_url"] = base_url
# Combine pieces into a single text while tracking timestamp positions
full_text = ""
timestamp_map = []
for piece in transcript_pieces:
start_char = len(full_text)
text = piece["text"].strip()
full_text += text + " "
end_char = len(full_text)
timestamp_map.append({
"start": start_char,
"end": end_char,
"time": piece["start"],
"duration": piece["duration"]
})
# Create a single document that will be split by Langchain's text splitter
doc = Document(
page_content=full_text.strip(),
metadata={
**self._metadata,
"timestamp_map": timestamp_map # Store timestamp mapping in metadata
}
)
return [Document(page_content=transcript, metadata=self._metadata)]
return [doc]

View File

@ -7,7 +7,7 @@ import shutil
import uuid
from datetime import datetime
from pathlib import Path
from typing import Iterator, List, Optional, Sequence, Union
from typing import Iterator, Optional, Sequence, Union, List, Dict, Any, Tuple
from fastapi import (
Depends,
@ -28,7 +28,9 @@ import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter
from langchain_core.documents import Document
from open_webui.models.files import FileModel, Files
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
from open_webui.models.files import FileModel, Files, FileForm
from open_webui.models.knowledge import Knowledges
from open_webui.storage.provider import Storage
@ -150,6 +152,14 @@ def get_rf(
raise Exception(ERROR_MESSAGES.DEFAULT("CrossEncoder error"))
return rf
def add_timestamp_to_youtube_url(url: str, timestamp: int) -> str:
    """Return *url* with its ``t`` query parameter set to *timestamp* seconds.

    Any existing ``t`` parameter is overwritten; all other query parameters
    are preserved in order.
    """
    parts = urlparse(url)
    params = parse_qs(parts.query)
    params['t'] = [str(timestamp)]
    return urlunparse(parts._replace(query=urlencode(params, doseq=True)))
##########################################
#
@ -652,6 +662,33 @@ async def update_query_settings(
####################################
def interpolate_timestamp(chunk_start: int, chunk_end: int, timestamp_map: List[dict]) -> Tuple[float, float]:
    """
    Find the appropriate timestamps for a chunk based on its character positions.

    Args:
        chunk_start: Character offset where the chunk begins in the full text.
        chunk_end: Character offset where the chunk ends.
        timestamp_map: Entries with "start"/"end" character offsets and
            "time"/"duration" in seconds, in transcript order.

    Returns:
        (start_time, end_time) as floats in seconds.
    """
    # Find the timestamp entry that contains the start of our chunk.
    for entry in timestamp_map:
        if entry["start"] <= chunk_start <= entry["end"]:
            start_time = entry["time"]
            break
    else:
        # If not found, use the closest *previous* timestamp: the latest
        # entry starting at or before chunk_start. (The previous code used
        # min() here, which picked the earliest entry, not the closest.)
        start_time = max(
            (e["time"] for e in timestamp_map if e["start"] <= chunk_start),
            default=0,
        )

    # Find the timestamp entry that contains the end of our chunk.
    for entry in reversed(timestamp_map):
        if entry["start"] <= chunk_end <= entry["end"]:
            end_time = entry["time"] + entry["duration"]
            break
    else:
        # If not found, use the closest *next* timestamp: the earliest entry
        # ending at or after chunk_end. (The previous code used max() here,
        # which picked the furthest entry, not the closest.)
        end_time = min(
            (e["time"] + e["duration"] for e in timestamp_map if e["end"] >= chunk_end),
            default=start_time,
        )

    return start_time, end_time
def save_docs_to_vector_db(
request: Request,
docs,
@ -695,6 +732,14 @@ def save_docs_to_vector_db(
raise ValueError(ERROR_MESSAGES.DUPLICATE_CONTENT)
if split:
# Check if this is a YouTube document by looking at the first doc's metadata
is_youtube = (len(docs) == 1 and
docs[0].metadata.get("type") == "youtube")
# Store timestamp_map before splitting if it's a YouTube document
original_timestamp_map = docs[0].metadata.get(
"timestamp_map") if is_youtube else None
if request.app.state.config.TEXT_SPLITTER in ["", "character"]:
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=request.app.state.config.CHUNK_SIZE,
@ -718,27 +763,64 @@ def save_docs_to_vector_db(
docs = text_splitter.split_documents(docs)
# Only process timestamps for YouTube documents
if is_youtube and original_timestamp_map:
for doc in docs:
start_index = doc.metadata.get("start_index", 0)
end_index = start_index + len(doc.page_content)
start_time, end_time = interpolate_timestamp(
start_index,
end_index,
original_timestamp_map
)
doc.metadata.update({
"start_time": start_time,
"source_url": add_timestamp_to_youtube_url(doc.metadata['source_url'], int(start_time))
})
# Remove the timestamp_map from individual chunks
doc.metadata.pop("timestamp_map", None)
if len(docs) == 0:
raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT)
texts = [doc.page_content for doc in docs]
metadatas = [
{
**doc.metadata,
**(metadata if metadata else {}),
"embedding_config": json.dumps(
{
"engine": request.app.state.config.RAG_EMBEDDING_ENGINE,
"model": request.app.state.config.RAG_EMBEDDING_MODEL,
}
),
}
for doc in docs
]
metadatas = []
for doc in docs:
# Preserve the original metadata
doc_metadata = doc.metadata.copy()
# Add any additional metadata
if metadata:
doc_metadata.update(metadata)
# Ensure source and source_url are preserved
if "source_url" in doc_metadata:
doc_metadata["source"] = doc_metadata["source_url"]
# Add embedding config
doc_metadata["embedding_config"] = json.dumps(
{
"engine": request.app.state.config.RAG_EMBEDDING_ENGINE,
"model": request.app.state.config.RAG_EMBEDDING_MODEL,
}
)
# Convert datetime objects to strings
for key, value in doc_metadata.items():
if isinstance(value, datetime):
doc_metadata[key] = str(value)
# Debug log for final metadata
log.info(f"Final document metadata for ChromaDB: {doc_metadata}")
metadatas.append(doc_metadata)
# ChromaDB does not like datetime formats
# for meta-data so convert them to string.
for metadata in metadatas:
# ChromaDB does not like datetime formats
# for meta-data so convert them to string.
for key, value in metadata.items():
if isinstance(value, datetime):
metadata[key] = str(value)
@ -803,6 +885,8 @@ class ProcessFileForm(BaseModel):
file_id: str
content: Optional[str] = None
collection_name: Optional[str] = None
type: Optional[str] = "file" # Default to 'file' if not specified
url: Optional[str] = None # URL for web content
@router.post("/process/file")
@ -813,12 +897,41 @@ def process_file(
):
try:
file = Files.get_file_by_id(form_data.file_id)
content = file.data.get("content", "")
# Create base metadata
metadata = {
**file.meta, # Original file metadata
"name": file.filename,
"created_by": file.user_id,
"file_id": file.id,
"source": file.filename,
}
# For YouTube content, we skip embedding but still process the file association
if "type" in metadata and metadata["type"] == "youtube":
log.info("Processing YouTube content - skipping embedding")
return {
"status": True,
"collection_name": form_data.collection_name,
"content": content,
"file": {
"id": file.id,
"meta": metadata
}
}
collection_name = form_data.collection_name
if collection_name is None:
collection_name = f"file-{file.id}"
# Get the document type, default to 'file' if not specified
doc_type = form_data.type if form_data.type else "file"
# Get source URL if available
source = form_data.url if form_data.url else file.filename
if form_data.content:
# Update the content in the file
# Usage: /files/{file_id}/data/content/update
@ -833,11 +946,11 @@ def process_file(
"name": file.filename,
"created_by": file.user_id,
"file_id": file.id,
"source": file.filename,
"source": source,
"type": doc_type,
},
)
]
text_content = form_data.content
elif form_data.collection_name:
# Check if the file has already been processed and save the content
@ -851,7 +964,11 @@ def process_file(
docs = [
Document(
page_content=result.documents[0][idx],
metadata=result.metadatas[0][idx],
metadata={
**result.metadatas[0][idx],
"type": doc_type,
"source": source,
},
)
for idx, id in enumerate(result.ids[0])
]
@ -864,7 +981,8 @@ def process_file(
"name": file.filename,
"created_by": file.user_id,
"file_id": file.id,
"source": file.filename,
"source": source,
"type": doc_type,
},
)
]
@ -893,7 +1011,8 @@ def process_file(
"name": file.filename,
"created_by": file.user_id,
"file_id": file.id,
"source": file.filename,
"source": source,
"type": doc_type,
},
)
for doc in docs
@ -907,7 +1026,8 @@ def process_file(
"name": file.filename,
"created_by": file.user_id,
"file_id": file.id,
"source": file.filename,
"source": source,
"type": doc_type,
},
)
]
@ -919,6 +1039,11 @@ def process_file(
{"content": text_content},
)
Files.update_file_data_by_id(
file.id,
{"content": text_content},
)
hash = calculate_sha256_string(text_content)
Files.update_file_hash_by_id(file.id, hash)
@ -1023,19 +1148,64 @@ def process_youtube_video(
content = " ".join([doc.page_content for doc in docs])
log.debug(f"text_content: {content}")
save_docs_to_vector_db(request, docs, collection_name, overwrite=True)
# Get video title from metadata or fallback to URL
video_title = docs[0].metadata.get("title", form_data.url)
# Create a unique file ID for this video
file_id = str(uuid.uuid4())
# Create a file record
file_item = Files.insert_new_file(
user.id if user else None,
FileForm(
**{
"id": file_id,
"filename": video_title,
"path": form_data.url, # Use the video URL as the path
"meta": {
"name": video_title,
"content_type": "text/plain",
"size": len(content),
"source": form_data.url,
"source_url": add_timestamp_to_youtube_url(form_data.url, 0),
"type": "youtube"
},
"data": {
"content": content
}
}
),
)
# Add file-specific metadata
file_metadata = {
"source": form_data.url,
"source_url": add_timestamp_to_youtube_url(form_data.url, 0),
"title": video_title,
"type": "youtube",
"name": video_title,
"file_id": file_id,
"created_by": user.id if user else None
}
# Update all docs with the file metadata
for doc in docs:
doc.metadata.update(file_metadata)
# Debug log
log.info(f"Document metadata before saving: {doc.metadata}")
save_docs_to_vector_db(request, docs, collection_name, overwrite=False, add=True)
return {
"status": True,
"collection_name": collection_name,
"filename": form_data.url,
"id": file_id, # Return the file ID directly
"filename": video_title,
"file": {
"data": {
"content": content,
},
"meta": {
"name": form_data.url,
},
"meta": file_metadata
},
}
except Exception as e:

View File

@ -344,7 +344,7 @@ export const processFile = async (
return res;
};
export const processYoutubeVideo = async (token: string, url: string) => {
export const processYoutubeVideo = async (token: string, url: string, collection_name: string) => {
let error = null;
const res = await fetch(`${RETRIEVAL_API_BASE_URL}/process/youtube`, {
@ -355,7 +355,8 @@ export const processYoutubeVideo = async (token: string, url: string) => {
authorization: `Bearer ${token}`
},
body: JSON.stringify({
url: url
url: url,
collection_name: collection_name
})
})
.then(async (res) => {

View File

@ -24,7 +24,7 @@
import { transcribeAudio } from '$lib/apis/audio';
import { blobToFile } from '$lib/utils';
import { processFile } from '$lib/apis/retrieval';
import { processFile, processYoutubeVideo } from '$lib/apis/retrieval';
import Spinner from '$lib/components/common/Spinner.svelte';
import Files from './KnowledgeBase/Files.svelte';
@ -32,6 +32,8 @@
import AddContentMenu from './KnowledgeBase/AddContentMenu.svelte';
import AddTextContentModal from './KnowledgeBase/AddTextContentModal.svelte';
import AddYoutubeModal from './KnowledgeBase/AddYoutubeModal.svelte';
import SyncConfirmDialog from '../../common/ConfirmDialog.svelte';
import RichTextInput from '$lib/components/common/RichTextInput.svelte';
@ -64,6 +66,7 @@
let showAddTextContentModal = false;
let showSyncConfirmModal = false;
let showAccessControlModal = false;
let showAddYoutubeModal = false;
let inputFiles = null;
@ -584,6 +587,53 @@
}}
/>
<AddYoutubeModal
bind:show={showAddYoutubeModal}
on:submit={async (e) => {
const url = e.detail.url;
// Create a temporary file entry
const tempItemId = uuidv4();
const fileItem = {
type: 'youtube',
file: '',
id: null,
url: url,
name: url, // We'll update this with video title later
size: 0,
status: 'uploading',
error: '',
itemId: tempItemId
};
knowledge.files = [...(knowledge.files ?? []), fileItem];
// Process the YouTube video with knowledge base ID as collection
const res = await processYoutubeVideo(localStorage.token, url, id).catch((err) => {
toast.error(err);
return null;
});
if (res) {
// Add file to knowledge base using the ID from the response
const updatedKnowledge = await addFileToKnowledgeById(localStorage.token, id, res.id).catch((e) => {
toast.error(e);
return null;
});
if (updatedKnowledge) {
knowledge = updatedKnowledge;
toast.success($i18n.t('YouTube video processed successfully.'));
} else {
toast.error($i18n.t('Failed to add video to knowledge base.'));
knowledge.files = knowledge.files.filter(f => f.itemId !== tempItemId);
}
} else {
knowledge.files = knowledge.files.filter(f => f.itemId !== tempItemId);
}
}}
/>
<input
id="files-input"
bind:files={inputFiles}
@ -825,6 +875,8 @@
uploadDirectoryHandler();
} else if (e.detail.type === 'text') {
showAddTextContentModal = true;
} else if (e.detail.type === 'youtube') {
showAddYoutubeModal = true;
} else {
document.getElementById('files-input').click();
}

View File

@ -10,6 +10,7 @@
import BarsArrowUp from '$lib/components/icons/BarsArrowUp.svelte';
import FolderOpen from '$lib/components/icons/FolderOpen.svelte';
import ArrowPath from '$lib/components/icons/ArrowPath.svelte';
import Link from '$lib/components/icons/Link.svelte';
const i18n = getContext('i18n');
@ -102,6 +103,15 @@
<BarsArrowUp strokeWidth="2" />
<div class="flex items-center">{$i18n.t('Add text content')}</div>
</DropdownMenu.Item>
<DropdownMenu.Item
class="flex gap-2 items-center px-3 py-2 text-sm cursor-pointer hover:bg-gray-50 dark:hover:bg-gray-800 rounded-md"
on:click={() => {
dispatch('upload', { type: 'youtube' });
}}
>
<Link strokeWidth="2" />
<div class="flex items-center">{$i18n.t('Add YouTube URL')}</div>
</DropdownMenu.Item>
</DropdownMenu.Content>
</div>
</Dropdown>

View File

@ -0,0 +1,83 @@
<script lang="ts">
import { toast } from 'svelte-sonner';
import { getContext, createEventDispatcher } from 'svelte';
const i18n = getContext('i18n');
const dispatch = createEventDispatcher();
import Modal from '$lib/components/common/Modal.svelte';
import XMark from '$lib/components/icons/XMark.svelte';
import Tooltip from '$lib/components/common/Tooltip.svelte';
// Whether the modal is visible; two-way bound by the parent via `bind:show`.
export let show = false;
// The YouTube URL typed by the user; cleared after a successful submit.
let url = '';
// Basic YouTube URL validation
// NOTE(review): `youtu\.?be` also accepts the dot-less host "youtube", so a
// string like "youtube/x" passes — confirm whether stricter matching is wanted.
function isValidYoutubeUrl(url: string) {
const pattern = /^(https?:\/\/)?(www\.)?(youtube\.com|youtu\.?be)\/.+$/;
return pattern.test(url);
}
</script>
<Modal size="md" className="bg-white dark:bg-gray-900" bind:show>
<!-- Close (X) button in the top-right corner -->
<div class="absolute top-0 right-0 p-5">
<button
class="self-center dark:text-white"
type="button"
on:click={() => {
show = false;
}}
>
<XMark className="size-3.5" />
</button>
</div>
<div class="flex flex-col w-full h-full md:space-x-4 dark:text-gray-200">
<!-- On submit: validate the URL, emit a `submit` event carrying the trimmed
URL to the parent, then close and reset the modal. -->
<form
class="flex flex-col w-full h-full"
on:submit|preventDefault={() => {
if (!url.trim()) {
toast.error($i18n.t('Please enter a YouTube URL.'));
return;
}
if (!isValidYoutubeUrl(url.trim())) {
toast.error($i18n.t('Please enter a valid YouTube URL.'));
return;
}
dispatch('submit', { url: url.trim() });
show = false;
url = '';
}}
>
<div class="flex-1 w-full h-full flex justify-center overflow-auto px-5 py-4">
<div class="max-w-md py-2 md:py-10 w-full flex flex-col gap-4">
<h2 class="text-xl font-semibold">{$i18n.t('Add YouTube Video')}</h2>
<div class="w-full">
<input
class="w-full p-2 border rounded dark:border-gray-700 bg-transparent"
type="text"
bind:value={url}
placeholder={$i18n.t('Enter YouTube URL')}
required
/>
</div>
</div>
</div>
<div class="flex flex-row items-center justify-end text-sm font-medium flex-shrink-0 mt-1 p-4 gap-1.5">
<div class="flex-shrink-0">
<Tooltip content={$i18n.t('Add')}>
<button
class="px-3.5 py-2 bg-black text-white dark:bg-white dark:text-black transition rounded-full"
type="submit"
>
{$i18n.t('Add')}
</button>
</Tooltip>
</div>
</div>
</form>
</div>
</Modal>

View File

@ -411,6 +411,7 @@
"Export Tools": "",
"External Models": "",
"Failed to add file.": "",
"Failed to add video to knowledge base.":"",
"Failed to create API Key.": "",
"Failed to read clipboard contents": "",
"Failed to save models configuration": "",
@ -1061,5 +1062,6 @@
"Your account status is currently pending activation.": "",
"Your entire contribution will go directly to the plugin developer; Open WebUI does not take any percentage. However, the chosen funding platform might have its own fees.": "",
"Youtube": "",
"Youtube Loader Settings": ""
"Youtube Loader Settings": "",
"YouTube video processed successfully": ""
}