mirror of
synced 2025-03-03 10:52:09 +00:00
feat: Add YouTube Video Ingestion Support in Knowledge Base subsystem
This commit is contained in:
@ -69,6 +69,39 @@ class YoutubeLoader:
self.language = language
def _get_video_title(self) -> Optional[str]:
    """Return the video's title, or ``None`` if it cannot be determined.

    The lookup is best-effort and runs in two stages:

    1. YouTube Data API v3, using ``YOUTUBE_API_KEY`` from
       ``open_webui.config`` when that name can be imported.
    2. Scraping the ``<title>`` element of the public watch page.

    Stage 2 runs whenever stage 1 does not yield a title (setting not
    importable, request error, non-200 response, or an empty ``items``
    list), not only when the import fails. Failures are logged and
    swallowed so a missing title never aborts transcript loading.

    Returns:
        The title string, or ``None`` when neither stage produced one.
    """
    import html
    import re

    try:
        import requests
    except ImportError as e:
        log.warning(f"Error getting video title: {e}")
        return None

    # requests has no default timeout; without one a stalled connection
    # would block the ingestion request indefinitely.
    request_timeout = 10

    # First try using YouTube Data API v3 if available
    try:
        from open_webui.config import YOUTUBE_API_KEY

        url = f"https://www.googleapis.com/youtube/v3/videos?id={self.video_id}&key={YOUTUBE_API_KEY}&part=snippet"
        response = requests.get(url, timeout=request_timeout)
        if response.status_code == 200:
            items = response.json().get("items")
            if items:
                return items[0]["snippet"]["title"]
    except ImportError:
        # No API key setting in this build; go straight to scraping.
        pass
    except Exception as e:
        # Log only the exception type: the request URL embeds the API key
        # and request errors usually echo the URL in their message.
        log.warning(f"YouTube Data API title lookup failed: {type(e).__name__}")

    # Fallback to scraping the title from YouTube page
    try:
        url = f"https://www.youtube.com/watch?v={self.video_id}"
        response = requests.get(url, timeout=request_timeout)
        if response.status_code == 200:
            title_match = re.search(
                r"<title>(.+?)</title>", response.text, re.IGNORECASE | re.DOTALL
            )
            if title_match:
                # The page title is HTML-escaped and carries a site suffix.
                title = html.unescape(title_match.group(1)).rstrip()
                site_suffix = " - YouTube"
                if title.endswith(site_suffix):
                    title = title[: -len(site_suffix)]
                title = title.strip()
                return title or None
    except Exception as e:
        log.warning(f"Error getting video title: {e}")
    return None
def load(self) -> List[Document]:
"""Load YouTube transcripts into `Document` objects."""
@ -102,16 +135,53 @@ class YoutubeLoader:
return []
# First try to get transcript in requested language
transcript = transcript_list.find_transcript(self.language)
except NoTranscriptFound:
transcript = transcript_list.find_transcript(["en"])
# Fallback: try to get any available transcript
available_transcripts = list(transcript_list._generated_transcripts.values())
if available_transcripts:
transcript = available_transcripts[0]
log.info(f"Using first available transcript in language: {transcript.language_code}")
log.error("No transcripts found for video")
return []
transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
transcript = " ".join(
lambda transcript_piece: transcript_piece["text"].strip(" "),
# Get video title and add it to base metadata
title = self._get_video_title()
if title:
self._metadata["title"] = title
# Add the base video URL to metadata
base_url = f"https://www.youtube.com/watch?v={self.video_id}"
self._metadata["source_url"] = base_url
# Combine pieces into a single text while tracking timestamp positions
full_text = ""
timestamp_map = []
for piece in transcript_pieces:
start_char = len(full_text)
text = piece["text"].strip()
full_text += text + " "
end_char = len(full_text)
"start": start_char,
"end": end_char,
"time": piece["start"],
"duration": piece["duration"]
# Create a single document that will be split by Langchain's text splitter
doc = Document(
"timestamp_map": timestamp_map # Store timestamp mapping in metadata
return [Document(page_content=transcript, metadata=self._metadata)]
return [doc]
@ -7,7 +7,7 @@ import shutil
import uuid
from datetime import datetime
from pathlib import Path
from typing import Iterator, List, Optional, Sequence, Union
from typing import Iterator, Optional, Sequence, Union, List, Dict, Any, Tuple
from fastapi import (
@ -28,7 +28,9 @@ import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter
from langchain_core.documents import Document
from open_webui.models.files import FileModel, Files
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
from open_webui.models.files import FileModel, Files, FileForm
from open_webui.models.knowledge import Knowledges
from open_webui.storage.provider import Storage
@ -150,6 +152,14 @@ def get_rf(
raise Exception(ERROR_MESSAGES.DEFAULT("CrossEncoder error"))
return rf
def add_timestamp_to_youtube_url(url: str, timestamp: int) -> str:
    """Return ``url`` with its ``t`` query parameter set to ``timestamp``.

    Any existing ``t`` parameter is replaced; every other part of the URL
    (scheme, host, path, remaining query parameters, fragment) is kept.

    Args:
        url: A YouTube watch or short URL.
        timestamp: Offset into the video in seconds. Fractional values are
            truncated and negative values are clamped to 0.

    Returns:
        The URL with ``t=<seconds>`` in its query string.
    """
    parsed = urlparse(url)
    # keep_blank_values: otherwise parameters such as "feature=" would be
    # silently dropped from the rewritten URL.
    query_dict = parse_qs(parsed.query, keep_blank_values=True)
    query_dict["t"] = [str(max(0, int(timestamp)))]
    new_query = urlencode(query_dict, doseq=True)
    return urlunparse(parsed._replace(query=new_query))
@ -652,6 +662,33 @@ async def update_query_settings(
def interpolate_timestamp(chunk_start: int, chunk_end: int, timestamp_map: List[dict]) -> Tuple[float, float]:
    """
    Find the appropriate timestamp for a chunk based on its character position
    Returns (start_time, end_time) as floats in seconds

    Each ``timestamp_map`` entry is read through its ``"start"`` and ``"end"``
    keys (character offsets, treated here as a half-open ``[start, end)``
    span) and its ``"time"`` and ``"duration"`` keys (seconds).
    ``chunk_end`` is treated as exclusive.

    If no entry contains ``chunk_start``, the nearest entry starting at or
    before it is used (0.0 when there is none). If no entry contains the end
    of the chunk, the nearest entry ending at or after ``chunk_end`` is used
    (``start_time`` when there is none).
    """
    # Find the timestamp entry that contains the start of our chunk.
    # Half-open test: an offset equal to an entry's "end" belongs to the
    # next entry, not to this one.
    containing = next(
        (e for e in timestamp_map if e["start"] <= chunk_start < e["end"]),
        None,
    )
    if containing is not None:
        start_time = containing["time"]
    else:
        # If not found, use the closest previous timestamp
        earlier = [e for e in timestamp_map if e["start"] <= chunk_start]
        start_time = (
            max(earlier, key=lambda e: e["start"])["time"] if earlier else 0.0
        )

    # Find the timestamp entry that contains the end of our chunk.
    # chunk_end is exclusive, so an entry that merely begins at chunk_end
    # contributes no text to the chunk and must not match.
    containing = next(
        (e for e in reversed(timestamp_map) if e["start"] < chunk_end <= e["end"]),
        None,
    )
    if containing is not None:
        end_time = containing["time"] + containing["duration"]
    else:
        # If not found, use the closest next timestamp
        later = [e for e in timestamp_map if e["end"] >= chunk_end]
        if later:
            nearest = min(later, key=lambda e: e["end"])
            end_time = nearest["time"] + nearest["duration"]
        else:
            end_time = start_time

    return start_time, end_time
def save_docs_to_vector_db(
request: Request,
@ -695,6 +732,14 @@ def save_docs_to_vector_db(
if split:
# Check if this is a YouTube document by looking at the first doc's metadata
is_youtube = (len(docs) == 1 and
docs[0].metadata.get("type") == "youtube")
# Store timestamp_map before splitting if it's a YouTube document
original_timestamp_map = docs[0].metadata.get(
"timestamp_map") if is_youtube else None
if request.app.state.config.TEXT_SPLITTER in ["", "character"]:
text_splitter = RecursiveCharacterTextSplitter(
@ -718,27 +763,64 @@ def save_docs_to_vector_db(
docs = text_splitter.split_documents(docs)
# Only process timestamps for YouTube documents
if is_youtube and original_timestamp_map:
for doc in docs:
start_index = doc.metadata.get("start_index", 0)
end_index = start_index + len(doc.page_content)
start_time, end_time = interpolate_timestamp(
"start_time": start_time,
"source_url": add_timestamp_to_youtube_url(doc.metadata['source_url'], int(start_time))
# Remove the timestamp_map from individual chunks
doc.metadata.pop("timestamp_map", None)
if len(docs) == 0:
texts = [doc.page_content for doc in docs]
metadatas = [
**(metadata if metadata else {}),
"embedding_config": json.dumps(
"engine": request.app.state.config.RAG_EMBEDDING_ENGINE,
"model": request.app.state.config.RAG_EMBEDDING_MODEL,
for doc in docs
metadatas = []
for doc in docs:
# Preserve the original metadata
doc_metadata = doc.metadata.copy()
# Add any additional metadata
if metadata:
# Ensure source and source_url are preserved
if "source_url" in doc_metadata:
doc_metadata["source"] = doc_metadata["source_url"]
# Add embedding config
doc_metadata["embedding_config"] = json.dumps(
"engine": request.app.state.config.RAG_EMBEDDING_ENGINE,
"model": request.app.state.config.RAG_EMBEDDING_MODEL,
# Convert datetime objects to strings
for key, value in doc_metadata.items():
if isinstance(value, datetime):
doc_metadata[key] = str(value)
# Debug log for final metadata
log.info(f"Final document metadata for ChromaDB: {doc_metadata}")
# ChromaDB does not like datetime formats
# for meta-data so convert them to string.
for metadata in metadatas:
# ChromaDB does not like datetime formats
# for meta-data so convert them to string.
for key, value in metadata.items():
if isinstance(value, datetime):
metadata[key] = str(value)
@ -803,6 +885,8 @@ class ProcessFileForm(BaseModel):
file_id: str
content: Optional[str] = None
collection_name: Optional[str] = None
type: Optional[str] = "file" # Default to 'file' if not specified
url: Optional[str] = None # URL for web content
@ -813,12 +897,41 @@ def process_file(
file = Files.get_file_by_id(form_data.file_id)
content = file.data.get("content", "")
# Create base metadata
metadata = {
**file.meta, # Original file metadata
"name": file.filename,
"created_by": file.user_id,
"file_id": file.id,
"source": file.filename,
# For YouTube content, we skip embedding but still process the file association
if "type" in metadata and metadata["type"] == "youtube":
log.info("Processing YouTube content - skipping embedding")
return {
"status": True,
"collection_name": form_data.collection_name,
"content": content,
"file": {
"id": file.id,
"meta": metadata
collection_name = form_data.collection_name
if collection_name is None:
collection_name = f"file-{file.id}"
# Get the document type, default to 'file' if not specified
doc_type = form_data.type if form_data.type else "file"
# Get source URL if available
source = form_data.url if form_data.url else file.filename
if form_data.content:
# Update the content in the file
# Usage: /files/{file_id}/data/content/update
@ -833,11 +946,11 @@ def process_file(
"name": file.filename,
"created_by": file.user_id,
"file_id": file.id,
"source": file.filename,
"source": source,
"type": doc_type,
text_content = form_data.content
elif form_data.collection_name:
# Check if the file has already been processed and save the content
@ -851,7 +964,11 @@ def process_file(
docs = [
"type": doc_type,
"source": source,
for idx, id in enumerate(result.ids[0])
@ -864,7 +981,8 @@ def process_file(
"name": file.filename,
"created_by": file.user_id,
"file_id": file.id,
"source": file.filename,
"source": source,
"type": doc_type,
@ -893,7 +1011,8 @@ def process_file(
"name": file.filename,
"created_by": file.user_id,
"file_id": file.id,
"source": file.filename,
"source": source,
"type": doc_type,
for doc in docs
@ -907,7 +1026,8 @@ def process_file(
"name": file.filename,
"created_by": file.user_id,
"file_id": file.id,
"source": file.filename,
"source": source,
"type": doc_type,
@ -919,6 +1039,11 @@ def process_file(
{"content": text_content},
{"content": text_content},
hash = calculate_sha256_string(text_content)
Files.update_file_hash_by_id(file.id, hash)
@ -1023,19 +1148,64 @@ def process_youtube_video(
content = " ".join([doc.page_content for doc in docs])
log.debug(f"text_content: {content}")
save_docs_to_vector_db(request, docs, collection_name, overwrite=True)
# Get video title from metadata or fallback to URL
video_title = docs[0].metadata.get("title", form_data.url)
# Create a unique file ID for this video
file_id = str(uuid.uuid4())
# Create a file record
file_item = Files.insert_new_file(
user.id if user else None,
"id": file_id,
"filename": video_title,
"path": form_data.url, # Use the video URL as the path
"meta": {
"name": video_title,
"content_type": "text/plain",
"size": len(content),
"source": form_data.url,
"source_url": add_timestamp_to_youtube_url(form_data.url, 0),
"type": "youtube"
"data": {
"content": content
# Add file-specific metadata
file_metadata = {
"source": form_data.url,
"source_url": add_timestamp_to_youtube_url(form_data.url, 0),
"title": video_title,
"type": "youtube",
"name": video_title,
"file_id": file_id,
"created_by": user.id if user else None
# Update all docs with the file metadata
for doc in docs:
# Debug log
log.info(f"Document metadata before saving: {doc.metadata}")
save_docs_to_vector_db(request, docs, collection_name, overwrite=False, add=True)
return {
"status": True,
"collection_name": collection_name,
"filename": form_data.url,
"id": file_id, # Return the file ID directly
"filename": video_title,
"file": {
"data": {
"content": content,
"meta": {
"name": form_data.url,
"meta": file_metadata
except Exception as e:
@ -344,7 +344,7 @@ export const processFile = async (
return res;
export const processYoutubeVideo = async (token: string, url: string) => {
export const processYoutubeVideo = async (token: string, url: string, collection_name: string) => {
let error = null;
const res = await fetch(`${RETRIEVAL_API_BASE_URL}/process/youtube`, {
@ -355,7 +355,8 @@ export const processYoutubeVideo = async (token: string, url: string) => {
authorization: `Bearer ${token}`
body: JSON.stringify({
url: url
url: url,
collection_name: collection_name
.then(async (res) => {
@ -24,7 +24,7 @@
import { transcribeAudio } from '$lib/apis/audio';
import { blobToFile } from '$lib/utils';
import { processFile } from '$lib/apis/retrieval';
import { processFile, processYoutubeVideo } from '$lib/apis/retrieval';
import Spinner from '$lib/components/common/Spinner.svelte';
import Files from './KnowledgeBase/Files.svelte';
@ -32,6 +32,8 @@
import AddContentMenu from './KnowledgeBase/AddContentMenu.svelte';
import AddTextContentModal from './KnowledgeBase/AddTextContentModal.svelte';
import AddYoutubeModal from './KnowledgeBase/AddYoutubeModal.svelte';
import SyncConfirmDialog from '../../common/ConfirmDialog.svelte';
import RichTextInput from '$lib/components/common/RichTextInput.svelte';
@ -64,6 +66,7 @@
let showAddTextContentModal = false;
let showSyncConfirmModal = false;
let showAccessControlModal = false;
let showAddYoutubeModal = false;
let inputFiles = null;
@ -584,6 +587,53 @@
on:submit={async (e) => {
const url = e.detail.url;
// Create a temporary file entry
const tempItemId = uuidv4();
const fileItem = {
type: 'youtube',
file: '',
id: null,
url: url,
name: url, // We'll update this with video title later
size: 0,
status: 'uploading',
error: '',
itemId: tempItemId
knowledge.files = [...(knowledge.files ?? []), fileItem];
// Process the YouTube video with knowledge base ID as collection
const res = await processYoutubeVideo(localStorage.token, url, id).catch((err) => {
return null;
if (res) {
// Add file to knowledge base using the ID from the response
const updatedKnowledge = await addFileToKnowledgeById(localStorage.token, id, res.id).catch((e) => {
return null;
if (updatedKnowledge) {
knowledge = updatedKnowledge;
toast.success($i18n.t('YouTube video processed successfully.'));
} else {
toast.error($i18n.t('Failed to add video to knowledge base.'));
knowledge.files = knowledge.files.filter(f => f.itemId !== tempItemId);
} else {
knowledge.files = knowledge.files.filter(f => f.itemId !== tempItemId);
@ -825,6 +875,8 @@
} else if (e.detail.type === 'text') {
showAddTextContentModal = true;
} else if (e.detail.type === 'youtube') {
showAddYoutubeModal = true;
} else {
@ -10,6 +10,7 @@
import BarsArrowUp from '$lib/components/icons/BarsArrowUp.svelte';
import FolderOpen from '$lib/components/icons/FolderOpen.svelte';
import ArrowPath from '$lib/components/icons/ArrowPath.svelte';
import Link from '$lib/components/icons/Link.svelte';
const i18n = getContext('i18n');
@ -102,6 +103,15 @@
<BarsArrowUp strokeWidth="2" />
<div class="flex items-center">{$i18n.t('Add text content')}</div>
class="flex gap-2 items-center px-3 py-2 text-sm cursor-pointer hover:bg-gray-50 dark:hover:bg-gray-800 rounded-md"
on:click={() => {
dispatch('upload', { type: 'youtube' });
<Link strokeWidth="2" />
<div class="flex items-center">{$i18n.t('Add YouTube URL')}</div>
@ -0,0 +1,83 @@
<script lang="ts">
import { toast } from 'svelte-sonner';
import { getContext, createEventDispatcher } from 'svelte';
const i18n = getContext('i18n');
const dispatch = createEventDispatcher();
import Modal from '$lib/components/common/Modal.svelte';
import XMark from '$lib/components/icons/XMark.svelte';
import Tooltip from '$lib/components/common/Tooltip.svelte';
export let show = false;
let url = '';
// Basic YouTube URL validation
function isValidYoutubeUrl(url: string) {
const pattern = /^(https?:\/\/)?(www\.)?(youtube\.com|youtu\.?be)\/.+$/;
return pattern.test(url);
<Modal size="md" className="bg-white dark:bg-gray-900" bind:show>
<div class="absolute top-0 right-0 p-5">
class="self-center dark:text-white"
on:click={() => {
show = false;
<XMark className="size-3.5" />
<div class="flex flex-col w-full h-full md:space-x-4 dark:text-gray-200">
class="flex flex-col w-full h-full"
on:submit|preventDefault={() => {
if (!url.trim()) {
toast.error($i18n.t('Please enter a YouTube URL.'));
if (!isValidYoutubeUrl(url.trim())) {
toast.error($i18n.t('Please enter a valid YouTube URL.'));
dispatch('submit', { url: url.trim() });
show = false;
url = '';
<div class="flex-1 w-full h-full flex justify-center overflow-auto px-5 py-4">
<div class="max-w-md py-2 md:py-10 w-full flex flex-col gap-4">
<h2 class="text-xl font-semibold">{$i18n.t('Add YouTube Video')}</h2>
<div class="w-full">
class="w-full p-2 border rounded dark:border-gray-700 bg-transparent"
placeholder={$i18n.t('Enter YouTube URL')}
<div class="flex flex-row items-center justify-end text-sm font-medium flex-shrink-0 mt-1 p-4 gap-1.5">
<div class="flex-shrink-0">
<Tooltip content={$i18n.t('Add')}>
class="px-3.5 py-2 bg-black text-white dark:bg-white dark:text-black transition rounded-full"
@ -411,6 +411,7 @@
"Export Tools": "",
"External Models": "",
"Failed to add file.": "",
"Failed to add video to knowledge base.": "",
"Failed to create API Key.": "",
"Failed to read clipboard contents": "",
"Failed to save models configuration": "",
@ -1061,5 +1062,6 @@
"Your account status is currently pending activation.": "",
"Your entire contribution will go directly to the plugin developer; Open WebUI does not take any percentage. However, the chosen funding platform might have its own fees.": "",
"Youtube": "",
"Youtube Loader Settings": ""
"Youtube Loader Settings": "",
"YouTube video processed successfully.": ""
Reference in New Issue
Block a user