feat: Add YouTube Video Ingestion Support in Knowledge Base subsystem

This commit is contained in:
Juanan Pereira 2024-12-30 13:53:08 +01:00
parent f6a54c96bc
commit ba3fe33ef8
7 changed files with 427 additions and 39 deletions

View File

@ -69,6 +69,39 @@ class YoutubeLoader:
else:
self.language = language
def _get_video_title(self) -> Optional[str]:
    """Get the video title via the YouTube Data API, falling back to scraping.

    Tries the YouTube Data API v3 first (when ``YOUTUBE_API_KEY`` is
    configured), then falls back to scraping the ``<title>`` tag of the
    watch page.

    Returns:
        The video title, or ``None`` if neither method succeeds.
    """
    try:
        import requests

        # First try using YouTube Data API v3 if available.
        try:
            from open_webui.config import YOUTUBE_API_KEY

            if YOUTUBE_API_KEY:
                url = (
                    "https://www.googleapis.com/youtube/v3/videos"
                    f"?id={self.video_id}&key={YOUTUBE_API_KEY}&part=snippet"
                )
                # Timeout so a hung API call cannot block ingestion forever.
                response = requests.get(url, timeout=10)
                if response.status_code == 200:
                    data = response.json()
                    if data.get("items"):
                        return data["items"][0]["snippet"]["title"]
        except ImportError:
            # Config module (and thus the API key) unavailable; fall through.
            pass

        # Fallback to scraping the title from the YouTube page.
        url = f"https://www.youtube.com/watch?v={self.video_id}"
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            import html
            import re

            title_match = re.search(r'<title>(.+?)</title>', response.text)
            if title_match:
                # Decode HTML entities (e.g. "&amp;") in the raw page title.
                title = html.unescape(title_match.group(1))
                # The page title is "<video title> - YouTube"; drop the suffix
                # so we store the actual video title.
                if title.endswith(" - YouTube"):
                    title = title[: -len(" - YouTube")]
                return title
        return None
    except Exception as e:
        print(f"Error getting video title: {e}")
        return None
def load(self) -> List[Document]:
"""Load YouTube transcripts into `Document` objects."""
try:
@ -102,16 +135,53 @@ class YoutubeLoader:
return []
try:
# First try to get transcript in requested language
transcript = transcript_list.find_transcript(self.language)
except NoTranscriptFound:
transcript = transcript_list.find_transcript(["en"])
# Fallback: try to get any available transcript
available_transcripts = list(transcript_list._generated_transcripts.values())
if available_transcripts:
transcript = available_transcripts[0]
log.info(f"Using first available transcript in language: {transcript.language_code}")
else:
log.error("No transcripts found for video")
return []
transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
transcript = " ".join(
map(
lambda transcript_piece: transcript_piece["text"].strip(" "),
transcript_pieces,
)
# Get video title and add it to base metadata
title = self._get_video_title()
if title:
self._metadata["title"] = title
# Add the base video URL to metadata
base_url = f"https://www.youtube.com/watch?v={self.video_id}"
self._metadata["source_url"] = base_url
# Combine pieces into a single text while tracking timestamp positions
full_text = ""
timestamp_map = []
for piece in transcript_pieces:
start_char = len(full_text)
text = piece["text"].strip()
full_text += text + " "
end_char = len(full_text)
timestamp_map.append({
"start": start_char,
"end": end_char,
"time": piece["start"],
"duration": piece["duration"]
})
# Create a single document that will be split by Langchain's text splitter
doc = Document(
page_content=full_text.strip(),
metadata={
**self._metadata,
"timestamp_map": timestamp_map # Store timestamp mapping in metadata
}
)
return [Document(page_content=transcript, metadata=self._metadata)]
return [doc]

View File

@ -7,7 +7,7 @@ import shutil
import uuid
from datetime import datetime
from pathlib import Path
from typing import Iterator, List, Optional, Sequence, Union
from typing import Iterator, Optional, Sequence, Union, List, Dict, Any, Tuple
from fastapi import (
Depends,
@ -28,7 +28,9 @@ import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter
from langchain_core.documents import Document
from open_webui.models.files import FileModel, Files
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
from open_webui.models.files import FileModel, Files, FileForm
from open_webui.models.knowledge import Knowledges
from open_webui.storage.provider import Storage
@ -150,6 +152,14 @@ def get_rf(
raise Exception(ERROR_MESSAGES.DEFAULT("CrossEncoder error"))
return rf
def add_timestamp_to_youtube_url(url: str, timestamp: int) -> str:
    """Return *url* with its ``t`` query parameter set to *timestamp* seconds.

    Any existing ``t`` parameter is overwritten; all other query parameters
    are preserved in order.
    """
    parts = urlparse(url)
    params = parse_qs(parts.query)
    params['t'] = [str(timestamp)]
    return urlunparse(parts._replace(query=urlencode(params, doseq=True)))
##########################################
#
@ -652,6 +662,33 @@ async def update_query_settings(
####################################
def interpolate_timestamp(chunk_start: int, chunk_end: int, timestamp_map: List[dict]) -> Tuple[float, float]:
    """
    Find the appropriate timestamps for a chunk based on its character positions.

    Args:
        chunk_start: Character offset where the chunk begins in the full text.
        chunk_end: Character offset where the chunk ends.
        timestamp_map: Entries with "start"/"end" character offsets and
            "time"/"duration" in seconds, in transcript order.

    Returns:
        (start_time, end_time) as floats in seconds.
    """
    # Find the timestamp entry that contains the start of our chunk.
    for entry in timestamp_map:
        if entry["start"] <= chunk_start <= entry["end"]:
            start_time = entry["time"]
            break
    else:
        # If not found, use the closest *previous* timestamp: the latest
        # entry starting at or before chunk_start. (The previous code used
        # min() here, which picked the earliest entry, not the closest.)
        start_time = max(
            (e["time"] for e in timestamp_map if e["start"] <= chunk_start),
            default=0,
        )

    # Find the timestamp entry that contains the end of our chunk.
    for entry in reversed(timestamp_map):
        if entry["start"] <= chunk_end <= entry["end"]:
            end_time = entry["time"] + entry["duration"]
            break
    else:
        # If not found, use the closest *next* timestamp: the earliest entry
        # ending at or after chunk_end. (The previous code used max() here,
        # which picked the furthest entry, not the closest.)
        end_time = min(
            (e["time"] + e["duration"] for e in timestamp_map if e["end"] >= chunk_end),
            default=start_time,
        )

    return start_time, end_time
def save_docs_to_vector_db(
request: Request,
docs,
@ -695,6 +732,14 @@ def save_docs_to_vector_db(
raise ValueError(ERROR_MESSAGES.DUPLICATE_CONTENT)
if split:
# Check if this is a YouTube document by looking at the first doc's metadata
is_youtube = (len(docs) == 1 and
docs[0].metadata.get("type") == "youtube")
# Store timestamp_map before splitting if it's a YouTube document
original_timestamp_map = docs[0].metadata.get(
"timestamp_map") if is_youtube else None
if request.app.state.config.TEXT_SPLITTER in ["", "character"]:
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=request.app.state.config.CHUNK_SIZE,
@ -718,27 +763,64 @@ def save_docs_to_vector_db(
docs = text_splitter.split_documents(docs)
# Only process timestamps for YouTube documents
if is_youtube and original_timestamp_map:
for doc in docs:
start_index = doc.metadata.get("start_index", 0)
end_index = start_index + len(doc.page_content)
start_time, end_time = interpolate_timestamp(
start_index,
end_index,
original_timestamp_map
)
doc.metadata.update({
"start_time": start_time,
"source_url": add_timestamp_to_youtube_url(doc.metadata['source_url'], int(start_time))
})
# Remove the timestamp_map from individual chunks
doc.metadata.pop("timestamp_map", None)
if len(docs) == 0:
raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT)
texts = [doc.page_content for doc in docs]
metadatas = [
{
**doc.metadata,
**(metadata if metadata else {}),
"embedding_config": json.dumps(
{
"engine": request.app.state.config.RAG_EMBEDDING_ENGINE,
"model": request.app.state.config.RAG_EMBEDDING_MODEL,
}
),
}
for doc in docs
]
metadatas = []
for doc in docs:
# Preserve the original metadata
doc_metadata = doc.metadata.copy()
# Add any additional metadata
if metadata:
doc_metadata.update(metadata)
# Ensure source and source_url are preserved
if "source_url" in doc_metadata:
doc_metadata["source"] = doc_metadata["source_url"]
# Add embedding config
doc_metadata["embedding_config"] = json.dumps(
{
"engine": request.app.state.config.RAG_EMBEDDING_ENGINE,
"model": request.app.state.config.RAG_EMBEDDING_MODEL,
}
)
# Convert datetime objects to strings
for key, value in doc_metadata.items():
if isinstance(value, datetime):
doc_metadata[key] = str(value)
# Debug log for final metadata
log.info(f"Final document metadata for ChromaDB: {doc_metadata}")
metadatas.append(doc_metadata)
# ChromaDB does not like datetime formats
# for meta-data so convert them to string.
for metadata in metadatas:
# ChromaDB does not like datetime formats
# for meta-data so convert them to string.
for key, value in metadata.items():
if isinstance(value, datetime):
metadata[key] = str(value)
@ -803,6 +885,8 @@ class ProcessFileForm(BaseModel):
file_id: str
content: Optional[str] = None
collection_name: Optional[str] = None
type: Optional[str] = "file" # Default to 'file' if not specified
url: Optional[str] = None # URL for web content
@router.post("/process/file")
@ -813,12 +897,41 @@ def process_file(
):
try:
file = Files.get_file_by_id(form_data.file_id)
content = file.data.get("content", "")
# Create base metadata
metadata = {
**file.meta, # Original file metadata
"name": file.filename,
"created_by": file.user_id,
"file_id": file.id,
"source": file.filename,
}
# For YouTube content, we skip embedding but still process the file association
if "type" in metadata and metadata["type"] == "youtube":
log.info("Processing YouTube content - skipping embedding")
return {
"status": True,
"collection_name": form_data.collection_name,
"content": content,
"file": {
"id": file.id,
"meta": metadata
}
}
collection_name = form_data.collection_name
if collection_name is None:
collection_name = f"file-{file.id}"
# Get the document type, default to 'file' if not specified
doc_type = form_data.type if form_data.type else "file"
# Get source URL if available
source = form_data.url if form_data.url else file.filename
if form_data.content:
# Update the content in the file
# Usage: /files/{file_id}/data/content/update
@ -833,11 +946,11 @@ def process_file(
"name": file.filename,
"created_by": file.user_id,
"file_id": file.id,
"source": file.filename,
"source": source,
"type": doc_type,
},
)
]
text_content = form_data.content
elif form_data.collection_name:
# Check if the file has already been processed and save the content
@ -851,7 +964,11 @@ def process_file(
docs = [
Document(
page_content=result.documents[0][idx],
metadata=result.metadatas[0][idx],
metadata={
**result.metadatas[0][idx],
"type": doc_type,
"source": source,
},
)
for idx, id in enumerate(result.ids[0])
]
@ -864,7 +981,8 @@ def process_file(
"name": file.filename,
"created_by": file.user_id,
"file_id": file.id,
"source": file.filename,
"source": source,
"type": doc_type,
},
)
]
@ -893,7 +1011,8 @@ def process_file(
"name": file.filename,
"created_by": file.user_id,
"file_id": file.id,
"source": file.filename,
"source": source,
"type": doc_type,
},
)
for doc in docs
@ -907,7 +1026,8 @@ def process_file(
"name": file.filename,
"created_by": file.user_id,
"file_id": file.id,
"source": file.filename,
"source": source,
"type": doc_type,
},
)
]
@ -919,6 +1039,11 @@ def process_file(
{"content": text_content},
)
Files.update_file_data_by_id(
file.id,
{"content": text_content},
)
hash = calculate_sha256_string(text_content)
Files.update_file_hash_by_id(file.id, hash)
@ -1023,19 +1148,64 @@ def process_youtube_video(
content = " ".join([doc.page_content for doc in docs])
log.debug(f"text_content: {content}")
save_docs_to_vector_db(request, docs, collection_name, overwrite=True)
# Get video title from metadata or fallback to URL
video_title = docs[0].metadata.get("title", form_data.url)
# Create a unique file ID for this video
file_id = str(uuid.uuid4())
# Create a file record
file_item = Files.insert_new_file(
user.id if user else None,
FileForm(
**{
"id": file_id,
"filename": video_title,
"path": form_data.url, # Use the video URL as the path
"meta": {
"name": video_title,
"content_type": "text/plain",
"size": len(content),
"source": form_data.url,
"source_url": add_timestamp_to_youtube_url(form_data.url, 0),
"type": "youtube"
},
"data": {
"content": content
}
}
),
)
# Add file-specific metadata
file_metadata = {
"source": form_data.url,
"source_url": add_timestamp_to_youtube_url(form_data.url, 0),
"title": video_title,
"type": "youtube",
"name": video_title,
"file_id": file_id,
"created_by": user.id if user else None
}
# Update all docs with the file metadata
for doc in docs:
doc.metadata.update(file_metadata)
# Debug log
log.info(f"Document metadata before saving: {doc.metadata}")
save_docs_to_vector_db(request, docs, collection_name, overwrite=False, add=True)
return {
"status": True,
"collection_name": collection_name,
"filename": form_data.url,
"id": file_id, # Return the file ID directly
"filename": video_title,
"file": {
"data": {
"content": content,
},
"meta": {
"name": form_data.url,
},
"meta": file_metadata
},
}
except Exception as e:

View File

@ -344,7 +344,7 @@ export const processFile = async (
return res;
};
export const processYoutubeVideo = async (token: string, url: string) => {
export const processYoutubeVideo = async (token: string, url: string, collection_name: string) => {
let error = null;
const res = await fetch(`${RETRIEVAL_API_BASE_URL}/process/youtube`, {
@ -355,7 +355,8 @@ export const processYoutubeVideo = async (token: string, url: string) => {
authorization: `Bearer ${token}`
},
body: JSON.stringify({
url: url
url: url,
collection_name: collection_name
})
})
.then(async (res) => {

View File

@ -24,7 +24,7 @@
import { transcribeAudio } from '$lib/apis/audio';
import { blobToFile } from '$lib/utils';
import { processFile } from '$lib/apis/retrieval';
import { processFile, processYoutubeVideo } from '$lib/apis/retrieval';
import Spinner from '$lib/components/common/Spinner.svelte';
import Files from './KnowledgeBase/Files.svelte';
@ -32,6 +32,8 @@
import AddContentMenu from './KnowledgeBase/AddContentMenu.svelte';
import AddTextContentModal from './KnowledgeBase/AddTextContentModal.svelte';
import AddYoutubeModal from './KnowledgeBase/AddYoutubeModal.svelte';
import SyncConfirmDialog from '../../common/ConfirmDialog.svelte';
import RichTextInput from '$lib/components/common/RichTextInput.svelte';
@ -64,6 +66,7 @@
let showAddTextContentModal = false;
let showSyncConfirmModal = false;
let showAccessControlModal = false;
let showAddYoutubeModal = false;
let inputFiles = null;
@ -584,6 +587,53 @@
}}
/>
<AddYoutubeModal
bind:show={showAddYoutubeModal}
on:submit={async (e) => {
const url = e.detail.url;
// Create a temporary file entry
const tempItemId = uuidv4();
const fileItem = {
type: 'youtube',
file: '',
id: null,
url: url,
name: url, // We'll update this with video title later
size: 0,
status: 'uploading',
error: '',
itemId: tempItemId
};
knowledge.files = [...(knowledge.files ?? []), fileItem];
// Process the YouTube video with knowledge base ID as collection
const res = await processYoutubeVideo(localStorage.token, url, id).catch((err) => {
toast.error(err);
return null;
});
if (res) {
// Add file to knowledge base using the ID from the response
const updatedKnowledge = await addFileToKnowledgeById(localStorage.token, id, res.id).catch((e) => {
toast.error(e);
return null;
});
if (updatedKnowledge) {
knowledge = updatedKnowledge;
toast.success($i18n.t('YouTube video processed successfully.'));
} else {
toast.error($i18n.t('Failed to add video to knowledge base.'));
knowledge.files = knowledge.files.filter(f => f.itemId !== tempItemId);
}
} else {
knowledge.files = knowledge.files.filter(f => f.itemId !== tempItemId);
}
}}
/>
<input
id="files-input"
bind:files={inputFiles}
@ -825,6 +875,8 @@
uploadDirectoryHandler();
} else if (e.detail.type === 'text') {
showAddTextContentModal = true;
} else if (e.detail.type === 'youtube') {
showAddYoutubeModal = true;
} else {
document.getElementById('files-input').click();
}

View File

@ -10,6 +10,7 @@
import BarsArrowUp from '$lib/components/icons/BarsArrowUp.svelte';
import FolderOpen from '$lib/components/icons/FolderOpen.svelte';
import ArrowPath from '$lib/components/icons/ArrowPath.svelte';
import Link from '$lib/components/icons/Link.svelte';
const i18n = getContext('i18n');
@ -102,6 +103,15 @@
<BarsArrowUp strokeWidth="2" />
<div class="flex items-center">{$i18n.t('Add text content')}</div>
</DropdownMenu.Item>
<DropdownMenu.Item
class="flex gap-2 items-center px-3 py-2 text-sm cursor-pointer hover:bg-gray-50 dark:hover:bg-gray-800 rounded-md"
on:click={() => {
dispatch('upload', { type: 'youtube' });
}}
>
<Link strokeWidth="2" />
<div class="flex items-center">{$i18n.t('Add YouTube URL')}</div>
</DropdownMenu.Item>
</DropdownMenu.Content>
</div>
</Dropdown>

View File

@ -0,0 +1,83 @@
<script lang="ts">
import { toast } from 'svelte-sonner';
import { getContext, createEventDispatcher } from 'svelte';
const i18n = getContext('i18n');
const dispatch = createEventDispatcher();
import Modal from '$lib/components/common/Modal.svelte';
import XMark from '$lib/components/icons/XMark.svelte';
import Tooltip from '$lib/components/common/Tooltip.svelte';
// Whether the modal is visible; two-way bound by the parent via `bind:show`.
export let show = false;
// The YouTube URL typed by the user; cleared after a successful submit.
let url = '';
// Basic YouTube URL validation
// NOTE(review): `youtu\.?be` also accepts the dot-less host "youtube", so a
// string like "youtube/x" passes — confirm whether stricter matching is wanted.
function isValidYoutubeUrl(url: string) {
const pattern = /^(https?:\/\/)?(www\.)?(youtube\.com|youtu\.?be)\/.+$/;
return pattern.test(url);
}
</script>
<Modal size="md" className="bg-white dark:bg-gray-900" bind:show>
<!-- Close (X) button in the top-right corner -->
<div class="absolute top-0 right-0 p-5">
<button
class="self-center dark:text-white"
type="button"
on:click={() => {
show = false;
}}
>
<XMark className="size-3.5" />
</button>
</div>
<div class="flex flex-col w-full h-full md:space-x-4 dark:text-gray-200">
<!-- On submit: validate the URL, emit a `submit` event carrying the trimmed
URL to the parent, then close and reset the modal. -->
<form
class="flex flex-col w-full h-full"
on:submit|preventDefault={() => {
if (!url.trim()) {
toast.error($i18n.t('Please enter a YouTube URL.'));
return;
}
if (!isValidYoutubeUrl(url.trim())) {
toast.error($i18n.t('Please enter a valid YouTube URL.'));
return;
}
dispatch('submit', { url: url.trim() });
show = false;
url = '';
}}
>
<div class="flex-1 w-full h-full flex justify-center overflow-auto px-5 py-4">
<div class="max-w-md py-2 md:py-10 w-full flex flex-col gap-4">
<h2 class="text-xl font-semibold">{$i18n.t('Add YouTube Video')}</h2>
<div class="w-full">
<input
class="w-full p-2 border rounded dark:border-gray-700 bg-transparent"
type="text"
bind:value={url}
placeholder={$i18n.t('Enter YouTube URL')}
required
/>
</div>
</div>
</div>
<div class="flex flex-row items-center justify-end text-sm font-medium flex-shrink-0 mt-1 p-4 gap-1.5">
<div class="flex-shrink-0">
<Tooltip content={$i18n.t('Add')}>
<button
class="px-3.5 py-2 bg-black text-white dark:bg-white dark:text-black transition rounded-full"
type="submit"
>
{$i18n.t('Add')}
</button>
</Tooltip>
</div>
</div>
</form>
</div>
</Modal>

View File

@ -411,6 +411,7 @@
"Export Tools": "",
"External Models": "",
"Failed to add file.": "",
"Failed to add video to knowledge base.":"",
"Failed to create API Key.": "",
"Failed to read clipboard contents": "",
"Failed to save models configuration": "",
@ -1061,5 +1062,6 @@
"Your account status is currently pending activation.": "",
"Your entire contribution will go directly to the plugin developer; Open WebUI does not take any percentage. However, the chosen funding platform might have its own fees.": "",
"Youtube": "",
"Youtube Loader Settings": ""
"Youtube Loader Settings": "",
"YouTube video processed successfully": ""
}