diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py
index 6fb4d2b43..63262a7ff 100644
--- a/backend/open_webui/retrieval/loaders/youtube.py
+++ b/backend/open_webui/retrieval/loaders/youtube.py
@@ -78,6 +78,7 @@ class YoutubeLoader:
# First try using YouTube Data API v3 if available
try:
from open_webui.config import YOUTUBE_API_KEY
+
if YOUTUBE_API_KEY:
url = f"https://www.googleapis.com/youtube/v3/videos?id={self.video_id}&key={YOUTUBE_API_KEY}&part=snippet"
response = requests.get(url)
@@ -93,7 +94,8 @@ class YoutubeLoader:
response = requests.get(url)
if response.status_code == 200:
import re
- title_match = re.search(r'<title>(.+?)</title>', response.text)
+
+ title_match = re.search(r"<title>(.+?)</title>", response.text)
if title_match:
title = title_match.group(1)
return title
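# Review note, a minimal sketch of the watch-page fallback above, assuming
# YouTube still serves the video title in a literal <title> tag (typically
# suffixed with " - YouTube"); the video id below is a placeholder.
import re
import requests

html = requests.get("https://www.youtube.com/watch?v=VIDEO_ID").text
match = re.search(r"<title>(.+?)</title>", html)
if match:
    print(match.group(1))  # e.g. "Some Video Title - YouTube"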
@@ -139,10 +141,14 @@ class YoutubeLoader:
transcript = transcript_list.find_transcript(self.language)
except NoTranscriptFound:
# Fallback: try to get any available transcript
- available_transcripts = list(transcript_list._generated_transcripts.values())
+ available_transcripts = list(
+ transcript_list._generated_transcripts.values()
+ )
if available_transcripts:
transcript = available_transcripts[0]
- log.info(f"Using first available transcript in language: {transcript.language_code}")
+ log.info(
+ f"Using first available transcript in language: {transcript.language_code}"
+ )
else:
log.error("No transcripts found for video")
return []
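# Review note: transcript_list._generated_transcripts is a private attribute
# of youtube-transcript-api and may break across releases. Assuming the same
# (pre-1.0) interface the loader already uses, iterating the TranscriptList is
# the public way to reach a fallback transcript; a hedged sketch with a
# placeholder video id:
from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi

transcript_list = YouTubeTranscriptApi.list_transcripts("VIDEO_ID")
try:
    transcript = transcript_list.find_transcript(["en"])
except NoTranscriptFound:
    # Fall back to whatever transcript exists, generated or manually created.
    transcript = next(iter(transcript_list))
pieces = transcript.fetch()  # [{"text": ..., "start": ..., "duration": ...}, ...]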
@@ -161,27 +167,29 @@ class YoutubeLoader:
# Combine pieces into a single text while tracking timestamp positions
full_text = ""
timestamp_map = []
-
+
for piece in transcript_pieces:
start_char = len(full_text)
text = piece["text"].strip()
full_text += text + " "
end_char = len(full_text)
-
- timestamp_map.append({
- "start": start_char,
- "end": end_char,
- "time": piece["start"],
- "duration": piece["duration"]
- })
+
+ timestamp_map.append(
+ {
+ "start": start_char,
+ "end": end_char,
+ "time": piece["start"],
+ "duration": piece["duration"],
+ }
+ )
# Create a single document that will be split by Langchain's text splitter
doc = Document(
page_content=full_text.strip(),
metadata={
**self._metadata,
- "timestamp_map": timestamp_map # Store timestamp mapping in metadata
- }
+ "timestamp_map": timestamp_map, # Store timestamp mapping in metadata
+ },
)
return [doc]
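# Worked example of the mapping built above: each transcript piece is recorded
# with its character span in the concatenated text, so chunk offsets can later
# be translated back into playback times after splitting.
pieces = [
    {"text": "hello", "start": 0.0, "duration": 1.5},
    {"text": "world", "start": 1.5, "duration": 1.2},
]
full_text, timestamp_map = "", []
for piece in pieces:
    start_char = len(full_text)
    full_text += piece["text"].strip() + " "
    timestamp_map.append(
        {
            "start": start_char,
            "end": len(full_text),
            "time": piece["start"],
            "duration": piece["duration"],
        }
    )
# timestamp_map == [
#     {"start": 0, "end": 6, "time": 0.0, "duration": 1.5},
#     {"start": 6, "end": 12, "time": 1.5, "duration": 1.2},
# ]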
diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py
index 721592f55..ac80ca667 100644
--- a/backend/open_webui/routers/retrieval.py
+++ b/backend/open_webui/routers/retrieval.py
@@ -152,15 +152,15 @@ def get_rf(
raise Exception(ERROR_MESSAGES.DEFAULT("CrossEncoder error"))
return rf
+
def add_timestamp_to_youtube_url(url: str, timestamp: int) -> str:
parsed = urlparse(url)
query_dict = parse_qs(parsed.query)
- query_dict['t'] = [str(timestamp)]
+ query_dict["t"] = [str(timestamp)]
new_query = urlencode(query_dict, doseq=True)
return urlunparse(parsed._replace(query=new_query))
-
##########################################
#
# API routes
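# Usage of the helper above. parse_qs/urlencode round-trip the query string,
# so an existing t parameter is replaced rather than duplicated (the video id
# is a placeholder):
add_timestamp_to_youtube_url("https://www.youtube.com/watch?v=abc123", 90)
# -> "https://www.youtube.com/watch?v=abc123&t=90"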
@@ -662,7 +662,9 @@ async def update_query_settings(
####################################
-def interpolate_timestamp(chunk_start: int, chunk_end: int, timestamp_map: List[dict]) -> Tuple[float, float]:
+def interpolate_timestamp(
+ chunk_start: int, chunk_end: int, timestamp_map: List[dict]
+) -> Tuple[float, float]:
"""
Find the appropriate timestamp for a chunk based on its character position
Returns (start_time, end_time) as floats in seconds
@@ -675,7 +677,8 @@ def interpolate_timestamp(chunk_start: int, chunk_end: int, timestamp_map: List[
else:
# If not found, use the closest previous timestamp
start_time = min(
- [e["time"] for e in timestamp_map if e["start"] <= chunk_start], default=0)
+ [e["time"] for e in timestamp_map if e["start"] <= chunk_start], default=0
+ )
# Find the timestamp entry that contains the end of our chunk
for entry in reversed(timestamp_map):
@@ -684,11 +687,14 @@ def interpolate_timestamp(chunk_start: int, chunk_end: int, timestamp_map: List[
break
else:
# If not found, use the closest next timestamp
- end_time = max([e["time"] + e["duration"]
- for e in timestamp_map if e["end"] >= chunk_end], default=start_time)
+ end_time = max(
+ [e["time"] + e["duration"] for e in timestamp_map if e["end"] >= chunk_end],
+ default=start_time,
+ )
return start_time, end_time
+
def save_docs_to_vector_db(
request: Request,
docs,
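# Worked example for interpolate_timestamp, reusing the two-piece map from the
# loader diff: a chunk spanning characters 0..12 covers both pieces, so it
# resolves to the first piece's start time and the last piece's end time
# (0.0 and 1.5 + 1.2), under both the containment checks and the fallbacks.
timestamp_map = [
    {"start": 0, "end": 6, "time": 0.0, "duration": 1.5},
    {"start": 6, "end": 12, "time": 1.5, "duration": 1.2},
]
interpolate_timestamp(0, 12, timestamp_map)  # -> (0.0, 2.7)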
@@ -733,12 +739,12 @@ def save_docs_to_vector_db(
if split:
# Check if this is a YouTube document by looking at the first doc's metadata
- is_youtube = (len(docs) == 1 and
- docs[0].metadata.get("type") == "youtube")
+ is_youtube = len(docs) == 1 and docs[0].metadata.get("type") == "youtube"
# Store timestamp_map before splitting if it's a YouTube document
- original_timestamp_map = docs[0].metadata.get(
- "timestamp_map") if is_youtube else None
+ original_timestamp_map = (
+ docs[0].metadata.get("timestamp_map") if is_youtube else None
+ )
if request.app.state.config.TEXT_SPLITTER in ["", "character"]:
text_splitter = RecursiveCharacterTextSplitter(
@@ -770,15 +776,17 @@ def save_docs_to_vector_db(
end_index = start_index + len(doc.page_content)
start_time, end_time = interpolate_timestamp(
- start_index,
- end_index,
- original_timestamp_map
+ start_index, end_index, original_timestamp_map
)
- doc.metadata.update({
- "start_time": start_time,
- "source_url": add_timestamp_to_youtube_url(doc.metadata['source_url'], int(start_time))
- })
+ doc.metadata.update(
+ {
+ "start_time": start_time,
+ "source_url": add_timestamp_to_youtube_url(
+ doc.metadata["source_url"], int(start_time)
+ ),
+ }
+ )
# Remove the timestamp_map from individual chunks
doc.metadata.pop("timestamp_map", None)
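# Review note: the loop above assumes each chunk carries its character offset
# (start_index) into the original transcript text. One way to obtain that, if
# the splitter is configured for it, is langchain's add_start_index option,
# which stores the offset in chunk metadata; a sketch, not necessarily how
# this PR computes it:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=100, add_start_index=True
)
chunks = splitter.split_documents(docs)
offsets = [chunk.metadata["start_index"] for chunk in chunks]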
@@ -915,10 +923,7 @@ def process_file(
"status": True,
"collection_name": form_data.collection_name,
"content": content,
- "file": {
- "id": file.id,
- "meta": metadata
- }
+ "file": {"id": file.id, "meta": metadata},
}
collection_name = form_data.collection_name
@@ -1148,7 +1153,7 @@ def process_youtube_video(
content = " ".join([doc.page_content for doc in docs])
log.debug(f"text_content: {content}")
- # Get video title from metadata or fallback to URL
+ # Get video title from metadata or fallback to URL
video_title = docs[0].metadata.get("title", form_data.url)
# Create a unique file ID for this video
@@ -1168,11 +1173,9 @@ def process_youtube_video(
"size": len(content),
"source": form_data.url,
"source_url": add_timestamp_to_youtube_url(form_data.url, 0),
- "type": "youtube"
+ "type": "youtube",
},
- "data": {
- "content": content
- }
+ "data": {"content": content},
}
),
)
@@ -1185,7 +1188,7 @@ def process_youtube_video(
"type": "youtube",
"name": video_title,
"file_id": file_id,
- "created_by": user.id if user else None
+ "created_by": user.id if user else None,
}
# Update all docs with the file metadata
@@ -1194,18 +1197,20 @@ def process_youtube_video(
# Debug log
log.info(f"Document metadata before saving: {doc.metadata}")
- save_docs_to_vector_db(request, docs, collection_name, overwrite=False, add=True)
+ save_docs_to_vector_db(
+ request, docs, collection_name, overwrite=False, add=True
+ )
return {
"status": True,
"collection_name": collection_name,
"id": file_id, # Return the file ID directly
- "filename": video_title,
+ "filename": video_title,
"file": {
"data": {
"content": content,
},
- "meta": file_metadata
+ "meta": file_metadata,
},
}
except Exception as e:
diff --git a/package-lock.json b/package-lock.json
index 3ae6220e3..f228148cd 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -12589,4 +12589,4 @@
}
}
}
-}
+}
\ No newline at end of file
diff --git a/package.json b/package.json
index fec9eb728..4da6fa88c 100644
--- a/package.json
+++ b/package.json
@@ -110,4 +110,4 @@
"node": ">=18.13.0 <=22.x.x",
"npm": ">=6.0.0"
}
-}
+}
\ No newline at end of file
diff --git a/src/lib/components/chat/Messages/CitationsModal.svelte b/src/lib/components/chat/Messages/CitationsModal.svelte
index e81daada9..fcb1f1dac 100644
--- a/src/lib/components/chat/Messages/CitationsModal.svelte
+++ b/src/lib/components/chat/Messages/CitationsModal.svelte
@@ -91,7 +91,9 @@