feat: Update CitationsModal to use source_url for YouTube documents and file_id for others

Juanan Pereira 2024-12-31 12:47:32 +01:00
parent ba3fe33ef8
commit 49f462163a
8 changed files with 64 additions and 49 deletions

View File

@@ -78,6 +78,7 @@ class YoutubeLoader:
         # First try using YouTube Data API v3 if available
         try:
             from open_webui.config import YOUTUBE_API_KEY
+
             if YOUTUBE_API_KEY:
                 url = f"https://www.googleapis.com/youtube/v3/videos?id={self.video_id}&key={YOUTUBE_API_KEY}&part=snippet"
                 response = requests.get(url)
@@ -93,7 +94,8 @@ class YoutubeLoader:
             response = requests.get(url)
             if response.status_code == 200:
                 import re
-                title_match = re.search(r'<title>(.+?)</title>', response.text)
+
+                title_match = re.search(r"<title>(.+?)</title>", response.text)
                 if title_match:
                     title = title_match.group(1)
                     return title
@@ -139,10 +141,14 @@ class YoutubeLoader:
             transcript = transcript_list.find_transcript(self.language)
         except NoTranscriptFound:
             # Fallback: try to get any available transcript
-            available_transcripts = list(transcript_list._generated_transcripts.values())
+            available_transcripts = list(
+                transcript_list._generated_transcripts.values()
+            )
             if available_transcripts:
                 transcript = available_transcripts[0]
-                log.info(f"Using first available transcript in language: {transcript.language_code}")
+                log.info(
+                    f"Using first available transcript in language: {transcript.language_code}"
+                )
             else:
                 log.error("No transcripts found for video")
                 return []
@@ -168,20 +174,22 @@ class YoutubeLoader:
                 full_text += text + " "
                 end_char = len(full_text)
-                timestamp_map.append({
-                    "start": start_char,
-                    "end": end_char,
-                    "time": piece["start"],
-                    "duration": piece["duration"]
-                })
+                timestamp_map.append(
+                    {
+                        "start": start_char,
+                        "end": end_char,
+                        "time": piece["start"],
+                        "duration": piece["duration"],
+                    }
+                )
 
         # Create a single document that will be split by Langchain's text splitter
         doc = Document(
             page_content=full_text.strip(),
             metadata={
                 **self._metadata,
-                "timestamp_map": timestamp_map  # Store timestamp mapping in metadata
-            }
+                "timestamp_map": timestamp_map,  # Store timestamp mapping in metadata
+            },
         )
 
         return [doc]
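
Note on the hunk above: the loader concatenates every transcript piece into one string and records, per piece, which character span it occupies and when it occurs in the video. A minimal self-contained sketch of that mapping (toy transcript; the piece shape matches youtube-transcript-api's output, all values invented):

pieces = [
    {"text": "hello world", "start": 0.0, "duration": 2.5},
    {"text": "second sentence", "start": 2.5, "duration": 3.0},
]

full_text = ""
timestamp_map = []
for piece in pieces:
    start_char = len(full_text)
    full_text += piece["text"] + " "
    end_char = len(full_text)
    timestamp_map.append(
        {
            "start": start_char,  # character offset where this piece begins
            "end": end_char,  # character offset just past its trailing space
            "time": piece["start"],  # seconds into the video
            "duration": piece["duration"],
        }
    )

print(timestamp_map[1])
# {'start': 12, 'end': 28, 'time': 2.5, 'duration': 3.0}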

View File

@@ -152,15 +152,15 @@ def get_rf(
         raise Exception(ERROR_MESSAGES.DEFAULT("CrossEncoder error"))
     return rf
 
 
 def add_timestamp_to_youtube_url(url: str, timestamp: int) -> str:
     parsed = urlparse(url)
     query_dict = parse_qs(parsed.query)
-    query_dict['t'] = [str(timestamp)]
+    query_dict["t"] = [str(timestamp)]
     new_query = urlencode(query_dict, doseq=True)
     return urlunparse(parsed._replace(query=new_query))
 
 
 ##########################################
 #
 # API routes
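
The helper above just rewrites the URL's query string. A quick self-contained run, with the body copied from the hunk (the video URL and offset are invented):

from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

def add_timestamp_to_youtube_url(url: str, timestamp: int) -> str:
    parsed = urlparse(url)
    query_dict = parse_qs(parsed.query)
    query_dict["t"] = [str(timestamp)]  # overwrites any existing t= parameter
    new_query = urlencode(query_dict, doseq=True)
    return urlunparse(parsed._replace(query=new_query))

print(add_timestamp_to_youtube_url("https://www.youtube.com/watch?v=abc123", 93))
# https://www.youtube.com/watch?v=abc123&t=93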
@@ -662,7 +662,9 @@ async def update_query_settings(
 ####################################
 
 
-def interpolate_timestamp(chunk_start: int, chunk_end: int, timestamp_map: List[dict]) -> Tuple[float, float]:
+def interpolate_timestamp(
+    chunk_start: int, chunk_end: int, timestamp_map: List[dict]
+) -> Tuple[float, float]:
     """
     Find the appropriate timestamp for a chunk based on its character position
     Returns (start_time, end_time) as floats in seconds
@@ -675,7 +677,8 @@ def interpolate_timestamp(chunk_start: int, chunk_end: int, timestamp_map: List[
     else:
         # If not found, use the closest previous timestamp
         start_time = min(
-            [e["time"] for e in timestamp_map if e["start"] <= chunk_start], default=0)
+            [e["time"] for e in timestamp_map if e["start"] <= chunk_start], default=0
+        )
 
     # Find the timestamp entry that contains the end of our chunk
     for entry in reversed(timestamp_map):
@@ -684,11 +687,14 @@ def interpolate_timestamp(chunk_start: int, chunk_end: int, timestamp_map: List[
             break
     else:
         # If not found, use the closest next timestamp
-        end_time = max([e["time"] + e["duration"]
-                        for e in timestamp_map if e["end"] >= chunk_end], default=start_time)
+        end_time = max(
+            [e["time"] + e["duration"] for e in timestamp_map if e["end"] >= chunk_end],
+            default=start_time,
+        )
 
     return start_time, end_time
 
+
 def save_docs_to_vector_db(
     request: Request,
     docs,
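
To see what the interpolation does on the toy map from the loader sketch above, here is a simplified stand-in that mirrors only the min/max fallback branches shown in these hunks (the real function also scans for the entry that actually contains each chunk boundary; for this input both paths agree):

from typing import List, Tuple

timestamp_map = [
    {"start": 0, "end": 12, "time": 0.0, "duration": 2.5},
    {"start": 12, "end": 28, "time": 2.5, "duration": 3.0},
]

def interpolate(chunk_start: int, chunk_end: int, timestamp_map: List[dict]) -> Tuple[float, float]:
    # Earliest piece starting at/before the chunk's first character
    start_time = min(
        [e["time"] for e in timestamp_map if e["start"] <= chunk_start], default=0
    )
    # Latest piece end covering the chunk's last character
    end_time = max(
        [e["time"] + e["duration"] for e in timestamp_map if e["end"] >= chunk_end],
        default=start_time,
    )
    return start_time, end_time

print(interpolate(5, 20, timestamp_map))  # (0.0, 5.5)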
@@ -733,12 +739,12 @@ def save_docs_to_vector_db(
     if split:
         # Check if this is a YouTube document by looking at the first doc's metadata
-        is_youtube = (len(docs) == 1 and
-                      docs[0].metadata.get("type") == "youtube")
+        is_youtube = len(docs) == 1 and docs[0].metadata.get("type") == "youtube"
 
         # Store timestamp_map before splitting if it's a YouTube document
-        original_timestamp_map = docs[0].metadata.get(
-            "timestamp_map") if is_youtube else None
+        original_timestamp_map = (
+            docs[0].metadata.get("timestamp_map") if is_youtube else None
+        )
 
         if request.app.state.config.TEXT_SPLITTER in ["", "character"]:
             text_splitter = RecursiveCharacterTextSplitter(
@@ -770,15 +776,17 @@ def save_docs_to_vector_db(
                 end_index = start_index + len(doc.page_content)
                 start_time, end_time = interpolate_timestamp(
-                    start_index,
-                    end_index,
-                    original_timestamp_map
+                    start_index, end_index, original_timestamp_map
                 )
-                doc.metadata.update({
-                    "start_time": start_time,
-                    "source_url": add_timestamp_to_youtube_url(doc.metadata['source_url'], int(start_time))
-                })
+                doc.metadata.update(
+                    {
+                        "start_time": start_time,
+                        "source_url": add_timestamp_to_youtube_url(
+                            doc.metadata["source_url"], int(start_time)
+                        ),
+                    }
+                )
 
                 # Remove the timestamp_map from individual chunks
                 doc.metadata.pop("timestamp_map", None)
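
After this pass each split chunk carries its own start time and a ready deep link, while the bulky timestamp_map stays out of the vector store. Roughly what a stored chunk's metadata ends up looking like (all values invented, for illustration only):

chunk_metadata = {
    "type": "youtube",
    "name": "Example video title",  # invented
    "file_id": "f3a9c2e1",  # invented
    "start_time": 93.0,  # seconds, from interpolate_timestamp
    "source_url": "https://www.youtube.com/watch?v=abc123&t=93",  # invented video id
    # "timestamp_map" was popped above and is not stored per chunk
}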
@@ -915,10 +923,7 @@ def process_file(
             "status": True,
             "collection_name": form_data.collection_name,
             "content": content,
-            "file": {
-                "id": file.id,
-                "meta": metadata
-            }
+            "file": {"id": file.id, "meta": metadata},
         }
 
     collection_name = form_data.collection_name
@@ -1168,11 +1173,9 @@ def process_youtube_video(
                     "size": len(content),
                     "source": form_data.url,
                     "source_url": add_timestamp_to_youtube_url(form_data.url, 0),
-                    "type": "youtube"
+                    "type": "youtube",
                 },
-                "data": {
-                    "content": content
-                }
+                "data": {"content": content},
             }
         ),
     )
@@ -1185,7 +1188,7 @@ def process_youtube_video(
             "type": "youtube",
             "name": video_title,
             "file_id": file_id,
-            "created_by": user.id if user else None
+            "created_by": user.id if user else None,
         }
 
         # Update all docs with the file metadata
@@ -1194,7 +1197,9 @@ def process_youtube_video(
             # Debug log
             log.info(f"Document metadata before saving: {doc.metadata}")
 
-        save_docs_to_vector_db(request, docs, collection_name, overwrite=False, add=True)
+        save_docs_to_vector_db(
+            request, docs, collection_name, overwrite=False, add=True
+        )
 
         return {
             "status": True,
@@ -1205,7 +1210,7 @@ def process_youtube_video(
                 "data": {
                     "content": content,
                 },
-                "meta": file_metadata
+                "meta": file_metadata,
             },
         }
     except Exception as e:
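
Downstream, the frontend picks the citation link from this metadata. A Python transliteration of the Svelte ternary in the next file, for readability only (the trailing ellipsis stands for fallback branches cut off at the end of this excerpt):

def citation_href(document: dict, webui_api_base_url: str):
    # Key names follow the metadata assembled in process_youtube_video above
    meta = document.get("metadata") or {}
    if meta.get("type") == "youtube":
        return meta.get("source_url")  # deep link already carrying &t=<seconds>
    if meta.get("file_id"):
        href = f"{webui_api_base_url}/files/{meta['file_id']}/content"
        if meta.get("page") is not None:
            href += f"#page={meta['page'] + 1}"
        return href
    source_url = (document.get("source") or {}).get("url", "")
    if "http" in source_url:
        return source_url
    ...  # remaining fallbacks not shown in this excerpt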

View File

@@ -91,7 +91,9 @@
 					<div class="text-sm dark:text-gray-400 flex items-center gap-2 w-fit">
 						<a
 							class="hover:text-gray-500 hover:dark:text-gray-100 underline flex-grow"
-							href={document?.metadata?.file_id
+							href={document?.metadata?.type === 'youtube'
+								? document?.metadata?.source_url
+								: document?.metadata?.file_id
 								? `${WEBUI_API_BASE_URL}/files/${document?.metadata?.file_id}/content${document?.metadata?.page !== undefined ? `#page=${document.metadata.page + 1}` : ''}`
 								: document.source?.url?.includes('http')
 									? document.source.url