mirror of
https://github.com/open-webui/open-webui
synced 2025-01-19 01:06:45 +00:00
feat: Update CitationsModal to use source_url for YouTube documents and file_id for others
This commit is contained in:
parent
ba3fe33ef8
commit
49f462163a
@ -78,6 +78,7 @@ class YoutubeLoader:
|
||||
# First try using YouTube Data API v3 if available
|
||||
try:
|
||||
from open_webui.config import YOUTUBE_API_KEY
|
||||
|
||||
if YOUTUBE_API_KEY:
|
||||
url = f"https://www.googleapis.com/youtube/v3/videos?id={self.video_id}&key={YOUTUBE_API_KEY}&part=snippet"
|
||||
response = requests.get(url)
|
||||
@ -93,7 +94,8 @@ class YoutubeLoader:
|
||||
response = requests.get(url)
|
||||
if response.status_code == 200:
|
||||
import re
|
||||
title_match = re.search(r'<title>(.+?)</title>', response.text)
|
||||
|
||||
title_match = re.search(r"<title>(.+?)</title>", response.text)
|
||||
if title_match:
|
||||
title = title_match.group(1)
|
||||
return title
|
||||
@ -139,10 +141,14 @@ class YoutubeLoader:
|
||||
transcript = transcript_list.find_transcript(self.language)
|
||||
except NoTranscriptFound:
|
||||
# Fallback: try to get any available transcript
|
||||
available_transcripts = list(transcript_list._generated_transcripts.values())
|
||||
available_transcripts = list(
|
||||
transcript_list._generated_transcripts.values()
|
||||
)
|
||||
if available_transcripts:
|
||||
transcript = available_transcripts[0]
|
||||
log.info(f"Using first available transcript in language: {transcript.language_code}")
|
||||
log.info(
|
||||
f"Using first available transcript in language: {transcript.language_code}"
|
||||
)
|
||||
else:
|
||||
log.error("No transcripts found for video")
|
||||
return []
|
||||
@ -161,27 +167,29 @@ class YoutubeLoader:
|
||||
# Combine pieces into a single text while tracking timestamp positions
|
||||
full_text = ""
|
||||
timestamp_map = []
|
||||
|
||||
|
||||
for piece in transcript_pieces:
|
||||
start_char = len(full_text)
|
||||
text = piece["text"].strip()
|
||||
full_text += text + " "
|
||||
end_char = len(full_text)
|
||||
|
||||
timestamp_map.append({
|
||||
"start": start_char,
|
||||
"end": end_char,
|
||||
"time": piece["start"],
|
||||
"duration": piece["duration"]
|
||||
})
|
||||
|
||||
timestamp_map.append(
|
||||
{
|
||||
"start": start_char,
|
||||
"end": end_char,
|
||||
"time": piece["start"],
|
||||
"duration": piece["duration"],
|
||||
}
|
||||
)
|
||||
|
||||
# Create a single document that will be split by Langchain's text splitter
|
||||
doc = Document(
|
||||
page_content=full_text.strip(),
|
||||
metadata={
|
||||
**self._metadata,
|
||||
"timestamp_map": timestamp_map # Store timestamp mapping in metadata
|
||||
}
|
||||
"timestamp_map": timestamp_map, # Store timestamp mapping in metadata
|
||||
},
|
||||
)
|
||||
|
||||
return [doc]
|
||||
|
@ -152,15 +152,15 @@ def get_rf(
|
||||
raise Exception(ERROR_MESSAGES.DEFAULT("CrossEncoder error"))
|
||||
return rf
|
||||
|
||||
|
||||
def add_timestamp_to_youtube_url(url: str, timestamp: int) -> str:
    """Return *url* with its ``t`` query parameter set to *timestamp*.

    Any existing ``t`` parameter is replaced; every other component of the
    URL (scheme, host, path, remaining query parameters, fragment) is kept.

    Args:
        url: The video URL to annotate.
        timestamp: Offset written as the value of ``t``.

    Returns:
        The rebuilt URL string.
    """
    parsed = urlparse(url)
    # keep_blank_values=True: parse_qs drops parameters with empty values by
    # default, which would silently strip e.g. "&feature=" from the URL.
    query_dict = parse_qs(parsed.query, keep_blank_values=True)
    query_dict["t"] = [str(timestamp)]
    # doseq=True expands the list values produced by parse_qs back into
    # repeated key=value pairs instead of encoding the list repr.
    new_query = urlencode(query_dict, doseq=True)
    return urlunparse(parsed._replace(query=new_query))
|
||||
|
||||
|
||||
|
||||
##########################################
|
||||
#
|
||||
# API routes
|
||||
@ -662,7 +662,9 @@ async def update_query_settings(
|
||||
####################################
|
||||
|
||||
|
||||
def interpolate_timestamp(chunk_start: int, chunk_end: int, timestamp_map: List[dict]) -> Tuple[float, float]:
|
||||
def interpolate_timestamp(
|
||||
chunk_start: int, chunk_end: int, timestamp_map: List[dict]
|
||||
) -> Tuple[float, float]:
|
||||
"""
|
||||
Find the appropriate timestamp for a chunk based on its character position
|
||||
Returns (start_time, end_time) as floats in seconds
|
||||
@ -675,7 +677,8 @@ def interpolate_timestamp(chunk_start: int, chunk_end: int, timestamp_map: List[
|
||||
else:
|
||||
# If not found, use the closest previous timestamp
|
||||
start_time = min(
|
||||
[e["time"] for e in timestamp_map if e["start"] <= chunk_start], default=0)
|
||||
[e["time"] for e in timestamp_map if e["start"] <= chunk_start], default=0
|
||||
)
|
||||
|
||||
# Find the timestamp entry that contains the end of our chunk
|
||||
for entry in reversed(timestamp_map):
|
||||
@ -684,11 +687,14 @@ def interpolate_timestamp(chunk_start: int, chunk_end: int, timestamp_map: List[
|
||||
break
|
||||
else:
|
||||
# If not found, use the closest next timestamp
|
||||
end_time = max([e["time"] + e["duration"]
|
||||
for e in timestamp_map if e["end"] >= chunk_end], default=start_time)
|
||||
end_time = max(
|
||||
[e["time"] + e["duration"] for e in timestamp_map if e["end"] >= chunk_end],
|
||||
default=start_time,
|
||||
)
|
||||
|
||||
return start_time, end_time
|
||||
|
||||
|
||||
def save_docs_to_vector_db(
|
||||
request: Request,
|
||||
docs,
|
||||
@ -733,12 +739,12 @@ def save_docs_to_vector_db(
|
||||
|
||||
if split:
|
||||
# Check if this is a YouTube document by looking at the first doc's metadata
|
||||
is_youtube = (len(docs) == 1 and
|
||||
docs[0].metadata.get("type") == "youtube")
|
||||
is_youtube = len(docs) == 1 and docs[0].metadata.get("type") == "youtube"
|
||||
|
||||
# Store timestamp_map before splitting if it's a YouTube document
|
||||
original_timestamp_map = docs[0].metadata.get(
|
||||
"timestamp_map") if is_youtube else None
|
||||
original_timestamp_map = (
|
||||
docs[0].metadata.get("timestamp_map") if is_youtube else None
|
||||
)
|
||||
|
||||
if request.app.state.config.TEXT_SPLITTER in ["", "character"]:
|
||||
text_splitter = RecursiveCharacterTextSplitter(
|
||||
@ -770,15 +776,17 @@ def save_docs_to_vector_db(
|
||||
end_index = start_index + len(doc.page_content)
|
||||
|
||||
start_time, end_time = interpolate_timestamp(
|
||||
start_index,
|
||||
end_index,
|
||||
original_timestamp_map
|
||||
start_index, end_index, original_timestamp_map
|
||||
)
|
||||
|
||||
doc.metadata.update({
|
||||
"start_time": start_time,
|
||||
"source_url": add_timestamp_to_youtube_url(doc.metadata['source_url'], int(start_time))
|
||||
})
|
||||
doc.metadata.update(
|
||||
{
|
||||
"start_time": start_time,
|
||||
"source_url": add_timestamp_to_youtube_url(
|
||||
doc.metadata["source_url"], int(start_time)
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
# Remove the timestamp_map from individual chunks
|
||||
doc.metadata.pop("timestamp_map", None)
|
||||
@ -915,10 +923,7 @@ def process_file(
|
||||
"status": True,
|
||||
"collection_name": form_data.collection_name,
|
||||
"content": content,
|
||||
"file": {
|
||||
"id": file.id,
|
||||
"meta": metadata
|
||||
}
|
||||
"file": {"id": file.id, "meta": metadata},
|
||||
}
|
||||
|
||||
collection_name = form_data.collection_name
|
||||
@ -1148,7 +1153,7 @@ def process_youtube_video(
|
||||
content = " ".join([doc.page_content for doc in docs])
|
||||
log.debug(f"text_content: {content}")
|
||||
|
||||
# Get video title from metadata or fallback to URL
|
||||
# Get video title from metadata or fallback to URL
|
||||
video_title = docs[0].metadata.get("title", form_data.url)
|
||||
|
||||
# Create a unique file ID for this video
|
||||
@ -1168,11 +1173,9 @@ def process_youtube_video(
|
||||
"size": len(content),
|
||||
"source": form_data.url,
|
||||
"source_url": add_timestamp_to_youtube_url(form_data.url, 0),
|
||||
"type": "youtube"
|
||||
"type": "youtube",
|
||||
},
|
||||
"data": {
|
||||
"content": content
|
||||
}
|
||||
"data": {"content": content},
|
||||
}
|
||||
),
|
||||
)
|
||||
@ -1185,7 +1188,7 @@ def process_youtube_video(
|
||||
"type": "youtube",
|
||||
"name": video_title,
|
||||
"file_id": file_id,
|
||||
"created_by": user.id if user else None
|
||||
"created_by": user.id if user else None,
|
||||
}
|
||||
|
||||
# Update all docs with the file metadata
|
||||
@ -1194,18 +1197,20 @@ def process_youtube_video(
|
||||
# Debug log
|
||||
log.info(f"Document metadata before saving: {doc.metadata}")
|
||||
|
||||
save_docs_to_vector_db(request, docs, collection_name, overwrite=False, add=True)
|
||||
save_docs_to_vector_db(
|
||||
request, docs, collection_name, overwrite=False, add=True
|
||||
)
|
||||
|
||||
return {
|
||||
"status": True,
|
||||
"collection_name": collection_name,
|
||||
"id": file_id, # Return the file ID directly
|
||||
"filename": video_title,
|
||||
"filename": video_title,
|
||||
"file": {
|
||||
"data": {
|
||||
"content": content,
|
||||
},
|
||||
"meta": file_metadata
|
||||
"meta": file_metadata,
|
||||
},
|
||||
}
|
||||
except Exception as e:
|
||||
|
2
package-lock.json
generated
2
package-lock.json
generated
@ -12589,4 +12589,4 @@
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -110,4 +110,4 @@
|
||||
"node": ">=18.13.0 <=22.x.x",
|
||||
"npm": ">=6.0.0"
|
||||
}
|
||||
}
|
||||
}
|
@ -91,7 +91,9 @@
|
||||
<div class="text-sm dark:text-gray-400 flex items-center gap-2 w-fit">
|
||||
<a
|
||||
class="hover:text-gray-500 hover:dark:text-gray-100 underline flex-grow"
|
||||
href={document?.metadata?.file_id
|
||||
href={document?.metadata?.type === 'youtube'
|
||||
? document?.metadata?.source_url
|
||||
: document?.metadata?.file_id
|
||||
? `${WEBUI_API_BASE_URL}/files/${document?.metadata?.file_id}/content${document?.metadata?.page !== undefined ? `#page=${document.metadata.page + 1}` : ''}`
|
||||
: document.source?.url?.includes('http')
|
||||
? document.source.url
|
||||
|
@ -1062,4 +1062,4 @@
|
||||
"Your entire contribution will go directly to the plugin developer; Open WebUI does not take any percentage. However, the chosen funding platform might have its own fees.": "L'intégralité de votre contribution ira directement au développeur du plugin ; Open WebUI ne prend aucun pourcentage. Cependant, la plateforme de financement choisie peut avoir ses propres frais.",
|
||||
"Youtube": "YouTube",
|
||||
"Youtube Loader Settings": "Paramètres de l'outil de téléchargement YouTube"
|
||||
}
|
||||
}
|
@ -1062,4 +1062,4 @@
|
||||
"Your entire contribution will go directly to the plugin developer; Open WebUI does not take any percentage. However, the chosen funding platform might have its own fees.": "Rachaidh do ranníocaíocht iomlán go díreach chuig an bhforbróir breiseán; Ní ghlacann Open WebUI aon chéatadán. Mar sin féin, d'fhéadfadh a tháillí féin a bheith ag an ardán maoinithe roghnaithe.",
|
||||
"Youtube": "Youtube",
|
||||
"Youtube Loader Settings": "Socruithe Luchtaire Youtube"
|
||||
}
|
||||
}
|
@ -1062,4 +1062,4 @@
|
||||
"Your entire contribution will go directly to the plugin developer; Open WebUI does not take any percentage. However, the chosen funding platform might have its own fees.": "您的全部捐款将直接给到插件开发者,Open WebUI 不会收取任何比例。但众筹平台可能会有服务费、抽成。",
|
||||
"Youtube": "YouTube",
|
||||
"Youtube Loader Settings": "YouTube 爬取设置"
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user