mirror of
https://github.com/open-webui/open-webui
synced 2025-01-19 09:16:44 +00:00
feat: Update CitationsModal to use source_url for YouTube documents and file_id for others
This commit is contained in:
parent
ba3fe33ef8
commit
49f462163a
@ -78,6 +78,7 @@ class YoutubeLoader:
|
|||||||
# First try using YouTube Data API v3 if available
|
# First try using YouTube Data API v3 if available
|
||||||
try:
|
try:
|
||||||
from open_webui.config import YOUTUBE_API_KEY
|
from open_webui.config import YOUTUBE_API_KEY
|
||||||
|
|
||||||
if YOUTUBE_API_KEY:
|
if YOUTUBE_API_KEY:
|
||||||
url = f"https://www.googleapis.com/youtube/v3/videos?id={self.video_id}&key={YOUTUBE_API_KEY}&part=snippet"
|
url = f"https://www.googleapis.com/youtube/v3/videos?id={self.video_id}&key={YOUTUBE_API_KEY}&part=snippet"
|
||||||
response = requests.get(url)
|
response = requests.get(url)
|
||||||
@ -93,7 +94,8 @@ class YoutubeLoader:
|
|||||||
response = requests.get(url)
|
response = requests.get(url)
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
import re
|
import re
|
||||||
title_match = re.search(r'<title>(.+?)</title>', response.text)
|
|
||||||
|
title_match = re.search(r"<title>(.+?)</title>", response.text)
|
||||||
if title_match:
|
if title_match:
|
||||||
title = title_match.group(1)
|
title = title_match.group(1)
|
||||||
return title
|
return title
|
||||||
@ -139,10 +141,14 @@ class YoutubeLoader:
|
|||||||
transcript = transcript_list.find_transcript(self.language)
|
transcript = transcript_list.find_transcript(self.language)
|
||||||
except NoTranscriptFound:
|
except NoTranscriptFound:
|
||||||
# Fallback: try to get any available transcript
|
# Fallback: try to get any available transcript
|
||||||
available_transcripts = list(transcript_list._generated_transcripts.values())
|
available_transcripts = list(
|
||||||
|
transcript_list._generated_transcripts.values()
|
||||||
|
)
|
||||||
if available_transcripts:
|
if available_transcripts:
|
||||||
transcript = available_transcripts[0]
|
transcript = available_transcripts[0]
|
||||||
log.info(f"Using first available transcript in language: {transcript.language_code}")
|
log.info(
|
||||||
|
f"Using first available transcript in language: {transcript.language_code}"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
log.error("No transcripts found for video")
|
log.error("No transcripts found for video")
|
||||||
return []
|
return []
|
||||||
@ -168,20 +174,22 @@ class YoutubeLoader:
|
|||||||
full_text += text + " "
|
full_text += text + " "
|
||||||
end_char = len(full_text)
|
end_char = len(full_text)
|
||||||
|
|
||||||
timestamp_map.append({
|
timestamp_map.append(
|
||||||
|
{
|
||||||
"start": start_char,
|
"start": start_char,
|
||||||
"end": end_char,
|
"end": end_char,
|
||||||
"time": piece["start"],
|
"time": piece["start"],
|
||||||
"duration": piece["duration"]
|
"duration": piece["duration"],
|
||||||
})
|
}
|
||||||
|
)
|
||||||
|
|
||||||
# Create a single document that will be split by Langchain's text splitter
|
# Create a single document that will be split by Langchain's text splitter
|
||||||
doc = Document(
|
doc = Document(
|
||||||
page_content=full_text.strip(),
|
page_content=full_text.strip(),
|
||||||
metadata={
|
metadata={
|
||||||
**self._metadata,
|
**self._metadata,
|
||||||
"timestamp_map": timestamp_map # Store timestamp mapping in metadata
|
"timestamp_map": timestamp_map, # Store timestamp mapping in metadata
|
||||||
}
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
return [doc]
|
return [doc]
|
||||||
|
@ -152,15 +152,15 @@ def get_rf(
|
|||||||
raise Exception(ERROR_MESSAGES.DEFAULT("CrossEncoder error"))
|
raise Exception(ERROR_MESSAGES.DEFAULT("CrossEncoder error"))
|
||||||
return rf
|
return rf
|
||||||
|
|
||||||
|
|
||||||
def add_timestamp_to_youtube_url(url: str, timestamp: int) -> str:
|
def add_timestamp_to_youtube_url(url: str, timestamp: int) -> str:
|
||||||
parsed = urlparse(url)
|
parsed = urlparse(url)
|
||||||
query_dict = parse_qs(parsed.query)
|
query_dict = parse_qs(parsed.query)
|
||||||
query_dict['t'] = [str(timestamp)]
|
query_dict["t"] = [str(timestamp)]
|
||||||
new_query = urlencode(query_dict, doseq=True)
|
new_query = urlencode(query_dict, doseq=True)
|
||||||
return urlunparse(parsed._replace(query=new_query))
|
return urlunparse(parsed._replace(query=new_query))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
##########################################
|
##########################################
|
||||||
#
|
#
|
||||||
# API routes
|
# API routes
|
||||||
@ -662,7 +662,9 @@ async def update_query_settings(
|
|||||||
####################################
|
####################################
|
||||||
|
|
||||||
|
|
||||||
def interpolate_timestamp(chunk_start: int, chunk_end: int, timestamp_map: List[dict]) -> Tuple[float, float]:
|
def interpolate_timestamp(
|
||||||
|
chunk_start: int, chunk_end: int, timestamp_map: List[dict]
|
||||||
|
) -> Tuple[float, float]:
|
||||||
"""
|
"""
|
||||||
Find the appropriate timestamp for a chunk based on its character position
|
Find the appropriate timestamp for a chunk based on its character position
|
||||||
Returns (start_time, end_time) as floats in seconds
|
Returns (start_time, end_time) as floats in seconds
|
||||||
@ -675,7 +677,8 @@ def interpolate_timestamp(chunk_start: int, chunk_end: int, timestamp_map: List[
|
|||||||
else:
|
else:
|
||||||
# If not found, use the closest previous timestamp
|
# If not found, use the closest previous timestamp
|
||||||
start_time = min(
|
start_time = min(
|
||||||
[e["time"] for e in timestamp_map if e["start"] <= chunk_start], default=0)
|
[e["time"] for e in timestamp_map if e["start"] <= chunk_start], default=0
|
||||||
|
)
|
||||||
|
|
||||||
# Find the timestamp entry that contains the end of our chunk
|
# Find the timestamp entry that contains the end of our chunk
|
||||||
for entry in reversed(timestamp_map):
|
for entry in reversed(timestamp_map):
|
||||||
@ -684,11 +687,14 @@ def interpolate_timestamp(chunk_start: int, chunk_end: int, timestamp_map: List[
|
|||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
# If not found, use the closest next timestamp
|
# If not found, use the closest next timestamp
|
||||||
end_time = max([e["time"] + e["duration"]
|
end_time = max(
|
||||||
for e in timestamp_map if e["end"] >= chunk_end], default=start_time)
|
[e["time"] + e["duration"] for e in timestamp_map if e["end"] >= chunk_end],
|
||||||
|
default=start_time,
|
||||||
|
)
|
||||||
|
|
||||||
return start_time, end_time
|
return start_time, end_time
|
||||||
|
|
||||||
|
|
||||||
def save_docs_to_vector_db(
|
def save_docs_to_vector_db(
|
||||||
request: Request,
|
request: Request,
|
||||||
docs,
|
docs,
|
||||||
@ -733,12 +739,12 @@ def save_docs_to_vector_db(
|
|||||||
|
|
||||||
if split:
|
if split:
|
||||||
# Check if this is a YouTube document by looking at the first doc's metadata
|
# Check if this is a YouTube document by looking at the first doc's metadata
|
||||||
is_youtube = (len(docs) == 1 and
|
is_youtube = len(docs) == 1 and docs[0].metadata.get("type") == "youtube"
|
||||||
docs[0].metadata.get("type") == "youtube")
|
|
||||||
|
|
||||||
# Store timestamp_map before splitting if it's a YouTube document
|
# Store timestamp_map before splitting if it's a YouTube document
|
||||||
original_timestamp_map = docs[0].metadata.get(
|
original_timestamp_map = (
|
||||||
"timestamp_map") if is_youtube else None
|
docs[0].metadata.get("timestamp_map") if is_youtube else None
|
||||||
|
)
|
||||||
|
|
||||||
if request.app.state.config.TEXT_SPLITTER in ["", "character"]:
|
if request.app.state.config.TEXT_SPLITTER in ["", "character"]:
|
||||||
text_splitter = RecursiveCharacterTextSplitter(
|
text_splitter = RecursiveCharacterTextSplitter(
|
||||||
@ -770,15 +776,17 @@ def save_docs_to_vector_db(
|
|||||||
end_index = start_index + len(doc.page_content)
|
end_index = start_index + len(doc.page_content)
|
||||||
|
|
||||||
start_time, end_time = interpolate_timestamp(
|
start_time, end_time = interpolate_timestamp(
|
||||||
start_index,
|
start_index, end_index, original_timestamp_map
|
||||||
end_index,
|
|
||||||
original_timestamp_map
|
|
||||||
)
|
)
|
||||||
|
|
||||||
doc.metadata.update({
|
doc.metadata.update(
|
||||||
|
{
|
||||||
"start_time": start_time,
|
"start_time": start_time,
|
||||||
"source_url": add_timestamp_to_youtube_url(doc.metadata['source_url'], int(start_time))
|
"source_url": add_timestamp_to_youtube_url(
|
||||||
})
|
doc.metadata["source_url"], int(start_time)
|
||||||
|
),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
# Remove the timestamp_map from individual chunks
|
# Remove the timestamp_map from individual chunks
|
||||||
doc.metadata.pop("timestamp_map", None)
|
doc.metadata.pop("timestamp_map", None)
|
||||||
@ -915,10 +923,7 @@ def process_file(
|
|||||||
"status": True,
|
"status": True,
|
||||||
"collection_name": form_data.collection_name,
|
"collection_name": form_data.collection_name,
|
||||||
"content": content,
|
"content": content,
|
||||||
"file": {
|
"file": {"id": file.id, "meta": metadata},
|
||||||
"id": file.id,
|
|
||||||
"meta": metadata
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
collection_name = form_data.collection_name
|
collection_name = form_data.collection_name
|
||||||
@ -1168,11 +1173,9 @@ def process_youtube_video(
|
|||||||
"size": len(content),
|
"size": len(content),
|
||||||
"source": form_data.url,
|
"source": form_data.url,
|
||||||
"source_url": add_timestamp_to_youtube_url(form_data.url, 0),
|
"source_url": add_timestamp_to_youtube_url(form_data.url, 0),
|
||||||
"type": "youtube"
|
"type": "youtube",
|
||||||
},
|
},
|
||||||
"data": {
|
"data": {"content": content},
|
||||||
"content": content
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
@ -1185,7 +1188,7 @@ def process_youtube_video(
|
|||||||
"type": "youtube",
|
"type": "youtube",
|
||||||
"name": video_title,
|
"name": video_title,
|
||||||
"file_id": file_id,
|
"file_id": file_id,
|
||||||
"created_by": user.id if user else None
|
"created_by": user.id if user else None,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Update all docs with the file metadata
|
# Update all docs with the file metadata
|
||||||
@ -1194,7 +1197,9 @@ def process_youtube_video(
|
|||||||
# Debug log
|
# Debug log
|
||||||
log.info(f"Document metadata before saving: {doc.metadata}")
|
log.info(f"Document metadata before saving: {doc.metadata}")
|
||||||
|
|
||||||
save_docs_to_vector_db(request, docs, collection_name, overwrite=False, add=True)
|
save_docs_to_vector_db(
|
||||||
|
request, docs, collection_name, overwrite=False, add=True
|
||||||
|
)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"status": True,
|
"status": True,
|
||||||
@ -1205,7 +1210,7 @@ def process_youtube_video(
|
|||||||
"data": {
|
"data": {
|
||||||
"content": content,
|
"content": content,
|
||||||
},
|
},
|
||||||
"meta": file_metadata
|
"meta": file_metadata,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -91,7 +91,9 @@
|
|||||||
<div class="text-sm dark:text-gray-400 flex items-center gap-2 w-fit">
|
<div class="text-sm dark:text-gray-400 flex items-center gap-2 w-fit">
|
||||||
<a
|
<a
|
||||||
class="hover:text-gray-500 hover:dark:text-gray-100 underline flex-grow"
|
class="hover:text-gray-500 hover:dark:text-gray-100 underline flex-grow"
|
||||||
href={document?.metadata?.file_id
|
href={document?.metadata?.type === 'youtube'
|
||||||
|
? document?.metadata?.source_url
|
||||||
|
: document?.metadata?.file_id
|
||||||
? `${WEBUI_API_BASE_URL}/files/${document?.metadata?.file_id}/content${document?.metadata?.page !== undefined ? `#page=${document.metadata.page + 1}` : ''}`
|
? `${WEBUI_API_BASE_URL}/files/${document?.metadata?.file_id}/content${document?.metadata?.page !== undefined ? `#page=${document.metadata.page + 1}` : ''}`
|
||||||
: document.source?.url?.includes('http')
|
: document.source?.url?.includes('http')
|
||||||
? document.source.url
|
? document.source.url
|
||||||
|
Loading…
Reference in New Issue
Block a user