feat: Update CitationsModal to use source_url for YouTube documents and file_id for others

2025-01-19 09:16:44 +00:00 · 2024-12-31 12:47:32 +01:00 · 2024-12-31 12:47:32 +01:00 · 49f462163a
commit 49f462163a
parent ba3fe33ef8
8 changed files with 64 additions and 49 deletions
--- a/backend/open_webui/retrieval/loaders/youtube.py
+++ b/backend/open_webui/retrieval/loaders/youtube.py
@@ -78,6 +78,7 @@ class YoutubeLoader:
            # First try using YouTube Data API v3 if available
            try:
                from open_webui.config import YOUTUBE_API_KEY
+
                if YOUTUBE_API_KEY:
                    url = f"https://www.googleapis.com/youtube/v3/videos?id={self.video_id}&key={YOUTUBE_API_KEY}&part=snippet"
                    response = requests.get(url)
@@ -93,7 +94,8 @@ class YoutubeLoader:
            response = requests.get(url)
            if response.status_code == 200:
                import re
-                title_match = re.search(r'<title>(.+?)</title>', response.text)
+
+                title_match = re.search(r"<title>(.+?)</title>", response.text)
                if title_match:
                    title = title_match.group(1)
                    return title
@@ -139,10 +141,14 @@ class YoutubeLoader:
            transcript = transcript_list.find_transcript(self.language)
        except NoTranscriptFound:
            # Fallback: try to get any available transcript
-            available_transcripts = list(transcript_list._generated_transcripts.values())
+            available_transcripts = list(
+                transcript_list._generated_transcripts.values()
+            )
            if available_transcripts:
                transcript = available_transcripts[0]
-                log.info(f"Using first available transcript in language: {transcript.language_code}")
+                log.info(
+                    f"Using first available transcript in language: {transcript.language_code}"
+                )
            else:
                log.error("No transcripts found for video")
                return []
@@ -168,20 +174,22 @@ class YoutubeLoader:
            full_text += text + " "
            end_char = len(full_text)

-            timestamp_map.append({
-                "start": start_char,
-                "end": end_char,
-                "time": piece["start"],
-                "duration": piece["duration"]
-            })
+            timestamp_map.append(
+                {
+                    "start": start_char,
+                    "end": end_char,
+                    "time": piece["start"],
+                    "duration": piece["duration"],
+                }
+            )

        # Create a single document that will be split by Langchain's text splitter
        doc = Document(
            page_content=full_text.strip(),
            metadata={
                **self._metadata,
-                "timestamp_map": timestamp_map  # Store timestamp mapping in metadata
-            }
+                "timestamp_map": timestamp_map,  # Store timestamp mapping in metadata
+            },
        )

        return [doc]
--- a/backend/open_webui/routers/retrieval.py
+++ b/backend/open_webui/routers/retrieval.py
@@ -152,15 +152,15 @@ def get_rf(
                raise Exception(ERROR_MESSAGES.DEFAULT("CrossEncoder error"))
    return rf

+
 def add_timestamp_to_youtube_url(url: str, timestamp: int) -> str:
    parsed = urlparse(url)
    query_dict = parse_qs(parsed.query)
-    query_dict['t'] = [str(timestamp)]
+    query_dict["t"] = [str(timestamp)]
    new_query = urlencode(query_dict, doseq=True)
    return urlunparse(parsed._replace(query=new_query))


-
 ##########################################
 #
 # API routes
@@ -662,7 +662,9 @@ async def update_query_settings(
 ####################################


-def interpolate_timestamp(chunk_start: int, chunk_end: int, timestamp_map: List[dict]) -> Tuple[float, float]:
+def interpolate_timestamp(
+    chunk_start: int, chunk_end: int, timestamp_map: List[dict]
+) -> Tuple[float, float]:
    """
    Find the appropriate timestamp for a chunk based on its character position
    Returns (start_time, end_time) as floats in seconds
@@ -675,7 +677,8 @@ def interpolate_timestamp(chunk_start: int, chunk_end: int, timestamp_map: List[
    else:
        # If not found, use the closest previous timestamp
        start_time = min(
-            [e["time"] for e in timestamp_map if e["start"] <= chunk_start], default=0)
+            [e["time"] for e in timestamp_map if e["start"] <= chunk_start], default=0
+        )

    # Find the timestamp entry that contains the end of our chunk
    for entry in reversed(timestamp_map):
@@ -684,11 +687,14 @@ def interpolate_timestamp(chunk_start: int, chunk_end: int, timestamp_map: List[
            break
    else:
        # If not found, use the closest next timestamp
-        end_time = max([e["time"] + e["duration"]
-                       for e in timestamp_map if e["end"] >= chunk_end], default=start_time)
+        end_time = max(
+            [e["time"] + e["duration"] for e in timestamp_map if e["end"] >= chunk_end],
+            default=start_time,
+        )

    return start_time, end_time

+
 def save_docs_to_vector_db(
    request: Request,
    docs,
@@ -733,12 +739,12 @@ def save_docs_to_vector_db(

    if split:
        # Check if this is a YouTube document by looking at the first doc's metadata
-        is_youtube = (len(docs) == 1 and
-                      docs[0].metadata.get("type") == "youtube")
+        is_youtube = len(docs) == 1 and docs[0].metadata.get("type") == "youtube"

        # Store timestamp_map before splitting if it's a YouTube document
-        original_timestamp_map = docs[0].metadata.get(
-            "timestamp_map") if is_youtube else None
+        original_timestamp_map = (
+            docs[0].metadata.get("timestamp_map") if is_youtube else None
+        )

        if request.app.state.config.TEXT_SPLITTER in ["", "character"]:
            text_splitter = RecursiveCharacterTextSplitter(
@@ -770,15 +776,17 @@ def save_docs_to_vector_db(
                end_index = start_index + len(doc.page_content)

                start_time, end_time = interpolate_timestamp(
-                    start_index,
-                    end_index,
-                    original_timestamp_map
+                    start_index, end_index, original_timestamp_map
                )

-                doc.metadata.update({
-                    "start_time": start_time,
-                    "source_url": add_timestamp_to_youtube_url(doc.metadata['source_url'], int(start_time))
-                })
+                doc.metadata.update(
+                    {
+                        "start_time": start_time,
+                        "source_url": add_timestamp_to_youtube_url(
+                            doc.metadata["source_url"], int(start_time)
+                        ),
+                    }
+                )

                # Remove the timestamp_map from individual chunks
                doc.metadata.pop("timestamp_map", None)
@@ -915,10 +923,7 @@ def process_file(
                "status": True,
                "collection_name": form_data.collection_name,
                "content": content,
-                "file": {
-                    "id": file.id,
-                    "meta": metadata
-                }
+                "file": {"id": file.id, "meta": metadata},
            }

        collection_name = form_data.collection_name
@@ -1148,7 +1153,7 @@ def process_youtube_video(
        content = " ".join([doc.page_content for doc in docs])
        log.debug(f"text_content: {content}")

-                # Get video title from metadata or fallback to URL
+        # Get video title from metadata or fallback to URL
        video_title = docs[0].metadata.get("title", form_data.url)

        # Create a unique file ID for this video
@@ -1168,11 +1173,9 @@ def process_youtube_video(
                        "size": len(content),
                        "source": form_data.url,
                        "source_url": add_timestamp_to_youtube_url(form_data.url, 0),
-                        "type": "youtube"
+                        "type": "youtube",
                    },
-                    "data": {
-                        "content": content
-                    }
+                    "data": {"content": content},
                }
            ),
        )
@@ -1185,7 +1188,7 @@ def process_youtube_video(
            "type": "youtube",
            "name": video_title,
            "file_id": file_id,
-            "created_by": user.id if user else None
+            "created_by": user.id if user else None,
        }

        # Update all docs with the file metadata
@@ -1194,18 +1197,20 @@ def process_youtube_video(
            # Debug log
            log.info(f"Document metadata before saving: {doc.metadata}")

-        save_docs_to_vector_db(request, docs, collection_name, overwrite=False, add=True)
+        save_docs_to_vector_db(
+            request, docs, collection_name, overwrite=False, add=True
+        )

        return {
            "status": True,
            "collection_name": collection_name,
            "id": file_id,  # Return the file ID directly
-             "filename": video_title,
+            "filename": video_title,
            "file": {
                "data": {
                    "content": content,
                },
-                "meta": file_metadata
+                "meta": file_metadata,
            },
        }
    except Exception as e:
--- a/src/lib/components/chat/Messages/CitationsModal.svelte
+++ b/src/lib/components/chat/Messages/CitationsModal.svelte
@@ -91,7 +91,9 @@
 								<div class="text-sm dark:text-gray-400 flex items-center gap-2 w-fit">
 									<a
 										class="hover:text-gray-500 hover:dark:text-gray-100 underline flex-grow"
-										href={document?.metadata?.file_id
+										href={document?.metadata?.type === 'youtube'
+										? document?.metadata?.source_url
+										: document?.metadata?.file_id
 											? `${WEBUI_API_BASE_URL}/files/${document?.metadata?.file_id}/content${document?.metadata?.page !== undefined ? `#page=${document.metadata.page + 1}` : ''}`
 											: document.source?.url?.includes('http')
 												? document.source.url