Merge pull request #14781 from lucyknada/patch-2

fix: fix #14752 and add manual transcription retrieval
2025-06-14 10:20:52 +00:00 · 2025-06-08 18:40:28 +04:00 · 2025-06-08 18:40:28 +04:00 · 50d9a2ac58
commit 50d9a2ac58
parent 0b1c81791e b0965a8184
1 changed file with 28 additions and 6 deletions
--- a/backend/open_webui/retrieval/loaders/youtube.py
+++ b/backend/open_webui/retrieval/loaders/youtube.py
@@ -1,4 +1,5 @@
 import logging
 from xml.etree.ElementTree import ParseError
 from typing import Any, Dict, Generator, List, Optional, Sequence, Union
 from urllib.parse import parse_qs, urlparse
@@ -93,7 +94,6 @@ class YoutubeLoader:
                "http": self.proxy_url,
                "https": self.proxy_url,
            }
            # Don't log complete URL because it might contain secrets
            log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
        else:
            youtube_proxies = None
@@ -110,11 +110,35 @@ class YoutubeLoader:
        for lang in self.language:
            try:
                transcript = transcript_list.find_transcript([lang])
                if transcript.is_generated:
                    log.debug(f"Found generated transcript for language '{lang}'")
                    try:
                        transcript = transcript_list.find_manually_created_transcript(
                            [lang]
                        )
                        log.debug(f"Found manual transcript for language '{lang}'")
                    except NoTranscriptFound:
                        log.debug(
                            f"No manual transcript found for language '{lang}', using generated"
                        )
                        pass
                log.debug(f"Found transcript for language '{lang}'")
                try:
                    transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
                except ParseError:
                    log.debug(f"Empty or invalid transcript for language '{lang}'")
                    continue
                if not transcript_pieces:
                    log.debug(f"Empty transcript for language '{lang}'")
                    continue
                transcript_text = " ".join(
                    map(
-                        lambda transcript_piece: transcript_piece.text.strip(" "),
+                        lambda transcript_piece: transcript_piece.text.strip(" ")
+                        if hasattr(transcript_piece, "text")
+                        else "",
                        transcript_pieces,
                    )
                )
@@ -131,6 +155,4 @@ class YoutubeLoader:
        log.warning(
            f"No transcript found for any of the specified languages: {languages_tried}. Verify if the video has transcripts, add more languages if needed."
        )
-        raise NoTranscriptFound(
+        raise NoTranscriptFound(self.video_id, self.language, list(transcript_list))
-            f"No transcript found for any supported language. Verify if the video has transcripts, add more languages if needed."
-        )