diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py index f59dd7df5..763d73094 100644 --- a/backend/open_webui/retrieval/loaders/youtube.py +++ b/backend/open_webui/retrieval/loaders/youtube.py @@ -62,12 +62,17 @@ class YoutubeLoader: _video_id = _parse_video_id(video_id) self.video_id = _video_id if _video_id is not None else video_id self._metadata = {"source": video_id} - self.language = language self.proxy_url = proxy_url + + # Ensure language is a list if isinstance(language, str): self.language = [language] else: - self.language = language + self.language = list(language) + + # Add English as fallback if not already in the list + if "en" not in self.language: + self.language.append("en") def load(self) -> List[Document]: """Load YouTube transcripts into `Document` objects.""" @@ -82,7 +87,7 @@ class YoutubeLoader: 'Could not import "youtube_transcript_api" Python package. ' "Please install it with `pip install youtube-transcript-api`." ) - + if self.proxy_url: youtube_proxies = { "http": self.proxy_url, @@ -92,7 +97,7 @@ class YoutubeLoader: log.debug(f"Using proxy URL: {self.proxy_url[:14]}...") else: youtube_proxies = None - + try: transcript_list = YouTubeTranscriptApi.list_transcripts( self.video_id, proxies=youtube_proxies @@ -100,18 +105,28 @@ class YoutubeLoader: except Exception as e: log.exception("Loading YouTube transcript failed") return [] - - try: - transcript = transcript_list.find_transcript(self.language) - except NoTranscriptFound: - transcript = transcript_list.find_transcript(["en"]) - - transcript_pieces: List[Dict[str, Any]] = transcript.fetch() - - transcript = " ".join( - map( - lambda transcript_piece: transcript_piece.text.strip(" "), - transcript_pieces, - ) - ) - return [Document(page_content=transcript, metadata=self._metadata)] + + # Try each language in order of priority + for lang in self.language: + try: + transcript = transcript_list.find_transcript([lang]) + log.debug(f"Found transcript for language '{lang}'") + transcript_pieces: List[Dict[str, Any]] = transcript.fetch() + transcript_text = " ".join( + map( + lambda transcript_piece: transcript_piece.text.strip(" "), + transcript_pieces, + ) + ) + return [Document(page_content=transcript_text, metadata=self._metadata)] + except NoTranscriptFound: + log.debug(f"No transcript found for language '{lang}'") + continue + except Exception as e: + log.info(f"Error finding transcript for language '{lang}'") + raise e + + # If we get here, all languages failed + languages_tried = ", ".join(self.language) + log.warning(f"No transcript found for any of the specified languages: {languages_tried}. Verify if the video has transcripts, add more languages if needed.") + raise NoTranscriptFound(f"No transcript found for any supported language. Verify if the video has transcripts, add more languages if needed.")