From 7680ac25179aed4d48815e178aa22ac8399c6381 Mon Sep 17 00:00:00 2001 From: Classic298 <27028174+Classic298@users.noreply.github.com> Date: Mon, 5 May 2025 19:57:06 +0200 Subject: [PATCH] Update youtube.py --- .../open_webui/retrieval/loaders/youtube.py | 107 +++++++++++------- 1 file changed, 63 insertions(+), 44 deletions(-) diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py index f59dd7df5..337436960 100644 --- a/backend/open_webui/retrieval/loaders/youtube.py +++ b/backend/open_webui/retrieval/loaders/youtube.py @@ -70,48 +70,67 @@ class YoutubeLoader: self.language = language def load(self) -> List[Document]: - """Load YouTube transcripts into `Document` objects.""" - try: - from youtube_transcript_api import ( - NoTranscriptFound, - TranscriptsDisabled, - YouTubeTranscriptApi, - ) - except ImportError: - raise ImportError( - 'Could not import "youtube_transcript_api" Python package. ' - "Please install it with `pip install youtube-transcript-api`." - ) - - if self.proxy_url: - youtube_proxies = { - "http": self.proxy_url, - "https": self.proxy_url, - } - # Don't log complete URL because it might contain secrets - log.debug(f"Using proxy URL: {self.proxy_url[:14]}...") - else: - youtube_proxies = None - - try: - transcript_list = YouTubeTranscriptApi.list_transcripts( - self.video_id, proxies=youtube_proxies - ) - except Exception as e: - log.exception("Loading YouTube transcript failed") - return [] - - try: - transcript = transcript_list.find_transcript(self.language) - except NoTranscriptFound: - transcript = transcript_list.find_transcript(["en"]) - - transcript_pieces: List[Dict[str, Any]] = transcript.fetch() - - transcript = " ".join( - map( - lambda transcript_piece: transcript_piece.text.strip(" "), - transcript_pieces, - ) + """Load YouTube transcripts into `Document` objects.""" + try: + from youtube_transcript_api import ( + NoTranscriptFound, + TranscriptsDisabled, + YouTubeTranscriptApi, ) - return [Document(page_content=transcript, metadata=self._metadata)] + except ImportError: + raise ImportError( + 'Could not import "youtube_transcript_api" Python package. ' + "Please install it with `pip install youtube-transcript-api`." + ) + + if self.proxy_url: + youtube_proxies = { + "http": self.proxy_url, + "https": self.proxy_url, + } + # Don't log complete URL because it might contain secrets + log.debug(f"Using proxy URL: {self.proxy_url[:14]}...") + else: + youtube_proxies = None + + try: + transcript_list = YouTubeTranscriptApi.list_transcripts( + self.video_id, proxies=youtube_proxies + ) + except Exception as e: + log.exception("Loading YouTube transcript failed") + return [] + + # Try each language in order of priority + last_exception = None + for lang in self.language: + try: + log.debug(f"Attempting to find transcript for language '{lang}'") + transcript = transcript_list.find_transcript([lang]) + log.info(f"Found transcript for language '{lang}'") + + transcript_pieces: List[Dict[str, Any]] = transcript.fetch() + transcript_text = " ".join( + map( + lambda transcript_piece: transcript_piece.text.strip(" "), + transcript_pieces, + ) + ) + return [Document(page_content=transcript_text, metadata=self._metadata)] + except NoTranscriptFound as e: + log.debug(f"No transcript found for language '{lang}'") + last_exception = e + continue + except Exception as e: + # If we hit any other type of exception, log it and re-raise + log.exception(f"Error finding transcript for language '{lang}'") + raise e + + # If all specified languages fail, raise the last exception + # This maintains compatibility with the error handling in the rest of the application + if last_exception: + log.warning(f"No transcript found for any of the specified languages: {', '.join(self.language)}") + raise last_exception + + # This should never happen (we'd have raised an exception above) + return []