diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py index d908cc8cb..eb4ed9a75 100644 --- a/backend/open_webui/retrieval/loaders/youtube.py +++ b/backend/open_webui/retrieval/loaders/youtube.py @@ -1,4 +1,5 @@ import logging +from xml.etree.ElementTree import ParseError from typing import Any, Dict, Generator, List, Optional, Sequence, Union from urllib.parse import parse_qs, urlparse @@ -93,7 +94,6 @@ class YoutubeLoader: "http": self.proxy_url, "https": self.proxy_url, } - # Don't log complete URL because it might contain secrets log.debug(f"Using proxy URL: {self.proxy_url[:14]}...") else: youtube_proxies = None @@ -110,11 +110,35 @@ class YoutubeLoader: for lang in self.language: try: transcript = transcript_list.find_transcript([lang]) + if transcript.is_generated: + log.debug(f"Found generated transcript for language '{lang}'") + try: + transcript = transcript_list.find_manually_created_transcript( + [lang] + ) + log.debug(f"Found manual transcript for language '{lang}'") + except NoTranscriptFound: + log.debug( + f"No manual transcript found for language '{lang}', using generated" + ) + pass + log.debug(f"Found transcript for language '{lang}'") - transcript_pieces: List[Dict[str, Any]] = transcript.fetch() + try: + transcript_pieces: List[Dict[str, Any]] = transcript.fetch() + except ParseError: + log.debug(f"Empty or invalid transcript for language '{lang}'") + continue + + if not transcript_pieces: + log.debug(f"Empty transcript for language '{lang}'") + continue + transcript_text = " ".join( map( - lambda transcript_piece: transcript_piece.text.strip(" "), + lambda transcript_piece: transcript_piece.text.strip(" ") + if hasattr(transcript_piece, "text") + else "", transcript_pieces, ) ) @@ -131,6 +155,4 @@ class YoutubeLoader: log.warning( f"No transcript found for any of the specified languages: {languages_tried}. Verify if the video has transcripts, add more languages if needed." ) - raise NoTranscriptFound( - f"No transcript found for any supported language. Verify if the video has transcripts, add more languages if needed." - ) + raise NoTranscriptFound(self.video_id, self.language, list(transcript_list))