From 17e100661cd2acbab9fe56cb7e56313739e25673 Mon Sep 17 00:00:00 2001 From: Classic298 <27028174+Classic298@users.noreply.github.com> Date: Tue, 10 Jun 2025 12:40:28 +0200 Subject: [PATCH] Update youtube.py --- .../open_webui/retrieval/loaders/youtube.py | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py index be5e53358..f5ff3a638 100644 --- a/backend/open_webui/retrieval/loaders/youtube.py +++ b/backend/open_webui/retrieval/loaders/youtube.py @@ -40,11 +40,13 @@ def _parse_video_id(url: str) -> Optional[str]: video_id = ids if isinstance(ids, str) else ids[0] else: return None + elif parsed_url.netloc == "youtu.be": + video_id = parsed_url.path.lstrip("/").split("?")[0] else: path = parsed_url.path.lstrip("/") - video_id = path.split("/")[-1] + video_id = path.split("/")[-1].split("?")[0] - if len(video_id) != 11: # Video IDs are 11 characters long + if len(video_id) != 11: return None return video_id @@ -109,19 +111,19 @@ class YoutubeLoader: # Try each language in order of priority for lang in self.language: try: - transcript = transcript_list.find_transcript([lang]) - if transcript.is_generated: - log.debug(f"Found generated transcript for language '{lang}'") - try: - transcript = transcript_list.find_manually_created_transcript( - [lang] - ) - log.debug(f"Found manual transcript for language '{lang}'") - except NoTranscriptFound: - log.debug( - f"No manual transcript found for language '{lang}', using generated" - ) - pass + try: + transcript = transcript_list.find_manually_created_transcript([lang]) + log.debug(f"Found manual transcript for language '{lang}'") + except NoTranscriptFound: + transcript = transcript_list.find_generated_transcript([lang]) + log.debug(f"Found auto-generated transcript for language '{lang}'") + + log.debug(f"Found transcript for language '{lang}'") + try: + transcript_pieces: List[Dict[str, Any]] = transcript.fetch() + except ParseError: + log.debug(f"Empty or invalid transcript for language '{lang}'") + continue log.debug(f"Found transcript for language '{lang}'") try: