From 3216e8f9048f24ec37147eeabe7be9c55eb47538 Mon Sep 17 00:00:00 2001 From: Classic298 <27028174+Classic298@users.noreply.github.com> Date: Tue, 10 Jun 2025 12:46:21 +0200 Subject: [PATCH] Update youtube.py --- .../open_webui/retrieval/loaders/youtube.py | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py index f5ff3a638..be5e53358 100644 --- a/backend/open_webui/retrieval/loaders/youtube.py +++ b/backend/open_webui/retrieval/loaders/youtube.py @@ -40,13 +40,11 @@ def _parse_video_id(url: str) -> Optional[str]: video_id = ids if isinstance(ids, str) else ids[0] else: return None - elif parsed_url.netloc == "youtu.be": - video_id = parsed_url.path.lstrip("/").split("?")[0] else: path = parsed_url.path.lstrip("/") - video_id = path.split("/")[-1].split("?")[0] + video_id = path.split("/")[-1] - if len(video_id) != 11: + if len(video_id) != 11: # Video IDs are 11 characters long return None return video_id @@ -111,19 +109,19 @@ class YoutubeLoader: # Try each language in order of priority for lang in self.language: try: - try: - transcript = transcript_list.find_manually_created_transcript([lang]) - log.debug(f"Found manual transcript for language '{lang}'") - except NoTranscriptFound: - transcript = transcript_list.find_generated_transcript([lang]) - log.debug(f"Found auto-generated transcript for language '{lang}'") - - log.debug(f"Found transcript for language '{lang}'") - try: - transcript_pieces: List[Dict[str, Any]] = transcript.fetch() - except ParseError: - log.debug(f"Empty or invalid transcript for language '{lang}'") - continue + transcript = transcript_list.find_transcript([lang]) + if transcript.is_generated: + log.debug(f"Found generated transcript for language '{lang}'") + try: + transcript = transcript_list.find_manually_created_transcript( + [lang] + ) + log.debug(f"Found manual transcript for language '{lang}'") + except NoTranscriptFound: + log.debug( + f"No manual transcript found for language '{lang}', using generated" + ) + pass log.debug(f"Found transcript for language '{lang}'") try: