From 7680ac25179aed4d48815e178aa22ac8399c6381 Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Mon, 5 May 2025 19:57:06 +0200
Subject: [PATCH 01/15] youtube.py: try each configured transcript language in priority order
---
.../open_webui/retrieval/loaders/youtube.py | 107 +++++++++++-------
1 file changed, 63 insertions(+), 44 deletions(-)
diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py
index f59dd7df5..337436960 100644
--- a/backend/open_webui/retrieval/loaders/youtube.py
+++ b/backend/open_webui/retrieval/loaders/youtube.py
@@ -70,48 +70,67 @@ class YoutubeLoader:
self.language = language
def load(self) -> List[Document]:
- """Load YouTube transcripts into `Document` objects."""
- try:
- from youtube_transcript_api import (
- NoTranscriptFound,
- TranscriptsDisabled,
- YouTubeTranscriptApi,
- )
- except ImportError:
- raise ImportError(
- 'Could not import "youtube_transcript_api" Python package. '
- "Please install it with `pip install youtube-transcript-api`."
- )
-
- if self.proxy_url:
- youtube_proxies = {
- "http": self.proxy_url,
- "https": self.proxy_url,
- }
- # Don't log complete URL because it might contain secrets
- log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
- else:
- youtube_proxies = None
-
- try:
- transcript_list = YouTubeTranscriptApi.list_transcripts(
- self.video_id, proxies=youtube_proxies
- )
- except Exception as e:
- log.exception("Loading YouTube transcript failed")
- return []
-
- try:
- transcript = transcript_list.find_transcript(self.language)
- except NoTranscriptFound:
- transcript = transcript_list.find_transcript(["en"])
-
- transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
-
- transcript = " ".join(
- map(
- lambda transcript_piece: transcript_piece.text.strip(" "),
- transcript_pieces,
- )
+ """Load YouTube transcripts into `Document` objects."""
+ try:
+ from youtube_transcript_api import (
+ NoTranscriptFound,
+ TranscriptsDisabled,
+ YouTubeTranscriptApi,
)
- return [Document(page_content=transcript, metadata=self._metadata)]
+ except ImportError:
+ raise ImportError(
+ 'Could not import "youtube_transcript_api" Python package. '
+ "Please install it with `pip install youtube-transcript-api`."
+ )
+
+ if self.proxy_url:
+ youtube_proxies = {
+ "http": self.proxy_url,
+ "https": self.proxy_url,
+ }
+ # Don't log complete URL because it might contain secrets
+ log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
+ else:
+ youtube_proxies = None
+
+ try:
+ transcript_list = YouTubeTranscriptApi.list_transcripts(
+ self.video_id, proxies=youtube_proxies
+ )
+ except Exception as e:
+ log.exception("Loading YouTube transcript failed")
+ return []
+
+ # Try each language in order of priority
+ last_exception = None
+ for lang in self.language:
+ try:
+ log.debug(f"Attempting to find transcript for language '{lang}'")
+ transcript = transcript_list.find_transcript([lang])
+ log.info(f"Found transcript for language '{lang}'")
+
+ transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
+ transcript_text = " ".join(
+ map(
+ lambda transcript_piece: transcript_piece.text.strip(" "),
+ transcript_pieces,
+ )
+ )
+ return [Document(page_content=transcript_text, metadata=self._metadata)]
+ except NoTranscriptFound as e:
+ log.debug(f"No transcript found for language '{lang}'")
+ last_exception = e
+ continue
+ except Exception as e:
+ # If we hit any other type of exception, log it and re-raise
+ log.exception(f"Error finding transcript for language '{lang}'")
+ raise e
+
+ # If all specified languages fail, raise the last exception
+ # This maintains compatibility with the error handling in the rest of the application
+ if last_exception:
+ log.warning(f"No transcript found for any of the specified languages: {', '.join(self.language)}")
+ raise last_exception
+
+ # This should never happen (we'd have raised an exception above)
+ return []
From 0a845db8eca7554d6310b7fad4d7360e2db66b91 Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Mon, 5 May 2025 19:57:21 +0200
Subject: [PATCH 02/15] youtube.py: whitespace-only reformat of load()
---
.../open_webui/retrieval/loaders/youtube.py | 122 +++++++++---------
1 file changed, 61 insertions(+), 61 deletions(-)
diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py
index 337436960..c1c8669f1 100644
--- a/backend/open_webui/retrieval/loaders/youtube.py
+++ b/backend/open_webui/retrieval/loaders/youtube.py
@@ -70,67 +70,67 @@ class YoutubeLoader:
self.language = language
def load(self) -> List[Document]:
- """Load YouTube transcripts into `Document` objects."""
- try:
- from youtube_transcript_api import (
- NoTranscriptFound,
- TranscriptsDisabled,
- YouTubeTranscriptApi,
- )
- except ImportError:
- raise ImportError(
- 'Could not import "youtube_transcript_api" Python package. '
- "Please install it with `pip install youtube-transcript-api`."
- )
-
- if self.proxy_url:
- youtube_proxies = {
- "http": self.proxy_url,
- "https": self.proxy_url,
- }
- # Don't log complete URL because it might contain secrets
- log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
- else:
- youtube_proxies = None
-
- try:
- transcript_list = YouTubeTranscriptApi.list_transcripts(
- self.video_id, proxies=youtube_proxies
- )
- except Exception as e:
- log.exception("Loading YouTube transcript failed")
- return []
-
- # Try each language in order of priority
- last_exception = None
- for lang in self.language:
+ """Load YouTube transcripts into `Document` objects."""
try:
- log.debug(f"Attempting to find transcript for language '{lang}'")
- transcript = transcript_list.find_transcript([lang])
- log.info(f"Found transcript for language '{lang}'")
-
- transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
- transcript_text = " ".join(
- map(
- lambda transcript_piece: transcript_piece.text.strip(" "),
- transcript_pieces,
- )
+ from youtube_transcript_api import (
+ NoTranscriptFound,
+ TranscriptsDisabled,
+ YouTubeTranscriptApi,
+ )
+ except ImportError:
+ raise ImportError(
+ 'Could not import "youtube_transcript_api" Python package. '
+ "Please install it with `pip install youtube-transcript-api`."
)
- return [Document(page_content=transcript_text, metadata=self._metadata)]
- except NoTranscriptFound as e:
- log.debug(f"No transcript found for language '{lang}'")
- last_exception = e
- continue
- except Exception as e:
- # If we hit any other type of exception, log it and re-raise
- log.exception(f"Error finding transcript for language '{lang}'")
- raise e
-
- # If all specified languages fail, raise the last exception
- # This maintains compatibility with the error handling in the rest of the application
- if last_exception:
- log.warning(f"No transcript found for any of the specified languages: {', '.join(self.language)}")
- raise last_exception
- # This should never happen (we'd have raised an exception above)
- return []
+ if self.proxy_url:
+ youtube_proxies = {
+ "http": self.proxy_url,
+ "https": self.proxy_url,
+ }
+ # Don't log complete URL because it might contain secrets
+ log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
+ else:
+ youtube_proxies = None
+
+ try:
+ transcript_list = YouTubeTranscriptApi.list_transcripts(
+ self.video_id, proxies=youtube_proxies
+ )
+ except Exception as e:
+ log.exception("Loading YouTube transcript failed")
+ return []
+
+ # Try each language in order of priority
+ last_exception = None
+ for lang in self.language:
+ try:
+ log.debug(f"Attempting to find transcript for language '{lang}'")
+ transcript = transcript_list.find_transcript([lang])
+ log.info(f"Found transcript for language '{lang}'")
+
+ transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
+ transcript_text = " ".join(
+ map(
+ lambda transcript_piece: transcript_piece.text.strip(" "),
+ transcript_pieces,
+ )
+ )
+ return [Document(page_content=transcript_text, metadata=self._metadata)]
+ except NoTranscriptFound as e:
+ log.debug(f"No transcript found for language '{lang}'")
+ last_exception = e
+ continue
+ except Exception as e:
+ # If we hit any other type of exception, log it and re-raise
+ log.exception(f"Error finding transcript for language '{lang}'")
+ raise e
+
+ # If all specified languages fail, raise the last exception
+ # This maintains compatibility with the error handling in the rest of the application
+ if last_exception:
+ log.warning(f"No transcript found for any of the specified languages: {', '.join(self.language)}")
+ raise last_exception
+
+ # This should never happen (we'd have raised an exception above)
+ return []
From 0a3817ed860b2f1d1db190ec6a539b037d1f0701 Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Mon, 5 May 2025 20:00:10 +0200
Subject: [PATCH 03/15] youtube.py: reduce load() log verbosity and drop redundant comments
---
backend/open_webui/retrieval/loaders/youtube.py | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py
index c1c8669f1..48347aa09 100644
--- a/backend/open_webui/retrieval/loaders/youtube.py
+++ b/backend/open_webui/retrieval/loaders/youtube.py
@@ -88,8 +88,7 @@ class YoutubeLoader:
"http": self.proxy_url,
"https": self.proxy_url,
}
- # Don't log complete URL because it might contain secrets
- log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
+ log.debug(f"Using proxy URL: {self.proxy_url}...")
else:
youtube_proxies = None
@@ -105,9 +104,8 @@ class YoutubeLoader:
last_exception = None
for lang in self.language:
try:
- log.debug(f"Attempting to find transcript for language '{lang}'")
transcript = transcript_list.find_transcript([lang])
- log.info(f"Found transcript for language '{lang}'")
+ log.debug(f"Found transcript for language '{lang}'")
transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
transcript_text = " ".join(
@@ -127,10 +125,8 @@ class YoutubeLoader:
raise e
# If all specified languages fail, raise the last exception
- # This maintains compatibility with the error handling in the rest of the application
if last_exception:
log.warning(f"No transcript found for any of the specified languages: {', '.join(self.language)}")
raise last_exception
- # This should never happen (we'd have raised an exception above)
return []
From 1a30b3746ed05e9888b038e025075b6e1c17767a Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Mon, 5 May 2025 20:03:00 +0200
Subject: [PATCH 04/15] youtube.py: fall back to English transcript; truncate proxy URL in log
---
.../open_webui/retrieval/loaders/youtube.py | 38 ++++++++++++++-----
1 file changed, 28 insertions(+), 10 deletions(-)
diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py
index 48347aa09..0bd286bca 100644
--- a/backend/open_webui/retrieval/loaders/youtube.py
+++ b/backend/open_webui/retrieval/loaders/youtube.py
@@ -88,7 +88,8 @@ class YoutubeLoader:
"http": self.proxy_url,
"https": self.proxy_url,
}
- log.debug(f"Using proxy URL: {self.proxy_url}...")
+ # Don't log complete URL because it might contain secrets
+ log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
else:
youtube_proxies = None
@@ -101,12 +102,10 @@ class YoutubeLoader:
return []
# Try each language in order of priority
- last_exception = None
for lang in self.language:
try:
transcript = transcript_list.find_transcript([lang])
log.debug(f"Found transcript for language '{lang}'")
-
transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
transcript_text = " ".join(
map(
@@ -115,18 +114,37 @@ class YoutubeLoader:
)
)
return [Document(page_content=transcript_text, metadata=self._metadata)]
- except NoTranscriptFound as e:
+ except NoTranscriptFound:
log.debug(f"No transcript found for language '{lang}'")
- last_exception = e
continue
except Exception as e:
# If we hit any other type of exception, log it and re-raise
log.exception(f"Error finding transcript for language '{lang}'")
raise e
- # If all specified languages fail, raise the last exception
- if last_exception:
- log.warning(f"No transcript found for any of the specified languages: {', '.join(self.language)}")
- raise last_exception
+ # If all specified languages fail, fall back to English (unless English was already tried)
+ if "en" not in self.language:
+ try:
+ log.debug("Falling back to English transcript")
+ transcript = transcript_list.find_transcript(["en"])
+ transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
+ transcript_text = " ".join(
+ map(
+ lambda transcript_piece: transcript_piece.text.strip(" "),
+ transcript_pieces,
+ )
+ )
+ return [Document(page_content=transcript_text, metadata=self._metadata)]
+ except NoTranscriptFound:
+ log.warning("No English transcript found as fallback")
+ except Exception as e:
+ log.exception("Error finding English transcript fallback")
+ raise e
- return []
+ # If we get here, all languages failed including the English fallback
+ languages_tried = ", ".join(self.language)
+ if "en" not in self.language:
+ languages_tried += ", en (fallback)"
+
+ log.warning(f"No transcript found for any of the specified languages: {languages_tried}")
+ raise NoTranscriptFound(f"No transcript found for any supported language")
From b0d74a59f14d8f9c8fbe6aa2676039523a45ef62 Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Mon, 5 May 2025 20:07:37 +0200
Subject: [PATCH 05/15] youtube.py: log transcript lookup errors with log.warning; shorten comments
---
backend/open_webui/retrieval/loaders/youtube.py | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py
index 0bd286bca..958dcfd61 100644
--- a/backend/open_webui/retrieval/loaders/youtube.py
+++ b/backend/open_webui/retrieval/loaders/youtube.py
@@ -118,8 +118,7 @@ class YoutubeLoader:
log.debug(f"No transcript found for language '{lang}'")
continue
except Exception as e:
- # If we hit any other type of exception, log it and re-raise
- log.exception(f"Error finding transcript for language '{lang}'")
+ log.warning(f"Error finding transcript for language '{lang}'")
raise e
# If all specified languages fail, fall back to English (unless English was already tried)
@@ -141,7 +140,7 @@ class YoutubeLoader:
log.exception("Error finding English transcript fallback")
raise e
- # If we get here, all languages failed including the English fallback
+ # All languages failed
languages_tried = ", ".join(self.language)
if "en" not in self.language:
languages_tried += ", en (fallback)"
From 9cf33813813f92dc97ce33c4b89e79dcdc3f3a13 Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Mon, 5 May 2025 20:07:52 +0200
Subject: [PATCH 06/15] youtube.py: log the full proxy URL instead of a truncated prefix
---
backend/open_webui/retrieval/loaders/youtube.py | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py
index 958dcfd61..ea8983b31 100644
--- a/backend/open_webui/retrieval/loaders/youtube.py
+++ b/backend/open_webui/retrieval/loaders/youtube.py
@@ -88,8 +88,7 @@ class YoutubeLoader:
"http": self.proxy_url,
"https": self.proxy_url,
}
- # Don't log complete URL because it might contain secrets
- log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
+ log.debug(f"Using proxy URL: {self.proxy_url}...")
else:
youtube_proxies = None
From 791dd24ace6054d1822c4ad76f272c3228337d8c Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Mon, 5 May 2025 20:08:25 +0200
Subject: [PATCH 07/15] youtube.py: revert to truncated proxy URL in debug log (may contain secrets)
---
backend/open_webui/retrieval/loaders/youtube.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py
index ea8983b31..958dcfd61 100644
--- a/backend/open_webui/retrieval/loaders/youtube.py
+++ b/backend/open_webui/retrieval/loaders/youtube.py
@@ -88,7 +88,8 @@ class YoutubeLoader:
"http": self.proxy_url,
"https": self.proxy_url,
}
- log.debug(f"Using proxy URL: {self.proxy_url}...")
+ # Don't log complete URL because it might contain secrets
+ log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
else:
youtube_proxies = None
From 67a612fe2404edd7819717005981070339043932 Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Mon, 5 May 2025 20:40:48 +0200
Subject: [PATCH 08/15] youtube.py: log transcript lookup errors at info level
---
backend/open_webui/retrieval/loaders/youtube.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py
index 958dcfd61..1f78131e2 100644
--- a/backend/open_webui/retrieval/loaders/youtube.py
+++ b/backend/open_webui/retrieval/loaders/youtube.py
@@ -118,7 +118,7 @@ class YoutubeLoader:
log.debug(f"No transcript found for language '{lang}'")
continue
except Exception as e:
- log.warning(f"Error finding transcript for language '{lang}'")
+ log.info(f"Error finding transcript for language '{lang}'")
raise e
# If all specified languages fail, fall back to English (unless English was already tried)
From 5e1cb76b93ea3b632ca0ddf9cbe308fd8ecd1d4d Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Tue, 6 May 2025 16:16:58 +0200
Subject: [PATCH 09/15] youtube.py: fold English fallback into the language loop
---
.../open_webui/retrieval/loaders/youtube.py | 36 ++++++-------------
1 file changed, 11 insertions(+), 25 deletions(-)
diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py
index 1f78131e2..67b3715fd 100644
--- a/backend/open_webui/retrieval/loaders/youtube.py
+++ b/backend/open_webui/retrieval/loaders/youtube.py
@@ -101,8 +101,16 @@ class YoutubeLoader:
log.exception("Loading YouTube transcript failed")
return []
+ # Make a copy of the language list to avoid modifying the original
+ languages_to_try = list(self.language)
+
+ # Add English as fallback, if not already in the list
+ if "en" not in languages_to_try:
+ log.debug("Adding English as fallback language")
+ languages_to_try.append("en")
+
# Try each language in order of priority
- for lang in self.language:
+ for lang in languages_to_try:
try:
transcript = transcript_list.find_transcript([lang])
log.debug(f"Found transcript for language '{lang}'")
@@ -120,30 +128,8 @@ class YoutubeLoader:
except Exception as e:
log.info(f"Error finding transcript for language '{lang}'")
raise e
-
- # If all specified languages fail, fall back to English (unless English was already tried)
- if "en" not in self.language:
- try:
- log.debug("Falling back to English transcript")
- transcript = transcript_list.find_transcript(["en"])
- transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
- transcript_text = " ".join(
- map(
- lambda transcript_piece: transcript_piece.text.strip(" "),
- transcript_pieces,
- )
- )
- return [Document(page_content=transcript_text, metadata=self._metadata)]
- except NoTranscriptFound:
- log.warning("No English transcript found as fallback")
- except Exception as e:
- log.exception("Error finding English transcript fallback")
- raise e
-
- # All languages failed
- languages_tried = ", ".join(self.language)
- if "en" not in self.language:
- languages_tried += ", en (fallback)"
+ # If we get here, all languages failed including the English fallback
+ languages_tried = ", ".join(languages_to_try)
log.warning(f"No transcript found for any of the specified languages: {languages_tried}")
raise NoTranscriptFound(f"No transcript found for any supported language")
From a129e0954ec7be642d57df816c98bd8a05c99d87 Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Tue, 6 May 2025 16:22:40 +0200
Subject: [PATCH 10/15] youtube.py: add troubleshooting hint to NoTranscriptFound message; tidy comments
---
backend/open_webui/retrieval/loaders/youtube.py | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py
index 67b3715fd..88938d0f2 100644
--- a/backend/open_webui/retrieval/loaders/youtube.py
+++ b/backend/open_webui/retrieval/loaders/youtube.py
@@ -64,6 +64,7 @@ class YoutubeLoader:
self._metadata = {"source": video_id}
self.language = language
self.proxy_url = proxy_url
+ # If language is string, convert to list
if isinstance(language, str):
self.language = [language]
else:
@@ -100,7 +101,7 @@ class YoutubeLoader:
except Exception as e:
log.exception("Loading YouTube transcript failed")
return []
-
+
# Make a copy of the language list to avoid modifying the original
languages_to_try = list(self.language)
@@ -129,7 +130,7 @@ class YoutubeLoader:
log.info(f"Error finding transcript for language '{lang}'")
raise e
- # If we get here, all languages failed including the English fallback
+ # If we get here, all languages failed
languages_tried = ", ".join(languages_to_try)
log.warning(f"No transcript found for any of the specified languages: {languages_tried}")
- raise NoTranscriptFound(f"No transcript found for any supported language")
+ raise NoTranscriptFound(f"No transcript found for any supported language. Add additional supported languages and verify whether the video has any transcripts.")
From c69278c13c366777806a6272d83bd0851992c340 Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Tue, 6 May 2025 16:24:27 +0200
Subject: [PATCH 11/15] youtube.py: reword language-normalization comment
---
backend/open_webui/retrieval/loaders/youtube.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py
index 88938d0f2..17b1fad60 100644
--- a/backend/open_webui/retrieval/loaders/youtube.py
+++ b/backend/open_webui/retrieval/loaders/youtube.py
@@ -64,7 +64,7 @@ class YoutubeLoader:
self._metadata = {"source": video_id}
self.language = language
self.proxy_url = proxy_url
- # If language is string, convert to list
+ # Ensure language is a list
if isinstance(language, str):
self.language = [language]
else:
From f65dc715f91ce94750934e457ac14dbebab084e9 Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Tue, 6 May 2025 16:30:18 +0200
Subject: [PATCH 12/15] youtube.py: reword no-transcript hint and include it in the warning log
---
backend/open_webui/retrieval/loaders/youtube.py | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py
index 17b1fad60..7fa0247da 100644
--- a/backend/open_webui/retrieval/loaders/youtube.py
+++ b/backend/open_webui/retrieval/loaders/youtube.py
@@ -83,7 +83,7 @@ class YoutubeLoader:
'Could not import "youtube_transcript_api" Python package. '
"Please install it with `pip install youtube-transcript-api`."
)
-
+
if self.proxy_url:
youtube_proxies = {
"http": self.proxy_url,
@@ -93,7 +93,7 @@ class YoutubeLoader:
log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
else:
youtube_proxies = None
-
+
try:
transcript_list = YouTubeTranscriptApi.list_transcripts(
self.video_id, proxies=youtube_proxies
@@ -101,11 +101,11 @@ class YoutubeLoader:
except Exception as e:
log.exception("Loading YouTube transcript failed")
return []
-
+
# Make a copy of the language list to avoid modifying the original
languages_to_try = list(self.language)
- # Add English as fallback, if not already in the list
+ # Add English as fallback if not already in the list
if "en" not in languages_to_try:
log.debug("Adding English as fallback language")
languages_to_try.append("en")
@@ -129,8 +129,8 @@ class YoutubeLoader:
except Exception as e:
log.info(f"Error finding transcript for language '{lang}'")
raise e
-
+
# If we get here, all languages failed
languages_tried = ", ".join(languages_to_try)
- log.warning(f"No transcript found for any of the specified languages: {languages_tried}")
- raise NoTranscriptFound(f"No transcript found for any supported language. Add additional supported languages and verify whether the video has any transcripts.")
+ log.warning(f"No transcript found for any of the specified languages: {languages_tried}. Verify if the video has transcripts, add more languages if needed.")
+ raise NoTranscriptFound(f"No transcript found for any supported language. Verify if the video has transcripts, add more languages if needed.")
From d7927506f12be656bcc1c452281c8f8733ea7baa Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Tue, 6 May 2025 17:06:21 +0200
Subject: [PATCH 13/15] youtube.py: append English fallback when the loader is constructed instead of in load()
---
.../open_webui/retrieval/loaders/youtube.py | 28 ++++++++-----------
1 file changed, 12 insertions(+), 16 deletions(-)
diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py
index 7fa0247da..1fa2b635c 100644
--- a/backend/open_webui/retrieval/loaders/youtube.py
+++ b/backend/open_webui/retrieval/loaders/youtube.py
@@ -62,13 +62,17 @@ class YoutubeLoader:
_video_id = _parse_video_id(video_id)
self.video_id = _video_id if _video_id is not None else video_id
self._metadata = {"source": video_id}
- self.language = language
self.proxy_url = proxy_url
+
# Ensure language is a list
if isinstance(language, str):
self.language = [language]
else:
- self.language = language
+ self.language = list(language) # Make a copy to avoid modifying the original
+
+ # Add English as fallback if not already in the list
+ if "en" not in self.language:
+ self.language.append("en")
def load(self) -> List[Document]:
"""Load YouTube transcripts into `Document` objects."""
@@ -83,7 +87,7 @@ class YoutubeLoader:
'Could not import "youtube_transcript_api" Python package. '
"Please install it with `pip install youtube-transcript-api`."
)
-
+
if self.proxy_url:
youtube_proxies = {
"http": self.proxy_url,
@@ -102,16 +106,8 @@ class YoutubeLoader:
log.exception("Loading YouTube transcript failed")
return []
- # Make a copy of the language list to avoid modifying the original
- languages_to_try = list(self.language)
-
- # Add English as fallback if not already in the list
- if "en" not in languages_to_try:
- log.debug("Adding English as fallback language")
- languages_to_try.append("en")
-
# Try each language in order of priority
- for lang in languages_to_try:
+ for lang in self.language:
try:
transcript = transcript_list.find_transcript([lang])
log.debug(f"Found transcript for language '{lang}'")
@@ -129,8 +125,8 @@ class YoutubeLoader:
except Exception as e:
log.info(f"Error finding transcript for language '{lang}'")
raise e
-
+
# If we get here, all languages failed
- languages_tried = ", ".join(languages_to_try)
- log.warning(f"No transcript found for any of the specified languages: {languages_tried}. Verify if the video has transcripts, add more languages if needed.")
- raise NoTranscriptFound(f"No transcript found for any supported language. Verify if the video has transcripts, add more languages if needed.")
+ languages_tried = ", ".join(self.language)
+ log.warning(f"No transcript found for any of the specified languages: {languages_tried}")
+ raise NoTranscriptFound(f"No transcript found for any supported language")
From 87dcbd198c3aed00e22c11dcc0e591f72126a057 Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Tue, 6 May 2025 17:11:03 +0200
Subject: [PATCH 14/15] youtube.py: drop inline comment on language list copy
---
backend/open_webui/retrieval/loaders/youtube.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py
index 1fa2b635c..70153f8cf 100644
--- a/backend/open_webui/retrieval/loaders/youtube.py
+++ b/backend/open_webui/retrieval/loaders/youtube.py
@@ -68,7 +68,7 @@ class YoutubeLoader:
if isinstance(language, str):
self.language = [language]
else:
- self.language = list(language) # Make a copy to avoid modifying the original
+ self.language = list(language)
# Add English as fallback if not already in the list
if "en" not in self.language:
From 1dcbec71ec054f79f570cd95da5a4031568a63fe Mon Sep 17 00:00:00 2001
From: Classic298 <27028174+Classic298@users.noreply.github.com>
Date: Tue, 6 May 2025 17:14:00 +0200
Subject: [PATCH 15/15] youtube.py: restore troubleshooting hint in no-transcript messages
---
backend/open_webui/retrieval/loaders/youtube.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py
index 70153f8cf..763d73094 100644
--- a/backend/open_webui/retrieval/loaders/youtube.py
+++ b/backend/open_webui/retrieval/loaders/youtube.py
@@ -128,5 +128,5 @@ class YoutubeLoader:
# If we get here, all languages failed
languages_tried = ", ".join(self.language)
- log.warning(f"No transcript found for any of the specified languages: {languages_tried}")
- raise NoTranscriptFound(f"No transcript found for any supported language")
+ log.warning(f"No transcript found for any of the specified languages: {languages_tried}. Verify if the video has transcripts, add more languages if needed.")
+ raise NoTranscriptFound(f"No transcript found for any supported language. Verify if the video has transcripts, add more languages if needed.")