Update youtube.py: move the English fallback language into __init__ and shorten the no-transcript messages

This commit is contained in:
Classic298 2025-05-06 17:06:21 +02:00 committed by GitHub
parent f65dc715f9
commit d7927506f1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -62,13 +62,17 @@ class YoutubeLoader:
_video_id = _parse_video_id(video_id) _video_id = _parse_video_id(video_id)
self.video_id = _video_id if _video_id is not None else video_id self.video_id = _video_id if _video_id is not None else video_id
self._metadata = {"source": video_id} self._metadata = {"source": video_id}
self.language = language
self.proxy_url = proxy_url self.proxy_url = proxy_url
# Ensure language is a list # Ensure language is a list
if isinstance(language, str): if isinstance(language, str):
self.language = [language] self.language = [language]
else: else:
self.language = language self.language = list(language) # Make a copy to avoid modifying the original
# Add English as fallback if not already in the list
if "en" not in self.language:
self.language.append("en")
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load YouTube transcripts into `Document` objects.""" """Load YouTube transcripts into `Document` objects."""
@ -102,16 +106,8 @@ class YoutubeLoader:
log.exception("Loading YouTube transcript failed") log.exception("Loading YouTube transcript failed")
return [] return []
# Make a copy of the language list to avoid modifying the original
languages_to_try = list(self.language)
# Add English as fallback if not already in the list
if "en" not in languages_to_try:
log.debug("Adding English as fallback language")
languages_to_try.append("en")
# Try each language in order of priority # Try each language in order of priority
for lang in languages_to_try: for lang in self.language:
try: try:
transcript = transcript_list.find_transcript([lang]) transcript = transcript_list.find_transcript([lang])
log.debug(f"Found transcript for language '{lang}'") log.debug(f"Found transcript for language '{lang}'")
@ -131,6 +127,6 @@ class YoutubeLoader:
raise e raise e
# If we get here, all languages failed # If we get here, all languages failed
languages_tried = ", ".join(languages_to_try) languages_tried = ", ".join(self.language)
log.warning(f"No transcript found for any of the specified languages: {languages_tried}. Verify if the video has transcripts, add more languages if needed.") log.warning(f"No transcript found for any of the specified languages: {languages_tried}")
raise NoTranscriptFound(f"No transcript found for any supported language. Verify if the video has transcripts, add more languages if needed.") raise NoTranscriptFound(f"No transcript found for any supported language")