Update youtube.py: fall back to the English transcript when none of the requested languages is available

This commit is contained in:
Classic298 2025-05-05 20:03:00 +02:00 committed by GitHub
parent 0a3817ed86
commit 1a30b3746e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -88,7 +88,8 @@ class YoutubeLoader:
"http": self.proxy_url, "http": self.proxy_url,
"https": self.proxy_url, "https": self.proxy_url,
} }
log.debug(f"Using proxy URL: {self.proxy_url}...") # Don't log complete URL because it might contain secrets
log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
else: else:
youtube_proxies = None youtube_proxies = None
@ -101,12 +102,10 @@ class YoutubeLoader:
return [] return []
# Try each language in order of priority # Try each language in order of priority
last_exception = None
for lang in self.language: for lang in self.language:
try: try:
transcript = transcript_list.find_transcript([lang]) transcript = transcript_list.find_transcript([lang])
log.debug(f"Found transcript for language '{lang}'") log.debug(f"Found transcript for language '{lang}'")
transcript_pieces: List[Dict[str, Any]] = transcript.fetch() transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
transcript_text = " ".join( transcript_text = " ".join(
map( map(
@ -115,18 +114,37 @@ class YoutubeLoader:
) )
) )
return [Document(page_content=transcript_text, metadata=self._metadata)] return [Document(page_content=transcript_text, metadata=self._metadata)]
except NoTranscriptFound as e: except NoTranscriptFound:
log.debug(f"No transcript found for language '{lang}'") log.debug(f"No transcript found for language '{lang}'")
last_exception = e
continue continue
except Exception as e: except Exception as e:
# If we hit any other type of exception, log it and re-raise # If we hit any other type of exception, log it and re-raise
log.exception(f"Error finding transcript for language '{lang}'") log.exception(f"Error finding transcript for language '{lang}'")
raise e raise e
# If all specified languages fail, raise the last exception # If all specified languages fail, fall back to English (unless English was already tried)
if last_exception: if "en" not in self.language:
log.warning(f"No transcript found for any of the specified languages: {', '.join(self.language)}") try:
raise last_exception log.debug("Falling back to English transcript")
transcript = transcript_list.find_transcript(["en"])
transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
transcript_text = " ".join(
map(
lambda transcript_piece: transcript_piece.text.strip(" "),
transcript_pieces,
)
)
return [Document(page_content=transcript_text, metadata=self._metadata)]
except NoTranscriptFound:
log.warning("No English transcript found as fallback")
except Exception as e:
log.exception("Error finding English transcript fallback")
raise e
return [] # If we get here, all languages failed including the English fallback
languages_tried = ", ".join(self.language)
if "en" not in self.language:
languages_tried += ", en (fallback)"
log.warning(f"No transcript found for any of the specified languages: {languages_tried}")
raise NoTranscriptFound(f"No transcript found for any supported language")