Update youtube.py

This commit is contained in:
Classic298 2025-05-05 19:57:21 +02:00 committed by GitHub
parent 7680ac2517
commit 0a845db8ec
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -70,67 +70,67 @@ class YoutubeLoader:
self.language = language
def load(self) -> List[Document]:
"""Load YouTube transcripts into `Document` objects."""
try:
from youtube_transcript_api import (
NoTranscriptFound,
TranscriptsDisabled,
YouTubeTranscriptApi,
)
except ImportError:
raise ImportError(
'Could not import "youtube_transcript_api" Python package. '
"Please install it with `pip install youtube-transcript-api`."
)
if self.proxy_url:
youtube_proxies = {
"http": self.proxy_url,
"https": self.proxy_url,
}
# Don't log complete URL because it might contain secrets
log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
else:
youtube_proxies = None
try:
transcript_list = YouTubeTranscriptApi.list_transcripts(
self.video_id, proxies=youtube_proxies
)
except Exception as e:
log.exception("Loading YouTube transcript failed")
return []
# Try each language in order of priority
last_exception = None
for lang in self.language:
"""Load YouTube transcripts into `Document` objects."""
try:
log.debug(f"Attempting to find transcript for language '{lang}'")
transcript = transcript_list.find_transcript([lang])
log.info(f"Found transcript for language '{lang}'")
transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
transcript_text = " ".join(
map(
lambda transcript_piece: transcript_piece.text.strip(" "),
transcript_pieces,
)
from youtube_transcript_api import (
NoTranscriptFound,
TranscriptsDisabled,
YouTubeTranscriptApi,
)
except ImportError:
raise ImportError(
'Could not import "youtube_transcript_api" Python package. '
"Please install it with `pip install youtube-transcript-api`."
)
return [Document(page_content=transcript_text, metadata=self._metadata)]
except NoTranscriptFound as e:
log.debug(f"No transcript found for language '{lang}'")
last_exception = e
continue
except Exception as e:
# If we hit any other type of exception, log it and re-raise
log.exception(f"Error finding transcript for language '{lang}'")
raise e
# If all specified languages fail, raise the last exception
# This maintains compatibility with the error handling in the rest of the application
if last_exception:
log.warning(f"No transcript found for any of the specified languages: {', '.join(self.language)}")
raise last_exception
# This should never happen (we'd have raised an exception above)
return []
if self.proxy_url:
youtube_proxies = {
"http": self.proxy_url,
"https": self.proxy_url,
}
# Don't log complete URL because it might contain secrets
log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
else:
youtube_proxies = None
try:
transcript_list = YouTubeTranscriptApi.list_transcripts(
self.video_id, proxies=youtube_proxies
)
except Exception as e:
log.exception("Loading YouTube transcript failed")
return []
# Try each language in order of priority
last_exception = None
for lang in self.language:
try:
log.debug(f"Attempting to find transcript for language '{lang}'")
transcript = transcript_list.find_transcript([lang])
log.info(f"Found transcript for language '{lang}'")
transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
transcript_text = " ".join(
map(
lambda transcript_piece: transcript_piece.text.strip(" "),
transcript_pieces,
)
)
return [Document(page_content=transcript_text, metadata=self._metadata)]
except NoTranscriptFound as e:
log.debug(f"No transcript found for language '{lang}'")
last_exception = e
continue
except Exception as e:
# If we hit any other type of exception, log it and re-raise
log.exception(f"Error finding transcript for language '{lang}'")
raise e
# If all specified languages fail, raise the last exception
# This maintains compatibility with the error handling in the rest of the application
if last_exception:
log.warning(f"No transcript found for any of the specified languages: {', '.join(self.language)}")
raise last_exception
# This should never happen (we'd have raised an exception above)
return []