Merge pull request #13528 from Classic298/dev

feat: Enhance YouTube Transcription Loader for multi-language support
This commit is contained in:
Tim Jaeryang Baek 2025-05-07 00:44:45 +04:00 committed by GitHub
commit ea07e242f5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -62,12 +62,17 @@ class YoutubeLoader:
_video_id = _parse_video_id(video_id) _video_id = _parse_video_id(video_id)
self.video_id = _video_id if _video_id is not None else video_id self.video_id = _video_id if _video_id is not None else video_id
self._metadata = {"source": video_id} self._metadata = {"source": video_id}
self.language = language
self.proxy_url = proxy_url self.proxy_url = proxy_url
# Ensure language is a list
if isinstance(language, str): if isinstance(language, str):
self.language = [language] self.language = [language]
else: else:
self.language = language self.language = list(language)
# Add English as fallback if not already in the list
if "en" not in self.language:
self.language.append("en")
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load YouTube transcripts into `Document` objects.""" """Load YouTube transcripts into `Document` objects."""
@ -82,7 +87,7 @@ class YoutubeLoader:
'Could not import "youtube_transcript_api" Python package. ' 'Could not import "youtube_transcript_api" Python package. '
"Please install it with `pip install youtube-transcript-api`." "Please install it with `pip install youtube-transcript-api`."
) )
if self.proxy_url: if self.proxy_url:
youtube_proxies = { youtube_proxies = {
"http": self.proxy_url, "http": self.proxy_url,
@ -92,7 +97,7 @@ class YoutubeLoader:
log.debug(f"Using proxy URL: {self.proxy_url[:14]}...") log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
else: else:
youtube_proxies = None youtube_proxies = None
try: try:
transcript_list = YouTubeTranscriptApi.list_transcripts( transcript_list = YouTubeTranscriptApi.list_transcripts(
self.video_id, proxies=youtube_proxies self.video_id, proxies=youtube_proxies
@ -100,18 +105,28 @@ class YoutubeLoader:
except Exception as e: except Exception as e:
log.exception("Loading YouTube transcript failed") log.exception("Loading YouTube transcript failed")
return [] return []
try: # Try each language in order of priority
transcript = transcript_list.find_transcript(self.language) for lang in self.language:
except NoTranscriptFound: try:
transcript = transcript_list.find_transcript(["en"]) transcript = transcript_list.find_transcript([lang])
log.debug(f"Found transcript for language '{lang}'")
transcript_pieces: List[Dict[str, Any]] = transcript.fetch() transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
transcript_text = " ".join(
transcript = " ".join( map(
map( lambda transcript_piece: transcript_piece.text.strip(" "),
lambda transcript_piece: transcript_piece.text.strip(" "), transcript_pieces,
transcript_pieces, )
) )
) return [Document(page_content=transcript_text, metadata=self._metadata)]
return [Document(page_content=transcript, metadata=self._metadata)] except NoTranscriptFound:
log.debug(f"No transcript found for language '{lang}'")
continue
except Exception as e:
log.info(f"Error finding transcript for language '{lang}'")
raise e
# If we get here, all languages failed
languages_tried = ", ".join(self.language)
log.warning(f"No transcript found for any of the specified languages: {languages_tried}. Verify if the video has transcripts, add more languages if needed.")
raise NoTranscriptFound(f"No transcript found for any supported language. Verify if the video has transcripts, add more languages if needed.")