Merge pull request #13528 from Classic298/dev

feat: Enhance YouTube Transcription Loader for multi-language support
This commit is contained in:
Tim Jaeryang Baek 2025-05-07 00:44:45 +04:00 committed by GitHub
commit ea07e242f5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -62,12 +62,17 @@ class YoutubeLoader:
_video_id = _parse_video_id(video_id) _video_id = _parse_video_id(video_id)
self.video_id = _video_id if _video_id is not None else video_id self.video_id = _video_id if _video_id is not None else video_id
self._metadata = {"source": video_id} self._metadata = {"source": video_id}
self.language = language
self.proxy_url = proxy_url self.proxy_url = proxy_url
# Ensure language is a list
if isinstance(language, str): if isinstance(language, str):
self.language = [language] self.language = [language]
else: else:
self.language = language self.language = list(language)
# Add English as fallback if not already in the list
if "en" not in self.language:
self.language.append("en")
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load YouTube transcripts into `Document` objects.""" """Load YouTube transcripts into `Document` objects."""
@@ -101,17 +106,27 @@ class YoutubeLoader:
log.exception("Loading YouTube transcript failed") log.exception("Loading YouTube transcript failed")
return [] return []
try: # Try each language in order of priority
transcript = transcript_list.find_transcript(self.language) for lang in self.language:
except NoTranscriptFound: try:
transcript = transcript_list.find_transcript(["en"]) transcript = transcript_list.find_transcript([lang])
log.debug(f"Found transcript for language '{lang}'")
transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
transcript_text = " ".join(
map(
lambda transcript_piece: transcript_piece.text.strip(" "),
transcript_pieces,
)
)
return [Document(page_content=transcript_text, metadata=self._metadata)]
except NoTranscriptFound:
log.debug(f"No transcript found for language '{lang}'")
continue
except Exception as e:
log.info(f"Error finding transcript for language '{lang}'")
raise e
transcript_pieces: List[Dict[str, Any]] = transcript.fetch() # If we get here, all languages failed
languages_tried = ", ".join(self.language)
transcript = " ".join( log.warning(f"No transcript found for any of the specified languages: {languages_tried}. Verify if the video has transcripts, add more languages if needed.")
map( raise NoTranscriptFound(f"No transcript found for any supported language. Verify if the video has transcripts, add more languages if needed.")
lambda transcript_piece: transcript_piece.text.strip(" "),
transcript_pieces,
)
)
return [Document(page_content=transcript, metadata=self._metadata)]