fix: improve YouTube transcript handling for manual captions

This commit is contained in:
Juanan Pereira 2025-01-09 09:36:24 +01:00
parent 49f462163a
commit f0f7a56c48
6 changed files with 40 additions and 19 deletions

View File

@ -137,13 +137,30 @@ class YoutubeLoader:
return [] return []
try: try:
# First try to get transcript in requested language # First try to get manual transcript in requested language
transcript = transcript_list.find_transcript(self.language) for lang in self.language:
except NoTranscriptFound: try:
# Fallback: try to get any available transcript available_transcripts = (
available_transcripts = list( transcript_list._manually_created_transcripts
transcript_list._generated_transcripts.values()
) )
if lang in available_transcripts:
transcript = available_transcripts[lang]
log.info(f"Found manual transcript in language: {lang}")
break
except NoTranscriptFound:
continue
else:
# If no manual transcript found, try auto-generated ones
try:
transcript = transcript_list.find_transcript(self.language)
log.info(
f"Using auto-generated transcript in language: {transcript.language_code}"
)
except NoTranscriptFound:
# Final fallback: try to get any available transcript
available_transcripts = list(
transcript_list._manually_created_transcripts.values()
) + list(transcript_list._generated_transcripts.values())
if available_transcripts: if available_transcripts:
transcript = available_transcripts[0] transcript = available_transcripts[0]
log.info( log.info(
@ -153,6 +170,10 @@ class YoutubeLoader:
log.error("No transcripts found for video") log.error("No transcripts found for video")
return [] return []
except Exception as e:
log.exception(f"Error fetching transcript: {str(e)}")
return []
transcript_pieces: List[Dict[str, Any]] = transcript.fetch() transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
# Get video title and add it to base metadata # Get video title and add it to base metadata