Merge pull request #14781 from lucyknada/patch-2

fix: fix #14752 and add manual transcription retrieval
This commit is contained in:
Tim Jaeryang Baek 2025-06-08 18:40:28 +04:00 committed by GitHub
commit 50d9a2ac58
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,4 +1,5 @@
import logging import logging
from xml.etree.ElementTree import ParseError
from typing import Any, Dict, Generator, List, Optional, Sequence, Union from typing import Any, Dict, Generator, List, Optional, Sequence, Union
from urllib.parse import parse_qs, urlparse from urllib.parse import parse_qs, urlparse
@ -93,7 +94,6 @@ class YoutubeLoader:
"http": self.proxy_url, "http": self.proxy_url,
"https": self.proxy_url, "https": self.proxy_url,
} }
# Don't log complete URL because it might contain secrets
log.debug(f"Using proxy URL: {self.proxy_url[:14]}...") log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
else: else:
youtube_proxies = None youtube_proxies = None
@ -110,11 +110,35 @@ class YoutubeLoader:
for lang in self.language: for lang in self.language:
try: try:
transcript = transcript_list.find_transcript([lang]) transcript = transcript_list.find_transcript([lang])
if transcript.is_generated:
log.debug(f"Found generated transcript for language '{lang}'")
try:
transcript = transcript_list.find_manually_created_transcript(
[lang]
)
log.debug(f"Found manual transcript for language '{lang}'")
except NoTranscriptFound:
log.debug(
f"No manual transcript found for language '{lang}', using generated"
)
pass
log.debug(f"Found transcript for language '{lang}'") log.debug(f"Found transcript for language '{lang}'")
try:
transcript_pieces: List[Dict[str, Any]] = transcript.fetch() transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
except ParseError:
log.debug(f"Empty or invalid transcript for language '{lang}'")
continue
if not transcript_pieces:
log.debug(f"Empty transcript for language '{lang}'")
continue
transcript_text = " ".join( transcript_text = " ".join(
map( map(
lambda transcript_piece: transcript_piece.text.strip(" "), lambda transcript_piece: transcript_piece.text.strip(" ")
if hasattr(transcript_piece, "text")
else "",
transcript_pieces, transcript_pieces,
) )
) )
@ -131,6 +155,4 @@ class YoutubeLoader:
log.warning( log.warning(
f"No transcript found for any of the specified languages: {languages_tried}. Verify if the video has transcripts, add more languages if needed." f"No transcript found for any of the specified languages: {languages_tried}. Verify if the video has transcripts, add more languages if needed."
) )
raise NoTranscriptFound( raise NoTranscriptFound(self.video_id, self.language, list(transcript_list))
f"No transcript found for any supported language. Verify if the video has transcripts, add more languages if needed."
)