Merge pull request #14781 from lucyknada/patch-2

fix: fix #14752 and add manual transcription retrieval
This commit is contained in:
Tim Jaeryang Baek 2025-06-08 18:40:28 +04:00 committed by GitHub
commit 50d9a2ac58
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -1,4 +1,5 @@
import logging
from xml.etree.ElementTree import ParseError
from typing import Any, Dict, Generator, List, Optional, Sequence, Union
from urllib.parse import parse_qs, urlparse
@@ -93,7 +94,6 @@ class YoutubeLoader:
"http": self.proxy_url,
"https": self.proxy_url,
}
# Don't log complete URL because it might contain secrets
log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
else:
youtube_proxies = None
@@ -110,11 +110,35 @@ class YoutubeLoader:
for lang in self.language:
try:
transcript = transcript_list.find_transcript([lang])
if transcript.is_generated:
log.debug(f"Found generated transcript for language '{lang}'")
try:
transcript = transcript_list.find_manually_created_transcript(
[lang]
)
log.debug(f"Found manual transcript for language '{lang}'")
except NoTranscriptFound:
log.debug(
f"No manual transcript found for language '{lang}', using generated"
)
pass
log.debug(f"Found transcript for language '{lang}'")
transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
try:
transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
except ParseError:
log.debug(f"Empty or invalid transcript for language '{lang}'")
continue
if not transcript_pieces:
log.debug(f"Empty transcript for language '{lang}'")
continue
transcript_text = " ".join(
map(
lambda transcript_piece: transcript_piece.text.strip(" "),
lambda transcript_piece: transcript_piece.text.strip(" ")
if hasattr(transcript_piece, "text")
else "",
transcript_pieces,
)
)
@@ -131,6 +155,4 @@ class YoutubeLoader:
log.warning(
f"No transcript found for any of the specified languages: {languages_tried}. Verify if the video has transcripts, add more languages if needed."
)
raise NoTranscriptFound(
f"No transcript found for any supported language. Verify if the video has transcripts, add more languages if needed."
)
raise NoTranscriptFound(self.video_id, self.language, list(transcript_list))