Fixes #14752 and prefers manually created transcripts over auto-generated ones when available

This commit is contained in:
lucy 2025-06-08 14:26:24 +02:00 committed by GitHub
parent 53764fe648
commit b0965a8184
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,4 +1,5 @@
import logging import logging
from xml.etree.ElementTree import ParseError
from typing import Any, Dict, Generator, List, Optional, Sequence, Union from typing import Any, Dict, Generator, List, Optional, Sequence, Union
from urllib.parse import parse_qs, urlparse from urllib.parse import parse_qs, urlparse
@ -93,7 +94,6 @@ class YoutubeLoader:
"http": self.proxy_url, "http": self.proxy_url,
"https": self.proxy_url, "https": self.proxy_url,
} }
# Don't log complete URL because it might contain secrets
log.debug(f"Using proxy URL: {self.proxy_url[:14]}...") log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
else: else:
youtube_proxies = None youtube_proxies = None
@ -110,11 +110,35 @@ class YoutubeLoader:
for lang in self.language: for lang in self.language:
try: try:
transcript = transcript_list.find_transcript([lang]) transcript = transcript_list.find_transcript([lang])
if transcript.is_generated:
log.debug(f"Found generated transcript for language '{lang}'")
try:
transcript = transcript_list.find_manually_created_transcript(
[lang]
)
log.debug(f"Found manual transcript for language '{lang}'")
except NoTranscriptFound:
log.debug(
f"No manual transcript found for language '{lang}', using generated"
)
pass
log.debug(f"Found transcript for language '{lang}'") log.debug(f"Found transcript for language '{lang}'")
try:
transcript_pieces: List[Dict[str, Any]] = transcript.fetch() transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
except ParseError:
log.debug(f"Empty or invalid transcript for language '{lang}'")
continue
if not transcript_pieces:
log.debug(f"Empty transcript for language '{lang}'")
continue
transcript_text = " ".join( transcript_text = " ".join(
map( map(
lambda transcript_piece: transcript_piece.text.strip(" "), lambda transcript_piece: transcript_piece.text.strip(" ")
if hasattr(transcript_piece, "text")
else "",
transcript_pieces, transcript_pieces,
) )
) )
@ -131,6 +155,4 @@ class YoutubeLoader:
log.warning( log.warning(
f"No transcript found for any of the specified languages: {languages_tried}. Verify if the video has transcripts, add more languages if needed." f"No transcript found for any of the specified languages: {languages_tried}. Verify if the video has transcripts, add more languages if needed."
) )
raise NoTranscriptFound( raise NoTranscriptFound(self.video_id, self.language, list(transcript_list))
f"No transcript found for any supported language. Verify if the video has transcripts, add more languages if needed."
)