mirror of
https://github.com/open-webui/open-webui
synced 2025-06-26 18:26:48 +00:00
Merge pull request #13528 from Classic298/dev
feat: Enhance YouTube Transcription Loader for multi-language support
This commit is contained in:
commit
ea07e242f5
@ -62,12 +62,17 @@ class YoutubeLoader:
|
||||
_video_id = _parse_video_id(video_id)
|
||||
self.video_id = _video_id if _video_id is not None else video_id
|
||||
self._metadata = {"source": video_id}
|
||||
self.language = language
|
||||
self.proxy_url = proxy_url
|
||||
|
||||
# Ensure language is a list
|
||||
if isinstance(language, str):
|
||||
self.language = [language]
|
||||
else:
|
||||
self.language = language
|
||||
self.language = list(language)
|
||||
|
||||
# Add English as fallback if not already in the list
|
||||
if "en" not in self.language:
|
||||
self.language.append("en")
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load YouTube transcripts into `Document` objects."""
|
||||
@ -82,7 +87,7 @@ class YoutubeLoader:
|
||||
'Could not import "youtube_transcript_api" Python package. '
|
||||
"Please install it with `pip install youtube-transcript-api`."
|
||||
)
|
||||
|
||||
|
||||
if self.proxy_url:
|
||||
youtube_proxies = {
|
||||
"http": self.proxy_url,
|
||||
@ -92,7 +97,7 @@ class YoutubeLoader:
|
||||
log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
|
||||
else:
|
||||
youtube_proxies = None
|
||||
|
||||
|
||||
try:
|
||||
transcript_list = YouTubeTranscriptApi.list_transcripts(
|
||||
self.video_id, proxies=youtube_proxies
|
||||
@ -100,18 +105,28 @@ class YoutubeLoader:
|
||||
except Exception as e:
|
||||
log.exception("Loading YouTube transcript failed")
|
||||
return []
|
||||
|
||||
try:
|
||||
transcript = transcript_list.find_transcript(self.language)
|
||||
except NoTranscriptFound:
|
||||
transcript = transcript_list.find_transcript(["en"])
|
||||
|
||||
transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
|
||||
|
||||
transcript = " ".join(
|
||||
map(
|
||||
lambda transcript_piece: transcript_piece.text.strip(" "),
|
||||
transcript_pieces,
|
||||
)
|
||||
)
|
||||
return [Document(page_content=transcript, metadata=self._metadata)]
|
||||
|
||||
# Try each language in order of priority
|
||||
for lang in self.language:
|
||||
try:
|
||||
transcript = transcript_list.find_transcript([lang])
|
||||
log.debug(f"Found transcript for language '{lang}'")
|
||||
transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
|
||||
transcript_text = " ".join(
|
||||
map(
|
||||
lambda transcript_piece: transcript_piece.text.strip(" "),
|
||||
transcript_pieces,
|
||||
)
|
||||
)
|
||||
return [Document(page_content=transcript_text, metadata=self._metadata)]
|
||||
except NoTranscriptFound:
|
||||
log.debug(f"No transcript found for language '{lang}'")
|
||||
continue
|
||||
except Exception as e:
|
||||
log.info(f"Error finding transcript for language '{lang}'")
|
||||
raise e
|
||||
|
||||
# Reaching this point means every requested language raised NoTranscriptFound (any other error was re-raised inside the loop)
|
||||
languages_tried = ", ".join(self.language)
|
||||
log.warning(f"No transcript found for any of the specified languages: {languages_tried}. Verify if the video has transcripts, add more languages if needed.")
|
||||
raise NoTranscriptFound(f"No transcript found for any supported language. Verify if the video has transcripts, add more languages if needed.")
|
||||
|
Loading…
Reference in New Issue
Block a user