mirror of
https://github.com/open-webui/open-webui
synced 2025-06-16 19:31:52 +00:00
Update youtube.py
This commit is contained in:
parent
7680ac2517
commit
0a845db8ec
@ -70,67 +70,67 @@ class YoutubeLoader:
|
|||||||
self.language = language
|
self.language = language
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load YouTube transcripts into `Document` objects."""
|
"""Load YouTube transcripts into `Document` objects."""
|
||||||
try:
|
|
||||||
from youtube_transcript_api import (
|
|
||||||
NoTranscriptFound,
|
|
||||||
TranscriptsDisabled,
|
|
||||||
YouTubeTranscriptApi,
|
|
||||||
)
|
|
||||||
except ImportError:
|
|
||||||
raise ImportError(
|
|
||||||
'Could not import "youtube_transcript_api" Python package. '
|
|
||||||
"Please install it with `pip install youtube-transcript-api`."
|
|
||||||
)
|
|
||||||
|
|
||||||
if self.proxy_url:
|
|
||||||
youtube_proxies = {
|
|
||||||
"http": self.proxy_url,
|
|
||||||
"https": self.proxy_url,
|
|
||||||
}
|
|
||||||
# Don't log complete URL because it might contain secrets
|
|
||||||
log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
|
|
||||||
else:
|
|
||||||
youtube_proxies = None
|
|
||||||
|
|
||||||
try:
|
|
||||||
transcript_list = YouTubeTranscriptApi.list_transcripts(
|
|
||||||
self.video_id, proxies=youtube_proxies
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
log.exception("Loading YouTube transcript failed")
|
|
||||||
return []
|
|
||||||
|
|
||||||
# Try each language in order of priority
|
|
||||||
last_exception = None
|
|
||||||
for lang in self.language:
|
|
||||||
try:
|
try:
|
||||||
log.debug(f"Attempting to find transcript for language '{lang}'")
|
from youtube_transcript_api import (
|
||||||
transcript = transcript_list.find_transcript([lang])
|
NoTranscriptFound,
|
||||||
log.info(f"Found transcript for language '{lang}'")
|
TranscriptsDisabled,
|
||||||
|
YouTubeTranscriptApi,
|
||||||
transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
|
)
|
||||||
transcript_text = " ".join(
|
except ImportError:
|
||||||
map(
|
raise ImportError(
|
||||||
lambda transcript_piece: transcript_piece.text.strip(" "),
|
'Could not import "youtube_transcript_api" Python package. '
|
||||||
transcript_pieces,
|
"Please install it with `pip install youtube-transcript-api`."
|
||||||
)
|
|
||||||
)
|
)
|
||||||
return [Document(page_content=transcript_text, metadata=self._metadata)]
|
|
||||||
except NoTranscriptFound as e:
|
|
||||||
log.debug(f"No transcript found for language '{lang}'")
|
|
||||||
last_exception = e
|
|
||||||
continue
|
|
||||||
except Exception as e:
|
|
||||||
# If we hit any other type of exception, log it and re-raise
|
|
||||||
log.exception(f"Error finding transcript for language '{lang}'")
|
|
||||||
raise e
|
|
||||||
|
|
||||||
# If all specified languages fail, raise the last exception
|
|
||||||
# This maintains compatibility with the error handling in the rest of the application
|
|
||||||
if last_exception:
|
|
||||||
log.warning(f"No transcript found for any of the specified languages: {', '.join(self.language)}")
|
|
||||||
raise last_exception
|
|
||||||
|
|
||||||
# This should never happen (we'd have raised an exception above)
|
if self.proxy_url:
|
||||||
return []
|
youtube_proxies = {
|
||||||
|
"http": self.proxy_url,
|
||||||
|
"https": self.proxy_url,
|
||||||
|
}
|
||||||
|
# Don't log complete URL because it might contain secrets
|
||||||
|
log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
|
||||||
|
else:
|
||||||
|
youtube_proxies = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
transcript_list = YouTubeTranscriptApi.list_transcripts(
|
||||||
|
self.video_id, proxies=youtube_proxies
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
log.exception("Loading YouTube transcript failed")
|
||||||
|
return []
|
||||||
|
|
||||||
|
# Try each language in order of priority
|
||||||
|
last_exception = None
|
||||||
|
for lang in self.language:
|
||||||
|
try:
|
||||||
|
log.debug(f"Attempting to find transcript for language '{lang}'")
|
||||||
|
transcript = transcript_list.find_transcript([lang])
|
||||||
|
log.info(f"Found transcript for language '{lang}'")
|
||||||
|
|
||||||
|
transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
|
||||||
|
transcript_text = " ".join(
|
||||||
|
map(
|
||||||
|
lambda transcript_piece: transcript_piece.text.strip(" "),
|
||||||
|
transcript_pieces,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return [Document(page_content=transcript_text, metadata=self._metadata)]
|
||||||
|
except NoTranscriptFound as e:
|
||||||
|
log.debug(f"No transcript found for language '{lang}'")
|
||||||
|
last_exception = e
|
||||||
|
continue
|
||||||
|
except Exception as e:
|
||||||
|
# If we hit any other type of exception, log it and re-raise
|
||||||
|
log.exception(f"Error finding transcript for language '{lang}'")
|
||||||
|
raise e
|
||||||
|
|
||||||
|
# If all specified languages fail, raise the last exception
|
||||||
|
# This maintains compatibility with the error handling in the rest of the application
|
||||||
|
if last_exception:
|
||||||
|
log.warning(f"No transcript found for any of the specified languages: {', '.join(self.language)}")
|
||||||
|
raise last_exception
|
||||||
|
|
||||||
|
# This should never happen (we'd have raised an exception above)
|
||||||
|
return []
|
||||||
|
Loading…
Reference in New Issue
Block a user