[feat] Allow use of proxy for downloading Youtube transscripts

This commit is contained in:
Thomas Rehn
2024-11-27 15:09:33 +01:00
parent 0a26c41c7b
commit 53296c1005
5 changed files with 54 additions and 4 deletions

View File

@@ -1,7 +1,12 @@
import logging
from typing import Any, Dict, Generator, List, Optional, Sequence, Union
from urllib.parse import parse_qs, urlparse
from langchain_core.documents import Document
from open_webui.env import SRC_LOG_LEVELS
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
ALLOWED_SCHEMES = {"http", "https"}
ALLOWED_NETLOCS = {
@@ -51,12 +56,14 @@ class YoutubeLoader:
self,
video_id: str,
language: Union[str, Sequence[str]] = "en",
proxy_url: Optional[str] = None,
):
"""Initialize with YouTube video ID."""
_video_id = _parse_video_id(video_id)
self.video_id = _video_id if _video_id is not None else video_id
self._metadata = {"source": video_id}
self.language = language
self.proxy_url = proxy_url
if isinstance(language, str):
self.language = [language]
else:
@@ -76,10 +83,20 @@ class YoutubeLoader:
"Please install it with `pip install youtube-transcript-api`."
)
if self.proxy_url:
youtube_proxies = {
'http': self.proxy_url,
'https': self.proxy_url,
}
# Don't log complete URL because it might contain secrets
log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
else:
youtube_proxies = None
try:
transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_id)
transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_id, proxies=youtube_proxies)
except Exception as e:
print(e)
log.exception("Loading YouTube transcript failed")
return []
try: