diff --git a/backend/open_webui/apps/retrieval/loaders/youtube.py b/backend/open_webui/apps/retrieval/loaders/youtube.py index ad1088be0..aa3be51f2 100644 --- a/backend/open_webui/apps/retrieval/loaders/youtube.py +++ b/backend/open_webui/apps/retrieval/loaders/youtube.py @@ -1,7 +1,12 @@ +import logging + from typing import Any, Dict, Generator, List, Optional, Sequence, Union from urllib.parse import parse_qs, urlparse from langchain_core.documents import Document +from open_webui.env import SRC_LOG_LEVELS +log = logging.getLogger(__name__) +log.setLevel(SRC_LOG_LEVELS["RAG"]) ALLOWED_SCHEMES = {"http", "https"} ALLOWED_NETLOCS = { @@ -51,12 +56,14 @@ class YoutubeLoader: self, video_id: str, language: Union[str, Sequence[str]] = "en", + proxy_url: Optional[str] = None, ): """Initialize with YouTube video ID.""" _video_id = _parse_video_id(video_id) self.video_id = _video_id if _video_id is not None else video_id self._metadata = {"source": video_id} self.language = language + self.proxy_url = proxy_url if isinstance(language, str): self.language = [language] else: @@ -76,10 +83,20 @@ class YoutubeLoader: "Please install it with `pip install youtube-transcript-api`." 
) + if self.proxy_url: + youtube_proxies = { + 'http': self.proxy_url, + 'https': self.proxy_url, + } + # Log only what follows any "user:password@" prefix so proxy credentials never reach the logs + log.debug("Using proxy: %s", self.proxy_url.rsplit("@", 1)[-1]) + else: + youtube_proxies = None + try: - transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_id) + transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_id, proxies=youtube_proxies) except Exception as e: - print(e) + log.exception("Loading YouTube transcript failed") return [] try: diff --git a/backend/open_webui/apps/retrieval/main.py b/backend/open_webui/apps/retrieval/main.py index 63bc18190..801760a65 100644 --- a/backend/open_webui/apps/retrieval/main.py +++ b/backend/open_webui/apps/retrieval/main.py @@ -105,6 +105,7 @@ from open_webui.config import ( TIKA_SERVER_URL, UPLOAD_DIR, YOUTUBE_LOADER_LANGUAGE, + YOUTUBE_LOADER_PROXY_URL, DEFAULT_LOCALE, AppConfig, ) @@ -171,6 +172,7 @@ app.state.config.OLLAMA_API_KEY = RAG_OLLAMA_API_KEY app.state.config.PDF_EXTRACT_IMAGES = PDF_EXTRACT_IMAGES app.state.config.YOUTUBE_LOADER_LANGUAGE = YOUTUBE_LOADER_LANGUAGE +app.state.config.YOUTUBE_LOADER_PROXY_URL = YOUTUBE_LOADER_PROXY_URL app.state.YOUTUBE_LOADER_TRANSLATION = None @@ -471,6 +473,7 @@ async def get_rag_config(user=Depends(get_admin_user)): "youtube": { "language": app.state.config.YOUTUBE_LOADER_LANGUAGE, "translation": app.state.YOUTUBE_LOADER_TRANSLATION, + "proxy_url": app.state.config.YOUTUBE_LOADER_PROXY_URL, }, "web": { "web_loader_ssl_verification": app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION, @@ -518,6 +521,7 @@ class ChunkParamUpdateForm(BaseModel): class YoutubeLoaderConfig(BaseModel): language: list[str] translation: Optional[str] = None + proxy_url: str = "" class WebSearchConfig(BaseModel): @@ -580,6 +584,7 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_ if form_data.youtube is not None: app.state.config.YOUTUBE_LOADER_LANGUAGE = 
form_data.youtube.language + app.state.config.YOUTUBE_LOADER_PROXY_URL = form_data.youtube.proxy_url app.state.YOUTUBE_LOADER_TRANSLATION = form_data.youtube.translation if form_data.web is not None: @@ -640,6 +645,7 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_ }, "youtube": { "language": app.state.config.YOUTUBE_LOADER_LANGUAGE, + "proxy_url": app.state.config.YOUTUBE_LOADER_PROXY_URL, "translation": app.state.YOUTUBE_LOADER_TRANSLATION, }, "web": { @@ -1081,7 +1087,9 @@ def process_youtube_video(form_data: ProcessUrlForm, user=Depends(get_verified_u collection_name = calculate_sha256_string(form_data.url)[:63] loader = YoutubeLoader( - form_data.url, language=app.state.config.YOUTUBE_LOADER_LANGUAGE + form_data.url, + language=app.state.config.YOUTUBE_LOADER_LANGUAGE, + proxy_url=app.state.config.YOUTUBE_LOADER_PROXY_URL, ) docs = loader.load() diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index 70f8b0287..00d80324d 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -1305,6 +1305,12 @@ YOUTUBE_LOADER_LANGUAGE = PersistentConfig( os.getenv("YOUTUBE_LOADER_LANGUAGE", "en").split(","), ) +YOUTUBE_LOADER_PROXY_URL = PersistentConfig( + "YOUTUBE_LOADER_PROXY_URL", + "rag.youtube_loader_proxy_url", + os.getenv("YOUTUBE_LOADER_PROXY_URL", ""), +) + ENABLE_RAG_WEB_SEARCH = PersistentConfig( "ENABLE_RAG_WEB_SEARCH", diff --git a/src/lib/apis/retrieval/index.ts b/src/lib/apis/retrieval/index.ts index 6c6b18b9f..21ae792fa 100644 --- a/src/lib/apis/retrieval/index.ts +++ b/src/lib/apis/retrieval/index.ts @@ -40,6 +40,7 @@ type ContentExtractConfigForm = { type YoutubeConfigForm = { language: string[]; translation?: string | null; + proxy_url: string; }; type RAGConfigForm = { diff --git a/src/lib/components/admin/Settings/WebSearch.svelte b/src/lib/components/admin/Settings/WebSearch.svelte index d8b1a33d1..a3ccbec1d 100644 --- a/src/lib/components/admin/Settings/WebSearch.svelte 
+++ b/src/lib/components/admin/Settings/WebSearch.svelte @@ -29,13 +29,15 @@ let youtubeLanguage = 'en'; let youtubeTranslation = null; + let youtubeProxyUrl = ''; const submitHandler = async () => { const res = await updateRAGConfig(localStorage.token, { web: webConfig, youtube: { language: youtubeLanguage.split(',').map((lang) => lang.trim()), - translation: youtubeTranslation + translation: youtubeTranslation, + proxy_url: youtubeProxyUrl } }); }; @@ -48,6 +50,7 @@ youtubeLanguage = res.youtube.language.join(','); youtubeTranslation = res.youtube.translation; + youtubeProxyUrl = res.youtube.proxy_url; } }); @@ -358,6 +361,21 @@ + +