[feat] Allow use of proxy for downloading YouTube transcripts

This commit is contained in:
Thomas Rehn 2024-11-27 15:09:33 +01:00
parent 0a26c41c7b
commit 53296c1005
5 changed files with 54 additions and 4 deletions

View File

@ -1,7 +1,12 @@
import logging
from typing import Any, Dict, Generator, List, Optional, Sequence, Union from typing import Any, Dict, Generator, List, Optional, Sequence, Union
from urllib.parse import parse_qs, urlparse from urllib.parse import parse_qs, urlparse
from langchain_core.documents import Document from langchain_core.documents import Document
from open_webui.env import SRC_LOG_LEVELS
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
ALLOWED_SCHEMES = {"http", "https"} ALLOWED_SCHEMES = {"http", "https"}
ALLOWED_NETLOCS = { ALLOWED_NETLOCS = {
@ -51,12 +56,14 @@ class YoutubeLoader:
self, self,
video_id: str, video_id: str,
language: Union[str, Sequence[str]] = "en", language: Union[str, Sequence[str]] = "en",
proxy_url: Optional[str] = None,
): ):
"""Initialize with YouTube video ID.""" """Initialize with YouTube video ID."""
_video_id = _parse_video_id(video_id) _video_id = _parse_video_id(video_id)
self.video_id = _video_id if _video_id is not None else video_id self.video_id = _video_id if _video_id is not None else video_id
self._metadata = {"source": video_id} self._metadata = {"source": video_id}
self.language = language self.language = language
self.proxy_url = proxy_url
if isinstance(language, str): if isinstance(language, str):
self.language = [language] self.language = [language]
else: else:
@ -76,10 +83,20 @@ class YoutubeLoader:
"Please install it with `pip install youtube-transcript-api`." "Please install it with `pip install youtube-transcript-api`."
) )
if self.proxy_url:
youtube_proxies = {
'http': self.proxy_url,
'https': self.proxy_url,
}
# Don't log complete URL because it might contain secrets
log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
else:
youtube_proxies = None
try: try:
transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_id) transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_id, proxies=youtube_proxies)
except Exception as e: except Exception as e:
print(e) log.exception("Loading YouTube transcript failed")
return [] return []
try: try:

View File

@ -105,6 +105,7 @@ from open_webui.config import (
TIKA_SERVER_URL, TIKA_SERVER_URL,
UPLOAD_DIR, UPLOAD_DIR,
YOUTUBE_LOADER_LANGUAGE, YOUTUBE_LOADER_LANGUAGE,
YOUTUBE_LOADER_PROXY_URL,
DEFAULT_LOCALE, DEFAULT_LOCALE,
AppConfig, AppConfig,
) )
@ -171,6 +172,7 @@ app.state.config.OLLAMA_API_KEY = RAG_OLLAMA_API_KEY
app.state.config.PDF_EXTRACT_IMAGES = PDF_EXTRACT_IMAGES app.state.config.PDF_EXTRACT_IMAGES = PDF_EXTRACT_IMAGES
app.state.config.YOUTUBE_LOADER_LANGUAGE = YOUTUBE_LOADER_LANGUAGE app.state.config.YOUTUBE_LOADER_LANGUAGE = YOUTUBE_LOADER_LANGUAGE
app.state.config.YOUTUBE_LOADER_PROXY_URL = YOUTUBE_LOADER_PROXY_URL
app.state.YOUTUBE_LOADER_TRANSLATION = None app.state.YOUTUBE_LOADER_TRANSLATION = None
@ -471,6 +473,7 @@ async def get_rag_config(user=Depends(get_admin_user)):
"youtube": { "youtube": {
"language": app.state.config.YOUTUBE_LOADER_LANGUAGE, "language": app.state.config.YOUTUBE_LOADER_LANGUAGE,
"translation": app.state.YOUTUBE_LOADER_TRANSLATION, "translation": app.state.YOUTUBE_LOADER_TRANSLATION,
"proxy_url": app.state.config.YOUTUBE_LOADER_PROXY_URL,
}, },
"web": { "web": {
"web_loader_ssl_verification": app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION, "web_loader_ssl_verification": app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION,
@ -518,6 +521,7 @@ class ChunkParamUpdateForm(BaseModel):
class YoutubeLoaderConfig(BaseModel): class YoutubeLoaderConfig(BaseModel):
language: list[str] language: list[str]
translation: Optional[str] = None translation: Optional[str] = None
proxy_url: str = ""
class WebSearchConfig(BaseModel): class WebSearchConfig(BaseModel):
@ -580,6 +584,7 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
if form_data.youtube is not None: if form_data.youtube is not None:
app.state.config.YOUTUBE_LOADER_LANGUAGE = form_data.youtube.language app.state.config.YOUTUBE_LOADER_LANGUAGE = form_data.youtube.language
app.state.config.YOUTUBE_LOADER_PROXY_URL = form_data.youtube.proxy_url
app.state.YOUTUBE_LOADER_TRANSLATION = form_data.youtube.translation app.state.YOUTUBE_LOADER_TRANSLATION = form_data.youtube.translation
if form_data.web is not None: if form_data.web is not None:
@ -640,6 +645,7 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
}, },
"youtube": { "youtube": {
"language": app.state.config.YOUTUBE_LOADER_LANGUAGE, "language": app.state.config.YOUTUBE_LOADER_LANGUAGE,
"proxy_url": app.state.config.YOUTUBE_LOADER_PROXY_URL,
"translation": app.state.YOUTUBE_LOADER_TRANSLATION, "translation": app.state.YOUTUBE_LOADER_TRANSLATION,
}, },
"web": { "web": {
@ -1081,7 +1087,9 @@ def process_youtube_video(form_data: ProcessUrlForm, user=Depends(get_verified_u
collection_name = calculate_sha256_string(form_data.url)[:63] collection_name = calculate_sha256_string(form_data.url)[:63]
loader = YoutubeLoader( loader = YoutubeLoader(
form_data.url, language=app.state.config.YOUTUBE_LOADER_LANGUAGE form_data.url,
language=app.state.config.YOUTUBE_LOADER_LANGUAGE,
proxy_url=app.state.config.YOUTUBE_LOADER_PROXY_URL,
) )
docs = loader.load() docs = loader.load()

View File

@ -1259,6 +1259,12 @@ YOUTUBE_LOADER_LANGUAGE = PersistentConfig(
os.getenv("YOUTUBE_LOADER_LANGUAGE", "en").split(","), os.getenv("YOUTUBE_LOADER_LANGUAGE", "en").split(","),
) )
YOUTUBE_LOADER_PROXY_URL = PersistentConfig(
"YOUTUBE_LOADER_PROXY_URL",
"rag.youtube_loader_proxy_url",
os.getenv("YOUTUBE_LOADER_PROXY_URL", ""),
)
ENABLE_RAG_WEB_SEARCH = PersistentConfig( ENABLE_RAG_WEB_SEARCH = PersistentConfig(
"ENABLE_RAG_WEB_SEARCH", "ENABLE_RAG_WEB_SEARCH",

View File

@ -40,6 +40,7 @@ type ContentExtractConfigForm = {
type YoutubeConfigForm = { type YoutubeConfigForm = {
language: string[]; language: string[];
translation?: string | null; translation?: string | null;
proxy_url: string;
}; };
type RAGConfigForm = { type RAGConfigForm = {

View File

@ -29,13 +29,15 @@
let youtubeLanguage = 'en'; let youtubeLanguage = 'en';
let youtubeTranslation = null; let youtubeTranslation = null;
let youtubeProxyUrl = '';
const submitHandler = async () => { const submitHandler = async () => {
const res = await updateRAGConfig(localStorage.token, { const res = await updateRAGConfig(localStorage.token, {
web: webConfig, web: webConfig,
youtube: { youtube: {
language: youtubeLanguage.split(',').map((lang) => lang.trim()), language: youtubeLanguage.split(',').map((lang) => lang.trim()),
translation: youtubeTranslation translation: youtubeTranslation,
proxy_url: youtubeProxyUrl
} }
}); });
}; };
@ -48,6 +50,7 @@
youtubeLanguage = res.youtube.language.join(','); youtubeLanguage = res.youtube.language.join(',');
youtubeTranslation = res.youtube.translation; youtubeTranslation = res.youtube.translation;
youtubeProxyUrl = res.youtube.proxy_url;
} }
}); });
</script> </script>
@ -358,6 +361,21 @@
</div> </div>
</div> </div>
</div> </div>
<div>
<div class=" py-0.5 flex w-full justify-between">
<div class=" w-20 text-xs font-medium self-center">{$i18n.t('Proxy URL')}</div>
<div class=" flex-1 self-center">
<input
class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
type="text"
placeholder={$i18n.t('Enter proxy URL (e.g. https://user:password@host:port)')}
bind:value={youtubeProxyUrl}
autocomplete="off"
/>
</div>
</div>
</div>
</div> </div>
{/if} {/if}
</div> </div>