Merge pull request #7422 from alpha-pet/feat-youtube-transscript-proxy

feat: Optional proxy setting for downloading YouTube transcripts
This commit is contained in:
Timothy Jaeryang Baek 2024-11-29 12:40:46 -08:00 committed by GitHub
commit 9f981db0b9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 54 additions and 4 deletions

View File

@ -1,7 +1,12 @@
import logging
from typing import Any, Dict, Generator, List, Optional, Sequence, Union
from urllib.parse import parse_qs, urlparse
from langchain_core.documents import Document
from open_webui.env import SRC_LOG_LEVELS
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
ALLOWED_SCHEMES = {"http", "https"}
ALLOWED_NETLOCS = {
@ -51,12 +56,14 @@ class YoutubeLoader:
self,
video_id: str,
language: Union[str, Sequence[str]] = "en",
proxy_url: Optional[str] = None,
):
"""Initialize with YouTube video ID."""
_video_id = _parse_video_id(video_id)
self.video_id = _video_id if _video_id is not None else video_id
self._metadata = {"source": video_id}
self.language = language
self.proxy_url = proxy_url
if isinstance(language, str):
self.language = [language]
else:
@ -76,10 +83,20 @@ class YoutubeLoader:
"Please install it with `pip install youtube-transcript-api`."
)
if self.proxy_url:
youtube_proxies = {
'http': self.proxy_url,
'https': self.proxy_url,
}
# Don't log complete URL because it might contain secrets
log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
else:
youtube_proxies = None
try:
transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_id)
transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_id, proxies=youtube_proxies)
except Exception as e:
print(e)
log.exception("Loading YouTube transcript failed")
return []
try:

View File

@ -105,6 +105,7 @@ from open_webui.config import (
TIKA_SERVER_URL,
UPLOAD_DIR,
YOUTUBE_LOADER_LANGUAGE,
YOUTUBE_LOADER_PROXY_URL,
DEFAULT_LOCALE,
AppConfig,
)
@ -171,6 +172,7 @@ app.state.config.OLLAMA_API_KEY = RAG_OLLAMA_API_KEY
app.state.config.PDF_EXTRACT_IMAGES = PDF_EXTRACT_IMAGES
app.state.config.YOUTUBE_LOADER_LANGUAGE = YOUTUBE_LOADER_LANGUAGE
app.state.config.YOUTUBE_LOADER_PROXY_URL = YOUTUBE_LOADER_PROXY_URL
app.state.YOUTUBE_LOADER_TRANSLATION = None
@ -471,6 +473,7 @@ async def get_rag_config(user=Depends(get_admin_user)):
"youtube": {
"language": app.state.config.YOUTUBE_LOADER_LANGUAGE,
"translation": app.state.YOUTUBE_LOADER_TRANSLATION,
"proxy_url": app.state.config.YOUTUBE_LOADER_PROXY_URL,
},
"web": {
"web_loader_ssl_verification": app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION,
@ -518,6 +521,7 @@ class ChunkParamUpdateForm(BaseModel):
# YouTube loader settings accepted by the RAG config update endpoint
# (presumably the type of the form's `youtube` field; confirm against ConfigUpdateForm).
class YoutubeLoaderConfig(BaseModel):
# Transcript language codes; written to YOUTUBE_LOADER_LANGUAGE on update.
language: list[str]
# Optional translation setting; written to YOUTUBE_LOADER_TRANSLATION on update.
translation: Optional[str] = None
# Proxy URL for transcript downloads; written to YOUTUBE_LOADER_PROXY_URL on update.
# The empty-string default is falsy, so the loader falls back to no proxy.
proxy_url: str = ""
class WebSearchConfig(BaseModel):
@ -580,6 +584,7 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
if form_data.youtube is not None:
app.state.config.YOUTUBE_LOADER_LANGUAGE = form_data.youtube.language
app.state.config.YOUTUBE_LOADER_PROXY_URL = form_data.youtube.proxy_url
app.state.YOUTUBE_LOADER_TRANSLATION = form_data.youtube.translation
if form_data.web is not None:
@ -640,6 +645,7 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
},
"youtube": {
"language": app.state.config.YOUTUBE_LOADER_LANGUAGE,
"proxy_url": app.state.config.YOUTUBE_LOADER_PROXY_URL,
"translation": app.state.YOUTUBE_LOADER_TRANSLATION,
},
"web": {
@ -1081,7 +1087,9 @@ def process_youtube_video(form_data: ProcessUrlForm, user=Depends(get_verified_u
collection_name = calculate_sha256_string(form_data.url)[:63]
loader = YoutubeLoader(
form_data.url, language=app.state.config.YOUTUBE_LOADER_LANGUAGE
form_data.url,
language=app.state.config.YOUTUBE_LOADER_LANGUAGE,
proxy_url=app.state.config.YOUTUBE_LOADER_PROXY_URL,
)
docs = loader.load()

View File

@ -1305,6 +1305,12 @@ YOUTUBE_LOADER_LANGUAGE = PersistentConfig(
os.getenv("YOUTUBE_LOADER_LANGUAGE", "en").split(","),
)
# Proxy URL used when downloading YouTube transcripts. The initial value comes
# from the YOUTUBE_LOADER_PROXY_URL environment variable and defaults to an
# empty string, which the loader treats as "no proxy".
# NOTE(review): the second argument looks like the persisted config path; confirm
# against the PersistentConfig definition.
YOUTUBE_LOADER_PROXY_URL = PersistentConfig(
"YOUTUBE_LOADER_PROXY_URL",
"rag.youtube_loader_proxy_url",
os.getenv("YOUTUBE_LOADER_PROXY_URL", ""),
)
ENABLE_RAG_WEB_SEARCH = PersistentConfig(
"ENABLE_RAG_WEB_SEARCH",

View File

@ -40,6 +40,7 @@ type ContentExtractConfigForm = {
// YouTube loader settings as exchanged with the RAG config API.
type YoutubeConfigForm = {
// Transcript language codes.
language: string[];
// Optional translation setting; may be omitted or null.
translation?: string | null;
// Proxy URL for transcript downloads; an empty string means no proxy is configured.
proxy_url: string;
};
type RAGConfigForm = {

View File

@ -29,13 +29,15 @@
let youtubeLanguage = 'en';
let youtubeTranslation = null;
let youtubeProxyUrl = '';
const submitHandler = async () => {
const res = await updateRAGConfig(localStorage.token, {
web: webConfig,
youtube: {
language: youtubeLanguage.split(',').map((lang) => lang.trim()),
translation: youtubeTranslation
translation: youtubeTranslation,
proxy_url: youtubeProxyUrl
}
});
};
@ -48,6 +50,7 @@
youtubeLanguage = res.youtube.language.join(',');
youtubeTranslation = res.youtube.translation;
youtubeProxyUrl = res.youtube.proxy_url;
}
});
</script>
@ -358,6 +361,21 @@
</div>
</div>
</div>
<div>
<div class=" py-0.5 flex w-full justify-between">
<div class=" w-20 text-xs font-medium self-center">{$i18n.t('Proxy URL')}</div>
<div class=" flex-1 self-center">
<input
class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
type="text"
placeholder={$i18n.t('Enter proxy URL (e.g. https://user:password@host:port)')}
bind:value={youtubeProxyUrl}
autocomplete="off"
/>
</div>
</div>
</div>
</div>
{/if}
</div>