mirror of
https://github.com/open-webui/open-webui
synced 2024-12-29 15:25:29 +00:00
[feat] Allow use of proxy for downloading YouTube transcripts
This commit is contained in:
parent
0a26c41c7b
commit
53296c1005
@ -1,7 +1,12 @@
|
|||||||
|
import logging
|
||||||
|
|
||||||
from typing import Any, Dict, Generator, List, Optional, Sequence, Union
|
from typing import Any, Dict, Generator, List, Optional, Sequence, Union
|
||||||
from urllib.parse import parse_qs, urlparse
|
from urllib.parse import parse_qs, urlparse
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
|
from open_webui.env import SRC_LOG_LEVELS
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
log.setLevel(SRC_LOG_LEVELS["RAG"])
|
||||||
|
|
||||||
ALLOWED_SCHEMES = {"http", "https"}
|
ALLOWED_SCHEMES = {"http", "https"}
|
||||||
ALLOWED_NETLOCS = {
|
ALLOWED_NETLOCS = {
|
||||||
@ -51,12 +56,14 @@ class YoutubeLoader:
|
|||||||
self,
|
self,
|
||||||
video_id: str,
|
video_id: str,
|
||||||
language: Union[str, Sequence[str]] = "en",
|
language: Union[str, Sequence[str]] = "en",
|
||||||
|
proxy_url: Optional[str] = None,
|
||||||
):
|
):
|
||||||
"""Initialize with YouTube video ID."""
|
"""Initialize with YouTube video ID."""
|
||||||
_video_id = _parse_video_id(video_id)
|
_video_id = _parse_video_id(video_id)
|
||||||
self.video_id = _video_id if _video_id is not None else video_id
|
self.video_id = _video_id if _video_id is not None else video_id
|
||||||
self._metadata = {"source": video_id}
|
self._metadata = {"source": video_id}
|
||||||
self.language = language
|
self.language = language
|
||||||
|
self.proxy_url = proxy_url
|
||||||
if isinstance(language, str):
|
if isinstance(language, str):
|
||||||
self.language = [language]
|
self.language = [language]
|
||||||
else:
|
else:
|
||||||
@ -76,10 +83,20 @@ class YoutubeLoader:
|
|||||||
"Please install it with `pip install youtube-transcript-api`."
|
"Please install it with `pip install youtube-transcript-api`."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if self.proxy_url:
|
||||||
|
youtube_proxies = {
|
||||||
|
'http': self.proxy_url,
|
||||||
|
'https': self.proxy_url,
|
||||||
|
}
|
||||||
|
# Don't log complete URL because it might contain secrets
|
||||||
|
log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
|
||||||
|
else:
|
||||||
|
youtube_proxies = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_id)
|
transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_id, proxies=youtube_proxies)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
log.exception("Loading YouTube transcript failed")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
@ -105,6 +105,7 @@ from open_webui.config import (
|
|||||||
TIKA_SERVER_URL,
|
TIKA_SERVER_URL,
|
||||||
UPLOAD_DIR,
|
UPLOAD_DIR,
|
||||||
YOUTUBE_LOADER_LANGUAGE,
|
YOUTUBE_LOADER_LANGUAGE,
|
||||||
|
YOUTUBE_LOADER_PROXY_URL,
|
||||||
DEFAULT_LOCALE,
|
DEFAULT_LOCALE,
|
||||||
AppConfig,
|
AppConfig,
|
||||||
)
|
)
|
||||||
@ -171,6 +172,7 @@ app.state.config.OLLAMA_API_KEY = RAG_OLLAMA_API_KEY
|
|||||||
app.state.config.PDF_EXTRACT_IMAGES = PDF_EXTRACT_IMAGES
|
app.state.config.PDF_EXTRACT_IMAGES = PDF_EXTRACT_IMAGES
|
||||||
|
|
||||||
app.state.config.YOUTUBE_LOADER_LANGUAGE = YOUTUBE_LOADER_LANGUAGE
|
app.state.config.YOUTUBE_LOADER_LANGUAGE = YOUTUBE_LOADER_LANGUAGE
|
||||||
|
app.state.config.YOUTUBE_LOADER_PROXY_URL = YOUTUBE_LOADER_PROXY_URL
|
||||||
app.state.YOUTUBE_LOADER_TRANSLATION = None
|
app.state.YOUTUBE_LOADER_TRANSLATION = None
|
||||||
|
|
||||||
|
|
||||||
@ -471,6 +473,7 @@ async def get_rag_config(user=Depends(get_admin_user)):
|
|||||||
"youtube": {
|
"youtube": {
|
||||||
"language": app.state.config.YOUTUBE_LOADER_LANGUAGE,
|
"language": app.state.config.YOUTUBE_LOADER_LANGUAGE,
|
||||||
"translation": app.state.YOUTUBE_LOADER_TRANSLATION,
|
"translation": app.state.YOUTUBE_LOADER_TRANSLATION,
|
||||||
|
"proxy_url": app.state.config.YOUTUBE_LOADER_PROXY_URL,
|
||||||
},
|
},
|
||||||
"web": {
|
"web": {
|
||||||
"web_loader_ssl_verification": app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION,
|
"web_loader_ssl_verification": app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION,
|
||||||
@ -518,6 +521,7 @@ class ChunkParamUpdateForm(BaseModel):
|
|||||||
class YoutubeLoaderConfig(BaseModel):
|
class YoutubeLoaderConfig(BaseModel):
|
||||||
language: list[str]
|
language: list[str]
|
||||||
translation: Optional[str] = None
|
translation: Optional[str] = None
|
||||||
|
proxy_url: str = ""
|
||||||
|
|
||||||
|
|
||||||
class WebSearchConfig(BaseModel):
|
class WebSearchConfig(BaseModel):
|
||||||
@ -580,6 +584,7 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
|
|||||||
|
|
||||||
if form_data.youtube is not None:
|
if form_data.youtube is not None:
|
||||||
app.state.config.YOUTUBE_LOADER_LANGUAGE = form_data.youtube.language
|
app.state.config.YOUTUBE_LOADER_LANGUAGE = form_data.youtube.language
|
||||||
|
app.state.config.YOUTUBE_LOADER_PROXY_URL = form_data.youtube.proxy_url
|
||||||
app.state.YOUTUBE_LOADER_TRANSLATION = form_data.youtube.translation
|
app.state.YOUTUBE_LOADER_TRANSLATION = form_data.youtube.translation
|
||||||
|
|
||||||
if form_data.web is not None:
|
if form_data.web is not None:
|
||||||
@ -640,6 +645,7 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
|
|||||||
},
|
},
|
||||||
"youtube": {
|
"youtube": {
|
||||||
"language": app.state.config.YOUTUBE_LOADER_LANGUAGE,
|
"language": app.state.config.YOUTUBE_LOADER_LANGUAGE,
|
||||||
|
"proxy_url": app.state.config.YOUTUBE_LOADER_PROXY_URL,
|
||||||
"translation": app.state.YOUTUBE_LOADER_TRANSLATION,
|
"translation": app.state.YOUTUBE_LOADER_TRANSLATION,
|
||||||
},
|
},
|
||||||
"web": {
|
"web": {
|
||||||
@ -1081,7 +1087,9 @@ def process_youtube_video(form_data: ProcessUrlForm, user=Depends(get_verified_u
|
|||||||
collection_name = calculate_sha256_string(form_data.url)[:63]
|
collection_name = calculate_sha256_string(form_data.url)[:63]
|
||||||
|
|
||||||
loader = YoutubeLoader(
|
loader = YoutubeLoader(
|
||||||
form_data.url, language=app.state.config.YOUTUBE_LOADER_LANGUAGE
|
form_data.url,
|
||||||
|
language=app.state.config.YOUTUBE_LOADER_LANGUAGE,
|
||||||
|
proxy_url=app.state.config.YOUTUBE_LOADER_PROXY_URL,
|
||||||
)
|
)
|
||||||
|
|
||||||
docs = loader.load()
|
docs = loader.load()
|
||||||
|
@ -1259,6 +1259,12 @@ YOUTUBE_LOADER_LANGUAGE = PersistentConfig(
|
|||||||
os.getenv("YOUTUBE_LOADER_LANGUAGE", "en").split(","),
|
os.getenv("YOUTUBE_LOADER_LANGUAGE", "en").split(","),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
YOUTUBE_LOADER_PROXY_URL = PersistentConfig(
|
||||||
|
"YOUTUBE_LOADER_PROXY_URL",
|
||||||
|
"rag.youtube_loader_proxy_url",
|
||||||
|
os.getenv("YOUTUBE_LOADER_PROXY_URL", ""),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
ENABLE_RAG_WEB_SEARCH = PersistentConfig(
|
ENABLE_RAG_WEB_SEARCH = PersistentConfig(
|
||||||
"ENABLE_RAG_WEB_SEARCH",
|
"ENABLE_RAG_WEB_SEARCH",
|
||||||
|
@ -40,6 +40,7 @@ type ContentExtractConfigForm = {
|
|||||||
type YoutubeConfigForm = {
|
type YoutubeConfigForm = {
|
||||||
language: string[];
|
language: string[];
|
||||||
translation?: string | null;
|
translation?: string | null;
|
||||||
|
proxy_url: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
type RAGConfigForm = {
|
type RAGConfigForm = {
|
||||||
|
@ -29,13 +29,15 @@
|
|||||||
|
|
||||||
let youtubeLanguage = 'en';
|
let youtubeLanguage = 'en';
|
||||||
let youtubeTranslation = null;
|
let youtubeTranslation = null;
|
||||||
|
let youtubeProxyUrl = '';
|
||||||
|
|
||||||
const submitHandler = async () => {
|
const submitHandler = async () => {
|
||||||
const res = await updateRAGConfig(localStorage.token, {
|
const res = await updateRAGConfig(localStorage.token, {
|
||||||
web: webConfig,
|
web: webConfig,
|
||||||
youtube: {
|
youtube: {
|
||||||
language: youtubeLanguage.split(',').map((lang) => lang.trim()),
|
language: youtubeLanguage.split(',').map((lang) => lang.trim()),
|
||||||
translation: youtubeTranslation
|
translation: youtubeTranslation,
|
||||||
|
proxy_url: youtubeProxyUrl
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
};
|
};
|
||||||
@ -48,6 +50,7 @@
|
|||||||
|
|
||||||
youtubeLanguage = res.youtube.language.join(',');
|
youtubeLanguage = res.youtube.language.join(',');
|
||||||
youtubeTranslation = res.youtube.translation;
|
youtubeTranslation = res.youtube.translation;
|
||||||
|
youtubeProxyUrl = res.youtube.proxy_url;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
</script>
|
</script>
|
||||||
@ -358,6 +361,21 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<div>
|
||||||
|
<div class=" py-0.5 flex w-full justify-between">
|
||||||
|
<div class=" w-20 text-xs font-medium self-center">{$i18n.t('Proxy URL')}</div>
|
||||||
|
<div class=" flex-1 self-center">
|
||||||
|
<input
|
||||||
|
class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
|
||||||
|
type="text"
|
||||||
|
placeholder={$i18n.t('Enter proxy URL (e.g. https://user:password@host:port)')}
|
||||||
|
bind:value={youtubeProxyUrl}
|
||||||
|
autocomplete="off"
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
{/if}
|
{/if}
|
||||||
</div>
|
</div>
|
||||||
|
Loading…
Reference in New Issue
Block a user