mirror of
https://github.com/open-webui/open-webui
synced 2025-06-26 18:26:48 +00:00
Finalize incomplete merge to update playwright branch
Introduced feature parity for trust_env
This commit is contained in:
parent
4da220c513
commit
aa2b764d74
@ -1,30 +1,33 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
from datetime import datetime, time, timedelta
|
import logging
|
||||||
import socket
|
import socket
|
||||||
import ssl
|
import ssl
|
||||||
import aiohttp
|
|
||||||
import asyncio
|
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
|
import urllib.request
|
||||||
|
from collections import defaultdict
|
||||||
|
from datetime import datetime, time, timedelta
|
||||||
|
from typing import (
|
||||||
|
Any,
|
||||||
|
AsyncIterator,
|
||||||
|
Dict,
|
||||||
|
Iterator,
|
||||||
|
List,
|
||||||
|
Optional,
|
||||||
|
Sequence,
|
||||||
|
Union
|
||||||
|
)
|
||||||
|
import aiohttp
|
||||||
import certifi
|
import certifi
|
||||||
import validators
|
import validators
|
||||||
from collections import defaultdict
|
|
||||||
from typing import AsyncIterator, Dict, List, Optional, Union, Sequence, Iterator
|
|
||||||
from typing import Any, AsyncIterator, Dict, Iterator, List, Sequence, Union
|
|
||||||
|
|
||||||
|
|
||||||
from langchain_community.document_loaders import (
|
from langchain_community.document_loaders import (
|
||||||
WebBaseLoader,
|
PlaywrightURLLoader,
|
||||||
PlaywrightURLLoader
|
WebBaseLoader
|
||||||
)
|
)
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
|
|
||||||
from open_webui.constants import ERROR_MESSAGES
|
from open_webui.constants import ERROR_MESSAGES
|
||||||
from open_webui.config import ENABLE_RAG_LOCAL_WEB_FETCH, PLAYWRIGHT_WS_URI, RAG_WEB_LOADER
|
from open_webui.config import ENABLE_RAG_LOCAL_WEB_FETCH, PLAYWRIGHT_WS_URI, RAG_WEB_LOADER
|
||||||
from open_webui.env import SRC_LOG_LEVELS
|
from open_webui.env import SRC_LOG_LEVELS
|
||||||
|
|
||||||
import logging
|
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
log.setLevel(SRC_LOG_LEVELS["RAG"])
|
log.setLevel(SRC_LOG_LEVELS["RAG"])
|
||||||
|
|
||||||
@ -91,18 +94,20 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader):
|
|||||||
"""Load HTML pages safely with Playwright, supporting SSL verification, rate limiting, and remote browser connection.
|
"""Load HTML pages safely with Playwright, supporting SSL verification, rate limiting, and remote browser connection.
|
||||||
|
|
||||||
Attributes:
|
Attributes:
|
||||||
urls (List[str]): List of URLs to load.
|
web_paths (List[str]): List of URLs to load.
|
||||||
verify_ssl (bool): If True, verify SSL certificates.
|
verify_ssl (bool): If True, verify SSL certificates.
|
||||||
requests_per_second (Optional[float]): Number of requests per second to limit to.
|
requests_per_second (Optional[float]): Number of requests per second to limit to.
|
||||||
continue_on_failure (bool): If True, continue loading other URLs on failure.
|
continue_on_failure (bool): If True, continue loading other URLs on failure.
|
||||||
headless (bool): If True, the browser will run in headless mode.
|
headless (bool): If True, the browser will run in headless mode.
|
||||||
playwright_ws_url (Optional[str]): WebSocket endpoint URI for remote browser connection.
|
playwright_ws_url (Optional[str]): WebSocket endpoint URI for remote browser connection.
|
||||||
|
trust_env (bool): If True, use proxy settings from environment variables.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
urls: List[str],
|
web_paths: List[str],
|
||||||
verify_ssl: bool = True,
|
verify_ssl: bool = True,
|
||||||
|
trust_env: bool = False,
|
||||||
requests_per_second: Optional[float] = None,
|
requests_per_second: Optional[float] = None,
|
||||||
continue_on_failure: bool = True,
|
continue_on_failure: bool = True,
|
||||||
headless: bool = True,
|
headless: bool = True,
|
||||||
@ -111,9 +116,20 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader):
|
|||||||
playwright_ws_url: Optional[str] = None
|
playwright_ws_url: Optional[str] = None
|
||||||
):
|
):
|
||||||
"""Initialize with additional safety parameters and remote browser support."""
|
"""Initialize with additional safety parameters and remote browser support."""
|
||||||
|
|
||||||
|
proxy_server = proxy.get('server') if proxy else None
|
||||||
|
if trust_env and not proxy_server:
|
||||||
|
env_proxies = urllib.request.getproxies()
|
||||||
|
env_proxy_server = env_proxies.get('https') or env_proxies.get('http')
|
||||||
|
if env_proxy_server:
|
||||||
|
if proxy:
|
||||||
|
proxy['server'] = env_proxy_server
|
||||||
|
else:
|
||||||
|
proxy = { 'server': env_proxy_server }
|
||||||
|
|
||||||
# We'll set headless to False if using playwright_ws_url since it's handled by the remote browser
|
# We'll set headless to False if using playwright_ws_url since it's handled by the remote browser
|
||||||
super().__init__(
|
super().__init__(
|
||||||
urls=urls,
|
urls=web_paths,
|
||||||
continue_on_failure=continue_on_failure,
|
continue_on_failure=continue_on_failure,
|
||||||
headless=headless if playwright_ws_url is None else False,
|
headless=headless if playwright_ws_url is None else False,
|
||||||
remove_selectors=remove_selectors,
|
remove_selectors=remove_selectors,
|
||||||
@ -123,6 +139,7 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader):
|
|||||||
self.requests_per_second = requests_per_second
|
self.requests_per_second = requests_per_second
|
||||||
self.last_request_time = None
|
self.last_request_time = None
|
||||||
self.playwright_ws_url = playwright_ws_url
|
self.playwright_ws_url = playwright_ws_url
|
||||||
|
self.trust_env = trust_env
|
||||||
|
|
||||||
def lazy_load(self) -> Iterator[Document]:
|
def lazy_load(self) -> Iterator[Document]:
|
||||||
"""Safely load URLs synchronously with support for remote browser."""
|
"""Safely load URLs synchronously with support for remote browser."""
|
||||||
@ -347,14 +364,12 @@ def get_web_loader(
|
|||||||
# Check if the URLs are valid
|
# Check if the URLs are valid
|
||||||
safe_urls = safe_validate_urls([urls] if isinstance(urls, str) else urls)
|
safe_urls = safe_validate_urls([urls] if isinstance(urls, str) else urls)
|
||||||
|
|
||||||
|
|
||||||
web_loader_args = {
|
web_loader_args = {
|
||||||
web_path=safe_urls,
|
"web_paths": safe_urls,
|
||||||
"urls": safe_urls,
|
|
||||||
"verify_ssl": verify_ssl,
|
"verify_ssl": verify_ssl,
|
||||||
"requests_per_second": requests_per_second,
|
"requests_per_second": requests_per_second,
|
||||||
"continue_on_failure": True,
|
"continue_on_failure": True,
|
||||||
trust_env=trust_env
|
"trust_env": trust_env
|
||||||
}
|
}
|
||||||
|
|
||||||
if PLAYWRIGHT_WS_URI.value:
|
if PLAYWRIGHT_WS_URI.value:
|
||||||
@ -364,6 +379,6 @@ def get_web_loader(
|
|||||||
WebLoaderClass = RAG_WEB_LOADERS[RAG_WEB_LOADER.value]
|
WebLoaderClass = RAG_WEB_LOADERS[RAG_WEB_LOADER.value]
|
||||||
web_loader = WebLoaderClass(**web_loader_args)
|
web_loader = WebLoaderClass(**web_loader_args)
|
||||||
|
|
||||||
log.debug("Using RAG_WEB_LOADER %s for %s URLs", web_loader.__class__.__name__, len(urls))
|
log.debug("Using RAG_WEB_LOADER %s for %s URLs", web_loader.__class__.__name__, len(safe_urls))
|
||||||
|
|
||||||
return web_loader
|
return web_loader
|
Loading…
Reference in New Issue
Block a user