From aa2b764d743f348f6d380aa43fd2184e49ab6745 Mon Sep 17 00:00:00 2001 From: Rory <16675082+roryeckel@users.noreply.github.com> Date: Fri, 14 Feb 2025 22:32:45 -0600 Subject: [PATCH] Finalize incomplete merge to update playwright branch Introduced feature parity for trust_env --- backend/open_webui/retrieval/web/utils.py | 59 ++++++++++++++--------- 1 file changed, 37 insertions(+), 22 deletions(-) diff --git a/backend/open_webui/retrieval/web/utils.py b/backend/open_webui/retrieval/web/utils.py index 25ca5aef5..b9fbbee3a 100644 --- a/backend/open_webui/retrieval/web/utils.py +++ b/backend/open_webui/retrieval/web/utils.py @@ -1,30 +1,33 @@ import asyncio -from datetime import datetime, time, timedelta +import logging import socket import ssl -import aiohttp -import asyncio import urllib.parse +import urllib.request +from collections import defaultdict +from datetime import datetime, time, timedelta +from typing import ( + Any, + AsyncIterator, + Dict, + Iterator, + List, + Optional, + Sequence, + Union +) +import aiohttp import certifi import validators -from collections import defaultdict -from typing import AsyncIterator, Dict, List, Optional, Union, Sequence, Iterator -from typing import Any, AsyncIterator, Dict, Iterator, List, Sequence, Union - - from langchain_community.document_loaders import ( - WebBaseLoader, - PlaywrightURLLoader + PlaywrightURLLoader, + WebBaseLoader ) from langchain_core.documents import Document - - from open_webui.constants import ERROR_MESSAGES from open_webui.config import ENABLE_RAG_LOCAL_WEB_FETCH, PLAYWRIGHT_WS_URI, RAG_WEB_LOADER from open_webui.env import SRC_LOG_LEVELS -import logging - log = logging.getLogger(__name__) log.setLevel(SRC_LOG_LEVELS["RAG"]) @@ -91,18 +94,20 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader): """Load HTML pages safely with Playwright, supporting SSL verification, rate limiting, and remote browser connection. Attributes: - urls (List[str]): List of URLs to load. + web_paths (List[str]): List of URLs to load. verify_ssl (bool): If True, verify SSL certificates. requests_per_second (Optional[float]): Number of requests per second to limit to. continue_on_failure (bool): If True, continue loading other URLs on failure. headless (bool): If True, the browser will run in headless mode. playwright_ws_url (Optional[str]): WebSocket endpoint URI for remote browser connection. + trust_env (bool): If True, use proxy settings from environment variables. """ def __init__( self, - urls: List[str], + web_paths: List[str], verify_ssl: bool = True, + trust_env: bool = False, requests_per_second: Optional[float] = None, continue_on_failure: bool = True, headless: bool = True, @@ -111,9 +116,20 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader): playwright_ws_url: Optional[str] = None ): """Initialize with additional safety parameters and remote browser support.""" + + proxy_server = proxy.get('server') if proxy else None + if trust_env and not proxy_server: + env_proxies = urllib.request.getproxies() + env_proxy_server = env_proxies.get('https') or env_proxies.get('http') + if env_proxy_server: + if proxy: + proxy['server'] = env_proxy_server + else: + proxy = { 'server': env_proxy_server } + # We'll set headless to False if using playwright_ws_url since it's handled by the remote browser super().__init__( - urls=urls, + urls=web_paths, continue_on_failure=continue_on_failure, headless=headless if playwright_ws_url is None else False, remove_selectors=remove_selectors, @@ -123,6 +139,7 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader): self.requests_per_second = requests_per_second self.last_request_time = None self.playwright_ws_url = playwright_ws_url + self.trust_env = trust_env def lazy_load(self) -> Iterator[Document]: """Safely load URLs synchronously with support for remote browser.""" @@ -347,14 +364,12 @@ def get_web_loader( # Check if the URLs are valid safe_urls = safe_validate_urls([urls] if isinstance(urls, str) else urls) - web_loader_args = { - web_path=safe_urls, - "urls": safe_urls, + "web_paths": safe_urls, "verify_ssl": verify_ssl, "requests_per_second": requests_per_second, "continue_on_failure": True, - trust_env=trust_env + "trust_env": trust_env } if PLAYWRIGHT_WS_URI.value: @@ -364,6 +379,6 @@ def get_web_loader( WebLoaderClass = RAG_WEB_LOADERS[RAG_WEB_LOADER.value] web_loader = WebLoaderClass(**web_loader_args) - log.debug("Using RAG_WEB_LOADER %s for %s URLs", web_loader.__class__.__name__, len(urls)) + log.debug("Using RAG_WEB_LOADER %s for %s URLs", web_loader.__class__.__name__, len(safe_urls)) return web_loader \ No newline at end of file