From 8da33721d563754becd0d03bf86605441e0bd9e3 Mon Sep 17 00:00:00 2001 From: Rory <16675082+roryeckel@users.noreply.github.com> Date: Sun, 2 Feb 2025 17:58:09 -0600 Subject: [PATCH] Support PLAYWRIGHT_WS_URI --- backend/open_webui/config.py | 6 ++ backend/open_webui/main.py | 2 + backend/open_webui/retrieval/web/utils.py | 121 ++++++++++++++-------- backend/start.sh | 8 +- backend/start_windows.bat | 8 +- 5 files changed, 97 insertions(+), 48 deletions(-) diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index 278f50663..80e1e7ab2 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -1724,6 +1724,12 @@ RAG_WEB_LOADER = PersistentConfig( os.environ.get("RAG_WEB_LOADER", "safe_web") ) +PLAYWRIGHT_WS_URI = PersistentConfig( + "PLAYWRIGHT_WS_URI", + "rag.web.loader.playwright.ws.uri", + os.environ.get("PLAYWRIGHT_WS_URI", None) +) + #################################### # Images #################################### diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index c5dfad047..fd8a4c957 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -129,6 +129,7 @@ from open_webui.config import ( AUDIO_TTS_VOICE, AUDIO_TTS_AZURE_SPEECH_REGION, AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT, + PLAYWRIGHT_WS_URI, RAG_WEB_LOADER, WHISPER_MODEL, WHISPER_MODEL_AUTO_UPDATE, @@ -528,6 +529,7 @@ app.state.config.BING_SEARCH_V7_SUBSCRIPTION_KEY = BING_SEARCH_V7_SUBSCRIPTION_K app.state.config.RAG_WEB_SEARCH_RESULT_COUNT = RAG_WEB_SEARCH_RESULT_COUNT app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS = RAG_WEB_SEARCH_CONCURRENT_REQUESTS app.state.config.RAG_WEB_LOADER = RAG_WEB_LOADER +app.state.config.PLAYWRIGHT_WS_URI = PLAYWRIGHT_WS_URI app.state.EMBEDDING_FUNCTION = None app.state.ef = None diff --git a/backend/open_webui/retrieval/web/utils.py b/backend/open_webui/retrieval/web/utils.py index 0568c795c..3c77402c3 100644 --- a/backend/open_webui/retrieval/web/utils.py +++ b/backend/open_webui/retrieval/web/utils.py @@ -16,7 +16,7 @@ from langchain_core.documents import Document from open_webui.constants import ERROR_MESSAGES -from open_webui.config import ENABLE_RAG_LOCAL_WEB_FETCH, RAG_WEB_LOADER +from open_webui.config import ENABLE_RAG_LOCAL_WEB_FETCH, PLAYWRIGHT_WS_URI, RAG_WEB_LOADER from open_webui.env import SRC_LOG_LEVELS import logging @@ -83,7 +83,7 @@ def extract_metadata(soup, url): return metadata class SafePlaywrightURLLoader(PlaywrightURLLoader): - """Load HTML pages safely with Playwright, supporting SSL verification and rate limiting. + """Load HTML pages safely with Playwright, supporting SSL verification, rate limiting, and remote browser connection. Attributes: urls (List[str]): List of URLs to load. @@ -91,6 +91,7 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader): requests_per_second (Optional[float]): Number of requests per second to limit to. continue_on_failure (bool): If True, continue loading other URLs on failure. headless (bool): If True, the browser will run in headless mode. + playwright_ws_url (Optional[str]): WebSocket endpoint URI for remote browser connection. """ def __init__( @@ -101,19 +102,80 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader): continue_on_failure: bool = True, headless: bool = True, remove_selectors: Optional[List[str]] = None, - proxy: Optional[Dict[str, str]] = None + proxy: Optional[Dict[str, str]] = None, + playwright_ws_url: Optional[str] = None ): - """Initialize with additional safety parameters.""" + """Initialize with additional safety parameters and remote browser support.""" + # We'll set headless to False if using playwright_ws_url since it's handled by the remote browser super().__init__( urls=urls, continue_on_failure=continue_on_failure, - headless=headless, + headless=headless if playwright_ws_url is None else False, remove_selectors=remove_selectors, proxy=proxy ) self.verify_ssl = verify_ssl self.requests_per_second = requests_per_second self.last_request_time = None + self.playwright_ws_url = playwright_ws_url + + def lazy_load(self) -> Iterator[Document]: + """Safely load URLs synchronously with support for remote browser.""" + from playwright.sync_api import sync_playwright + + with sync_playwright() as p: + # Use remote browser if ws_endpoint is provided, otherwise use local browser + if self.playwright_ws_url: + browser = p.chromium.connect(self.playwright_ws_url) + else: + browser = p.chromium.launch(headless=self.headless, proxy=self.proxy) + + for url in self.urls: + try: + self._safe_process_url_sync(url) + page = browser.new_page() + response = page.goto(url) + if response is None: + raise ValueError(f"page.goto() returned None for url {url}") + + text = self.evaluator.evaluate(page, browser, response) + metadata = {"source": url} + yield Document(page_content=text, metadata=metadata) + except Exception as e: + if self.continue_on_failure: + log.exception(e, "Error loading %s", url) + continue + raise e + browser.close() + + async def alazy_load(self) -> AsyncIterator[Document]: + """Safely load URLs asynchronously with support for remote browser.""" + from playwright.async_api import async_playwright + + async with async_playwright() as p: + # Use remote browser if ws_endpoint is provided, otherwise use local browser + if self.playwright_ws_url: + browser = await p.chromium.connect(self.playwright_ws_url) + else: + browser = await p.chromium.launch(headless=self.headless, proxy=self.proxy) + + for url in self.urls: + try: + await self._safe_process_url(url) + page = await browser.new_page() + response = await page.goto(url) + if response is None: + raise ValueError(f"page.goto() returned None for url {url}") + + text = await self.evaluator.evaluate_async(page, browser, response) + metadata = {"source": url} + yield Document(page_content=text, metadata=metadata) + except Exception as e: + if self.continue_on_failure: + log.exception(e, "Error loading %s", url) + continue + raise e + await browser.close() def _verify_ssl_cert(self, url: str) -> bool: """Verify SSL certificate for the given URL.""" @@ -164,36 +226,6 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader): self._sync_wait_for_rate_limit() return True - async def alazy_load(self) -> AsyncIterator[Document]: - """Safely load URLs asynchronously.""" - parent_iterator = super().alazy_load() - - async for document in parent_iterator: - url = document.metadata["source"] - try: - await self._safe_process_url(url) - yield document - except Exception as e: - if self.continue_on_failure: - log.exception(e, "Error loading %s", url) - continue - raise e - - def lazy_load(self) -> Iterator[Document]: - """Safely load URLs synchronously.""" - parent_iterator = super().lazy_load() - - for document in parent_iterator: - url = document.metadata["source"] - try: - self._safe_process_url_sync(url) - yield document - except Exception as e: - if self.continue_on_failure: - log.exception(e, "Error loading %s", url) - continue - raise e - class SafeWebBaseLoader(WebBaseLoader): """WebBaseLoader with enhanced error handling for URLs.""" @@ -224,14 +256,19 @@ def get_web_loader( # Check if the URLs are valid safe_urls = safe_validate_urls([urls] if isinstance(urls, str) else urls) - # Get the appropriate WebLoader based on the configuration + web_loader_args = { + "urls": safe_urls, + "verify_ssl": verify_ssl, + "requests_per_second": requests_per_second, + "continue_on_failure": True + } + + if PLAYWRIGHT_WS_URI.value: + web_loader_args["playwright_ws_url"] = PLAYWRIGHT_WS_URI.value + + # Create the appropriate WebLoader based on the configuration WebLoaderClass = RAG_WEB_LOADERS[RAG_WEB_LOADER.value] - web_loader = WebLoaderClass( - safe_urls, - verify_ssl=verify_ssl, - requests_per_second=requests_per_second, - continue_on_failure=True, - ) + web_loader = WebLoaderClass(**web_loader_args) log.debug("Using RAG_WEB_LOADER %s for %s URLs", web_loader.__class__.__name__, len(safe_urls)) diff --git a/backend/start.sh b/backend/start.sh index 2501f413f..3b08cf549 100755 --- a/backend/start.sh +++ b/backend/start.sh @@ -5,9 +5,11 @@ cd "$SCRIPT_DIR" || exit # Add conditional Playwright browser installation if [[ "${RAG_WEB_LOADER,,}" == "playwright" ]]; then - echo "Installing Playwright browsers..." - playwright install chromium - playwright install-deps chromium + if [[ -z "${PLAYWRIGHT_WS_URI}" ]]; then + echo "Installing Playwright browsers..." + playwright install chromium + playwright install-deps chromium + fi python -c "import nltk; nltk.download('punkt_tab')" fi diff --git a/backend/start_windows.bat b/backend/start_windows.bat index 0f2792cc0..036e1f721 100644 --- a/backend/start_windows.bat +++ b/backend/start_windows.bat @@ -8,9 +8,11 @@ cd /d "%SCRIPT_DIR%" || exit /b :: Add conditional Playwright browser installation IF /I "%RAG_WEB_LOADER%" == "playwright" ( - echo Installing Playwright browsers... - playwright install chromium - playwright install-deps chromium + IF "%PLAYWRIGHT_WS_URI%" == "" ( + echo Installing Playwright browsers... + playwright install chromium + playwright install-deps chromium + ) python -c "import nltk; nltk.download('punkt_tab')" )