diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index d153c7dda..c25e0e046 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -2081,6 +2081,12 @@ PLAYWRIGHT_WS_URI = PersistentConfig( os.environ.get("PLAYWRIGHT_WS_URI", None), ) +PLAYWRIGHT_GOTO_TIMEOUT = PersistentConfig( + "PLAYWRIGHT_GOTO_TIMEOUT", + "rag.web.loader.engine.playwright.goto.timeout", + int(os.environ.get("PLAYWRIGHT_GOTO_TIMEOUT", "10")), +) + FIRECRAWL_API_KEY = PersistentConfig( "FIRECRAWL_API_KEY", "firecrawl.api_key", diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index 1ea79aa26..228c92e64 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -155,6 +155,7 @@ from open_webui.config import ( AUDIO_TTS_AZURE_SPEECH_REGION, AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT, PLAYWRIGHT_WS_URI, + PLAYWRIGHT_GOTO_TIMEOUT, FIRECRAWL_API_BASE_URL, FIRECRAWL_API_KEY, RAG_WEB_LOADER_ENGINE, @@ -629,6 +630,7 @@ app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS = RAG_WEB_SEARCH_CONCURRENT_ app.state.config.RAG_WEB_LOADER_ENGINE = RAG_WEB_LOADER_ENGINE app.state.config.RAG_WEB_SEARCH_TRUST_ENV = RAG_WEB_SEARCH_TRUST_ENV app.state.config.PLAYWRIGHT_WS_URI = PLAYWRIGHT_WS_URI +app.state.config.PLAYWRIGHT_GOTO_TIMEOUT = PLAYWRIGHT_GOTO_TIMEOUT app.state.config.FIRECRAWL_API_BASE_URL = FIRECRAWL_API_BASE_URL app.state.config.FIRECRAWL_API_KEY = FIRECRAWL_API_KEY app.state.config.TAVILY_EXTRACT_DEPTH = TAVILY_EXTRACT_DEPTH diff --git a/backend/open_webui/retrieval/web/utils.py b/backend/open_webui/retrieval/web/utils.py index 2b1346d7b..0eee00879 100644 --- a/backend/open_webui/retrieval/web/utils.py +++ b/backend/open_webui/retrieval/web/utils.py @@ -29,6 +29,7 @@ from open_webui.constants import ERROR_MESSAGES from open_webui.config import ( ENABLE_RAG_LOCAL_WEB_FETCH, PLAYWRIGHT_WS_URI, + PLAYWRIGHT_GOTO_TIMEOUT, RAG_WEB_LOADER_ENGINE, FIRECRAWL_API_BASE_URL, FIRECRAWL_API_KEY, @@ -376,6 +377,7 @@ class 
SafePlaywrightURLLoader(PlaywrightURLLoader, RateLimitMixin, URLProcessing headless (bool): If True, the browser will run in headless mode. proxy (dict): Proxy override settings for the Playwright session. playwright_ws_url (Optional[str]): WebSocket endpoint URI for remote browser connection. + playwright_goto_timeout (Optional[int]): Maximum time in milliseconds to wait for each page.goto() navigation. """ def __init__( @@ -389,6 +391,7 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader, RateLimitMixin, URLProcessing remove_selectors: Optional[List[str]] = None, proxy: Optional[Dict[str, str]] = None, playwright_ws_url: Optional[str] = None, + playwright_goto_timeout: Optional[int] = 10000, ): """Initialize with additional safety parameters and remote browser support.""" @@ -415,6 +418,7 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader, RateLimitMixin, URLProcessing self.last_request_time = None self.playwright_ws_url = playwright_ws_url self.trust_env = trust_env + self.playwright_goto_timeout = playwright_goto_timeout def lazy_load(self) -> Iterator[Document]: """Safely load URLs synchronously with support for remote browser.""" @@ -431,7 +435,7 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader, RateLimitMixin, URLProcessing try: self._safe_process_url_sync(url) page = browser.new_page() - response = page.goto(url) + response = page.goto(url, timeout=self.playwright_goto_timeout) if response is None: raise ValueError(f"page.goto() returned None for url {url}") @@ -462,7 +466,9 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader, RateLimitMixin, URLProcessing try: await self._safe_process_url(url) page = await browser.new_page() - response = await page.goto(url) + response = await page.goto( + url, timeout=self.playwright_goto_timeout + ) if response is None: raise ValueError(f"page.goto() returned None for url {url}") @@ -604,8 +610,12 @@ def get_web_loader( "trust_env": trust_env, } - if PLAYWRIGHT_WS_URI.value: - web_loader_args["playwright_ws_url"] = PLAYWRIGHT_WS_URI.value + 
if RAG_WEB_LOADER_ENGINE.value == "playwright": + web_loader_args["playwright_goto_timeout"] = ( + PLAYWRIGHT_GOTO_TIMEOUT.value * 1000 + ) + if PLAYWRIGHT_WS_URI.value: + web_loader_args["playwright_ws_url"] = PLAYWRIGHT_WS_URI.value if RAG_WEB_LOADER_ENGINE.value == "firecrawl": web_loader_args["api_key"] = FIRECRAWL_API_KEY.value