Merge pull request #11899 from genjuro214/set-playwright-timeout

perf: set shorter timeout for playwright and make it configurable
This commit is contained in:
Timothy Jaeryang Baek 2025-03-20 14:00:59 -07:00 committed by GitHub
commit f066eea92e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 22 additions and 4 deletions

View File

@ -2081,6 +2081,12 @@ PLAYWRIGHT_WS_URI = PersistentConfig(
os.environ.get("PLAYWRIGHT_WS_URI", None), os.environ.get("PLAYWRIGHT_WS_URI", None),
) )
# Timeout for Playwright page navigation, read from the
# PLAYWRIGHT_GOTO_TIMEOUT environment variable (default "10") and parsed as an
# integer. The value is expressed in seconds: the web loader setup multiplies
# it by 1000 before passing it on as `playwright_goto_timeout`, which the
# loader documents as milliseconds.
PLAYWRIGHT_GOTO_TIMEOUT = PersistentConfig(
"PLAYWRIGHT_GOTO_TIMEOUT",
"rag.web.loader.engine.playwright.goto.timeout",
int(os.environ.get("PLAYWRIGHT_GOTO_TIMEOUT", "10")),
)
FIRECRAWL_API_KEY = PersistentConfig( FIRECRAWL_API_KEY = PersistentConfig(
"FIRECRAWL_API_KEY", "FIRECRAWL_API_KEY",
"firecrawl.api_key", "firecrawl.api_key",

View File

@ -155,6 +155,7 @@ from open_webui.config import (
AUDIO_TTS_AZURE_SPEECH_REGION, AUDIO_TTS_AZURE_SPEECH_REGION,
AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT, AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT,
PLAYWRIGHT_WS_URI, PLAYWRIGHT_WS_URI,
PLAYWRIGHT_GOTO_TIMEOUT,
FIRECRAWL_API_BASE_URL, FIRECRAWL_API_BASE_URL,
FIRECRAWL_API_KEY, FIRECRAWL_API_KEY,
RAG_WEB_LOADER_ENGINE, RAG_WEB_LOADER_ENGINE,
@ -629,6 +630,7 @@ app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS = RAG_WEB_SEARCH_CONCURRENT_
app.state.config.RAG_WEB_LOADER_ENGINE = RAG_WEB_LOADER_ENGINE app.state.config.RAG_WEB_LOADER_ENGINE = RAG_WEB_LOADER_ENGINE
app.state.config.RAG_WEB_SEARCH_TRUST_ENV = RAG_WEB_SEARCH_TRUST_ENV app.state.config.RAG_WEB_SEARCH_TRUST_ENV = RAG_WEB_SEARCH_TRUST_ENV
app.state.config.PLAYWRIGHT_WS_URI = PLAYWRIGHT_WS_URI app.state.config.PLAYWRIGHT_WS_URI = PLAYWRIGHT_WS_URI
app.state.config.PLAYWRIGHT_GOTO_TIMEOUT = PLAYWRIGHT_GOTO_TIMEOUT
app.state.config.FIRECRAWL_API_BASE_URL = FIRECRAWL_API_BASE_URL app.state.config.FIRECRAWL_API_BASE_URL = FIRECRAWL_API_BASE_URL
app.state.config.FIRECRAWL_API_KEY = FIRECRAWL_API_KEY app.state.config.FIRECRAWL_API_KEY = FIRECRAWL_API_KEY
app.state.config.TAVILY_EXTRACT_DEPTH = TAVILY_EXTRACT_DEPTH app.state.config.TAVILY_EXTRACT_DEPTH = TAVILY_EXTRACT_DEPTH

View File

@ -29,6 +29,7 @@ from open_webui.constants import ERROR_MESSAGES
from open_webui.config import ( from open_webui.config import (
ENABLE_RAG_LOCAL_WEB_FETCH, ENABLE_RAG_LOCAL_WEB_FETCH,
PLAYWRIGHT_WS_URI, PLAYWRIGHT_WS_URI,
PLAYWRIGHT_GOTO_TIMEOUT,
RAG_WEB_LOADER_ENGINE, RAG_WEB_LOADER_ENGINE,
FIRECRAWL_API_BASE_URL, FIRECRAWL_API_BASE_URL,
FIRECRAWL_API_KEY, FIRECRAWL_API_KEY,
@ -376,6 +377,7 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader, RateLimitMixin, URLProcessing
headless (bool): If True, the browser will run in headless mode. headless (bool): If True, the browser will run in headless mode.
proxy (dict): Proxy override settings for the Playwright session. proxy (dict): Proxy override settings for the Playwright session.
playwright_ws_url (Optional[str]): WebSocket endpoint URI for remote browser connection. playwright_ws_url (Optional[str]): WebSocket endpoint URI for remote browser connection.
playwright_goto_timeout (Optional[int]): Maximum operation time in milliseconds.
""" """
def __init__( def __init__(
@ -389,6 +391,7 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader, RateLimitMixin, URLProcessing
remove_selectors: Optional[List[str]] = None, remove_selectors: Optional[List[str]] = None,
proxy: Optional[Dict[str, str]] = None, proxy: Optional[Dict[str, str]] = None,
playwright_ws_url: Optional[str] = None, playwright_ws_url: Optional[str] = None,
playwright_goto_timeout: Optional[int] = 10000,
): ):
"""Initialize with additional safety parameters and remote browser support.""" """Initialize with additional safety parameters and remote browser support."""
@ -415,6 +418,7 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader, RateLimitMixin, URLProcessing
self.last_request_time = None self.last_request_time = None
self.playwright_ws_url = playwright_ws_url self.playwright_ws_url = playwright_ws_url
self.trust_env = trust_env self.trust_env = trust_env
self.playwright_goto_timeout = playwright_goto_timeout
def lazy_load(self) -> Iterator[Document]: def lazy_load(self) -> Iterator[Document]:
"""Safely load URLs synchronously with support for remote browser.""" """Safely load URLs synchronously with support for remote browser."""
@ -431,7 +435,7 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader, RateLimitMixin, URLProcessing
try: try:
self._safe_process_url_sync(url) self._safe_process_url_sync(url)
page = browser.new_page() page = browser.new_page()
response = page.goto(url) response = page.goto(url, timeout=self.playwright_goto_timeout)
if response is None: if response is None:
raise ValueError(f"page.goto() returned None for url {url}") raise ValueError(f"page.goto() returned None for url {url}")
@ -462,7 +466,9 @@ class SafePlaywrightURLLoader(PlaywrightURLLoader, RateLimitMixin, URLProcessing
try: try:
await self._safe_process_url(url) await self._safe_process_url(url)
page = await browser.new_page() page = await browser.new_page()
response = await page.goto(url) response = await page.goto(
url, timeout=self.playwright_goto_timeout
)
if response is None: if response is None:
raise ValueError(f"page.goto() returned None for url {url}") raise ValueError(f"page.goto() returned None for url {url}")
@ -604,6 +610,10 @@ def get_web_loader(
"trust_env": trust_env, "trust_env": trust_env,
} }
if RAG_WEB_LOADER_ENGINE.value == "playwright":
web_loader_args["playwright_goto_timeout"] = (
PLAYWRIGHT_GOTO_TIMEOUT.value * 1000
)
if PLAYWRIGHT_WS_URI.value: if PLAYWRIGHT_WS_URI.value:
web_loader_args["playwright_ws_url"] = PLAYWRIGHT_WS_URI.value web_loader_args["playwright_ws_url"] = PLAYWRIGHT_WS_URI.value