diff --git a/backend/open_webui/retrieval/web/utils.py b/backend/open_webui/retrieval/web/utils.py index bdc626749..3c0c34074 100644 --- a/backend/open_webui/retrieval/web/utils.py +++ b/backend/open_webui/retrieval/web/utils.py @@ -5,6 +5,7 @@ import ssl import urllib.parse import certifi import validators +from collections import defaultdict from typing import AsyncIterator, Dict, List, Optional, Union, Sequence, Iterator from langchain_community.document_loaders import ( @@ -211,28 +212,27 @@ class SafeWebBaseLoader(WebBaseLoader): # Log the error and continue with the next URL log.error(f"Error loading {path}: {e}") +RAG_WEB_LOADERS = defaultdict(lambda: SafeWebBaseLoader) +RAG_WEB_LOADERS["playwright"] = SafePlaywrightURLLoader +RAG_WEB_LOADERS["safe_web"] = SafeWebBaseLoader def get_web_loader( urls: Union[str, Sequence[str]], verify_ssl: bool = True, requests_per_second: int = 2, ): - # Check if the URL is valid + # Check if the URLs are valid safe_urls = safe_validate_urls([urls] if isinstance(urls, str) else urls) - if RAG_WEB_LOADER.value == "chromium": - log.info("Using SafePlaywrightURLLoader") - return SafePlaywrightURLLoader( - safe_urls, - verify_ssl=verify_ssl, - requests_per_second=requests_per_second, - continue_on_failure=True, - ) - else: - log.info("Using SafeWebBaseLoader") - return SafeWebBaseLoader( - safe_urls, - verify_ssl=verify_ssl, - requests_per_second=requests_per_second, - continue_on_failure=True, - ) + # Get the appropriate WebLoader based on the configuration + WebLoaderClass = RAG_WEB_LOADERS[RAG_WEB_LOADER.value] + web_loader = WebLoaderClass( + safe_urls, + verify_ssl=verify_ssl, + requests_per_second=requests_per_second, + continue_on_failure=True, + ) + + log.debug("Using RAG_WEB_LOADER %s for %s URLs", web_loader.__class__.__name__, len(safe_urls)) + + return web_loader \ No newline at end of file diff --git a/backend/start.sh b/backend/start.sh index ce56b1867..2501f413f 100755 --- a/backend/start.sh +++ b/backend/start.sh @@ -4,7 +4,7 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) cd "$SCRIPT_DIR" || exit # Add conditional Playwright browser installation -if [[ "${RAG_WEB_LOADER,,}" == "chromium" ]]; then +if [[ "${RAG_WEB_LOADER,,}" == "playwright" ]]; then echo "Installing Playwright browsers..." playwright install chromium playwright install-deps chromium diff --git a/backend/start_windows.bat b/backend/start_windows.bat index 3b6446258..0f2792cc0 100644 --- a/backend/start_windows.bat +++ b/backend/start_windows.bat @@ -7,7 +7,7 @@ SET "SCRIPT_DIR=%~dp0" cd /d "%SCRIPT_DIR%" || exit /b :: Add conditional Playwright browser installation -IF /I "%RAG_WEB_LOADER%" == "chromium" ( +IF /I "%RAG_WEB_LOADER%" == "playwright" ( echo Installing Playwright browsers... playwright install chromium playwright install-deps chromium