mirror of
https://github.com/open-webui/open-webui
synced 2025-05-01 03:22:28 +00:00
Refine RAG_WEB_LOADER
This commit is contained in:
parent
8dafe3cba8
commit
2452e271cd
@ -5,6 +5,7 @@ import ssl
|
|||||||
import urllib.parse
|
import urllib.parse
|
||||||
import certifi
|
import certifi
|
||||||
import validators
|
import validators
|
||||||
|
from collections import defaultdict
|
||||||
from typing import AsyncIterator, Dict, List, Optional, Union, Sequence, Iterator
|
from typing import AsyncIterator, Dict, List, Optional, Union, Sequence, Iterator
|
||||||
|
|
||||||
from langchain_community.document_loaders import (
|
from langchain_community.document_loaders import (
|
||||||
@ -211,28 +212,27 @@ class SafeWebBaseLoader(WebBaseLoader):
|
|||||||
# Log the error and continue with the next URL
|
# Log the error and continue with the next URL
|
||||||
log.error(f"Error loading {path}: {e}")
|
log.error(f"Error loading {path}: {e}")
|
||||||
|
|
||||||
|
RAG_WEB_LOADERS = defaultdict(lambda: SafeWebBaseLoader)
|
||||||
|
RAG_WEB_LOADERS["playwright"] = SafePlaywrightURLLoader
|
||||||
|
RAG_WEB_LOADERS["safe_web"] = SafeWebBaseLoader
|
||||||
|
|
||||||
def get_web_loader(
|
def get_web_loader(
|
||||||
urls: Union[str, Sequence[str]],
|
urls: Union[str, Sequence[str]],
|
||||||
verify_ssl: bool = True,
|
verify_ssl: bool = True,
|
||||||
requests_per_second: int = 2,
|
requests_per_second: int = 2,
|
||||||
):
|
):
|
||||||
# Check if the URL is valid
|
# Check if the URLs are valid
|
||||||
safe_urls = safe_validate_urls([urls] if isinstance(urls, str) else urls)
|
safe_urls = safe_validate_urls([urls] if isinstance(urls, str) else urls)
|
||||||
|
|
||||||
if RAG_WEB_LOADER.value == "chromium":
|
# Get the appropriate WebLoader based on the configuration
|
||||||
log.info("Using SafePlaywrightURLLoader")
|
WebLoaderClass = RAG_WEB_LOADERS[RAG_WEB_LOADER.value]
|
||||||
return SafePlaywrightURLLoader(
|
web_loader = WebLoaderClass(
|
||||||
safe_urls,
|
safe_urls,
|
||||||
verify_ssl=verify_ssl,
|
verify_ssl=verify_ssl,
|
||||||
requests_per_second=requests_per_second,
|
requests_per_second=requests_per_second,
|
||||||
continue_on_failure=True,
|
continue_on_failure=True,
|
||||||
)
|
)
|
||||||
else:
|
|
||||||
log.info("Using SafeWebBaseLoader")
|
log.debug("Using RAG_WEB_LOADER %s for %s URLs", web_loader.__class__.__name__, len(safe_urls))
|
||||||
return SafeWebBaseLoader(
|
|
||||||
safe_urls,
|
return web_loader
|
||||||
verify_ssl=verify_ssl,
|
|
||||||
requests_per_second=requests_per_second,
|
|
||||||
continue_on_failure=True,
|
|
||||||
)
|
|
@ -4,7 +4,7 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
|||||||
cd "$SCRIPT_DIR" || exit
|
cd "$SCRIPT_DIR" || exit
|
||||||
|
|
||||||
# Add conditional Playwright browser installation
|
# Add conditional Playwright browser installation
|
||||||
if [[ "${RAG_WEB_LOADER,,}" == "chromium" ]]; then
|
if [[ "${RAG_WEB_LOADER,,}" == "playwright" ]]; then
|
||||||
echo "Installing Playwright browsers..."
|
echo "Installing Playwright browsers..."
|
||||||
playwright install chromium
|
playwright install chromium
|
||||||
playwright install-deps chromium
|
playwright install-deps chromium
|
||||||
|
@ -7,7 +7,7 @@ SET "SCRIPT_DIR=%~dp0"
|
|||||||
cd /d "%SCRIPT_DIR%" || exit /b
|
cd /d "%SCRIPT_DIR%" || exit /b
|
||||||
|
|
||||||
:: Add conditional Playwright browser installation
|
:: Add conditional Playwright browser installation
|
||||||
IF /I "%RAG_WEB_LOADER%" == "chromium" (
|
IF /I "%RAG_WEB_LOADER%" == "playwright" (
|
||||||
echo Installing Playwright browsers...
|
echo Installing Playwright browsers...
|
||||||
playwright install chromium
|
playwright install chromium
|
||||||
playwright install-deps chromium
|
playwright install-deps chromium
|
||||||
|
Loading…
Reference in New Issue
Block a user