Refine RAG_WEB_LOADER

This commit is contained in:
Rory 2025-01-30 20:31:31 -06:00
parent 8dafe3cba8
commit 2452e271cd
3 changed files with 19 additions and 19 deletions

View File

@@ -5,6 +5,7 @@ import ssl
 import urllib.parse
 import certifi
 import validators
+from collections import defaultdict
 from typing import AsyncIterator, Dict, List, Optional, Union, Sequence, Iterator
 
 from langchain_community.document_loaders import (
@@ -211,28 +212,27 @@ class SafeWebBaseLoader(WebBaseLoader):
                 # Log the error and continue with the next URL
                 log.error(f"Error loading {path}: {e}")
 
+RAG_WEB_LOADERS = defaultdict(lambda: SafeWebBaseLoader)
+RAG_WEB_LOADERS["playwright"] = SafePlaywrightURLLoader
+RAG_WEB_LOADERS["safe_web"] = SafeWebBaseLoader
 
 def get_web_loader(
     urls: Union[str, Sequence[str]],
     verify_ssl: bool = True,
     requests_per_second: int = 2,
 ):
-    # Check if the URL is valid
+    # Check if the URLs are valid
     safe_urls = safe_validate_urls([urls] if isinstance(urls, str) else urls)
 
-    if RAG_WEB_LOADER.value == "chromium":
-        log.info("Using SafePlaywrightURLLoader")
-        return SafePlaywrightURLLoader(
-            safe_urls,
-            verify_ssl=verify_ssl,
-            requests_per_second=requests_per_second,
-            continue_on_failure=True,
-        )
-    else:
-        log.info("Using SafeWebBaseLoader")
-        return SafeWebBaseLoader(
-            safe_urls,
-            verify_ssl=verify_ssl,
-            requests_per_second=requests_per_second,
-            continue_on_failure=True,
-        )
+    # Get the appropriate WebLoader based on the configuration
+    WebLoaderClass = RAG_WEB_LOADERS[RAG_WEB_LOADER.value]
+    web_loader = WebLoaderClass(
+        safe_urls,
+        verify_ssl=verify_ssl,
+        requests_per_second=requests_per_second,
+        continue_on_failure=True,
+    )
+
+    log.debug("Using RAG_WEB_LOADER %s for %s URLs", web_loader.__class__.__name__, len(safe_urls))
+
+    return web_loader

View File

@@ -4,7 +4,7 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 cd "$SCRIPT_DIR" || exit
 
 # Add conditional Playwright browser installation
-if [[ "${RAG_WEB_LOADER,,}" == "chromium" ]]; then
+if [[ "${RAG_WEB_LOADER,,}" == "playwright" ]]; then
     echo "Installing Playwright browsers..."
     playwright install chromium
     playwright install-deps chromium

View File

@@ -7,7 +7,7 @@ SET "SCRIPT_DIR=%~dp0"
 cd /d "%SCRIPT_DIR%" || exit /b
 
 :: Add conditional Playwright browser installation
-IF /I "%RAG_WEB_LOADER%" == "chromium" (
+IF /I "%RAG_WEB_LOADER%" == "playwright" (
     echo Installing Playwright browsers...
     playwright install chromium
     playwright install-deps chromium