Merge pull request #9314 from roryeckel/validate-rag-urls

Validate URLs returned by search engine
Timothy Jaeryang Baek 2025-02-03 20:26:34 -08:00 committed by GitHub
commit 1b8dc673e7
2 changed files with 17 additions and 4 deletions


@@ -1,3 +1,5 @@
+import validators
+
 from typing import Optional
 from urllib.parse import urlparse
@@ -10,6 +12,8 @@ def get_filtered_results(results, filter_list):
     filtered_results = []
     for result in results:
         url = result.get("url") or result.get("link", "")
+        if not validators.url(url):
+            continue
         domain = urlparse(url).netloc
         if any(domain.endswith(filtered_domain) for filtered_domain in filter_list):
            filtered_results.append(result)
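
For reviewers, a standalone sketch of `get_filtered_results` after this hunk; the sample `results` and `filter_list` values are hypothetical. Previously a malformed `url`/`link` value flowed straight into `urlparse`; now it is skipped before the domain check:

```python
# Standalone sketch mirroring the diff above; the sample data is made up.
import validators
from urllib.parse import urlparse


def get_filtered_results(results, filter_list):
    filtered_results = []
    for result in results:
        url = result.get("url") or result.get("link", "")
        # New guard: drop results whose URL is not a well-formed absolute URL,
        # instead of letting them reach urlparse and the domain check.
        if not validators.url(url):
            continue
        domain = urlparse(url).netloc
        if any(domain.endswith(filtered_domain) for filtered_domain in filter_list):
            filtered_results.append(result)
    return filtered_results


results = [
    {"url": "https://docs.example.com/page"},  # valid and matches the filter
    {"link": "not a url"},                     # malformed -> skipped by the new guard
    {"url": "https://other.org/page"},         # valid but outside the filter
]
print(get_filtered_results(results, ["example.com"]))
# -> [{'url': 'https://docs.example.com/page'}]
```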


@@ -42,6 +42,15 @@ def validate_url(url: Union[str, Sequence[str]]):
     else:
         return False
 
+def safe_validate_urls(url: Sequence[str]) -> Sequence[str]:
+    valid_urls = []
+    for u in url:
+        try:
+            if validate_url(u):
+                valid_urls.append(u)
+        except ValueError:
+            continue
+    return valid_urls
 
 def resolve_hostname(hostname):
     # Get address information
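
The try/except is the point of this helper: `validate_url` can raise `ValueError` for URLs it refuses (the except clause here exists for exactly that path), so a single bad entry used to poison the whole batch. A standalone sketch, with a stub standing in for the real `validate_url`:

```python
# Standalone sketch; this stub validate_url stands in for the project's
# real one, whose assumed failure mode is raising ValueError.
from typing import Sequence


def validate_url(u: str) -> bool:
    if "://" not in u:
        raise ValueError("invalid URL")  # assumed failure mode, mirroring the real function
    return u.startswith(("http://", "https://"))


def safe_validate_urls(url: Sequence[str]) -> Sequence[str]:
    valid_urls = []
    for u in url:
        try:
            if validate_url(u):
                valid_urls.append(u)
        except ValueError:
            continue  # drop the offending URL instead of aborting the batch
    return valid_urls


print(safe_validate_urls(["https://ok.example", "no-scheme", "ftp://skip.me"]))
# -> ['https://ok.example']
```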
@@ -86,11 +95,11 @@ def get_web_loader(
     verify_ssl: bool = True,
     requests_per_second: int = 2,
 ):
-    # Check if the URL is valid
-    if not validate_url(urls):
-        raise ValueError(ERROR_MESSAGES.INVALID_URL)
+    # Check if the URLs are valid
+    safe_urls = safe_validate_urls([urls] if isinstance(urls, str) else urls)
+
     return SafeWebBaseLoader(
-        urls,
+        safe_urls,
         verify_ssl=verify_ssl,
         requests_per_second=requests_per_second,
         continue_on_failure=True,
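
Net behavioral change: `get_web_loader` used to fail the whole request with `ERROR_MESSAGES.INVALID_URL` if any URL was bad; now it drops invalid entries and builds the loader from the survivors, accepting either a single URL string or a sequence. A minimal sketch of that normalization (standalone; `SafeWebBaseLoader` is replaced by a print and `validate_url` is a trivial stub):

```python
# Standalone sketch of the new normalization step.
from typing import Sequence, Union


def validate_url(u: str) -> bool:
    return u.startswith(("http://", "https://"))  # stub for the real check


def safe_validate_urls(url: Sequence[str]) -> Sequence[str]:
    return [u for u in url if validate_url(u)]


def get_web_loader(urls: Union[str, Sequence[str]]):
    # Accept one URL or many; keep only the validated ones.
    safe_urls = safe_validate_urls([urls] if isinstance(urls, str) else urls)
    print("loading:", safe_urls)  # stands in for SafeWebBaseLoader(safe_urls, ...)


get_web_loader("https://example.com")            # loading: ['https://example.com']
get_web_loader(["https://ok.example", "bogus"])  # loading: ['https://ok.example']
```

One trade-off worth noting in review: a batch in which every URL is invalid now yields a loader over an empty list rather than an explicit error.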