From 3db6b4352fdb75d469fc58e272d2de2e0b071b5c Mon Sep 17 00:00:00 2001
From: Rory <16675082+roryeckel@users.noreply.github.com>
Date: Mon, 3 Feb 2025 18:18:49 -0600
Subject: [PATCH] fix: Filter out invalid RAG web URLs (continued)

---
 backend/open_webui/retrieval/web/utils.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/backend/open_webui/retrieval/web/utils.py b/backend/open_webui/retrieval/web/utils.py
index a322bbbfc..d91c7da15 100644
--- a/backend/open_webui/retrieval/web/utils.py
+++ b/backend/open_webui/retrieval/web/utils.py
@@ -42,6 +42,15 @@ def validate_url(url: Union[str, Sequence[str]]):
     else:
         return False
 
+def safe_validate_urls(url: Sequence[str]) -> Sequence[str]:
+    valid_urls = []
+    for u in url:
+        try:
+            if validate_url(u):
+                valid_urls.append(u)
+        except ValueError:
+            continue
+    return valid_urls
 
 def resolve_hostname(hostname):
     # Get address information
@@ -86,11 +95,11 @@ def get_web_loader(
     verify_ssl: bool = True,
     requests_per_second: int = 2,
 ):
-    # Check if the URL is valid
-    if not validate_url(urls):
-        raise ValueError(ERROR_MESSAGES.INVALID_URL)
+    # Check if the URLs are valid
+    safe_urls = safe_validate_urls([urls] if isinstance(urls, str) else urls)
+
     return SafeWebBaseLoader(
-        urls,
+        safe_urls,
         verify_ssl=verify_ssl,
         requests_per_second=requests_per_second,
         continue_on_failure=True,