Fixed the issue where a single URL error disrupts the data loading process in Web Search mode

To address the unresolved issue in the LangChain library where a single URL error disrupts the data loading process, the lazy_load method in the WebBaseLoader class has been modified. The enhanced method now handles exceptions appropriately, logging errors and continuing with the remaining URLs.
This commit is contained in:
Que Nguyen 2024-06-11 22:06:14 +07:00 committed by GitHub
parent bf1936de34
commit 3bec60b80c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -12,9 +12,10 @@ import os, shutil, logging, re
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
from typing import List, Union, Sequence from typing import List, Union, Sequence, Iterator, Any
from chromadb.utils.batch_utils import create_batches from chromadb.utils.batch_utils import create_batches
from langchain_core.documents import Document
from langchain_community.document_loaders import ( from langchain_community.document_loaders import (
WebBaseLoader, WebBaseLoader,
@ -701,7 +702,7 @@ def get_web_loader(url: Union[str, Sequence[str]], verify_ssl: bool = True):
# Check if the URL is valid # Check if the URL is valid
if not validate_url(url): if not validate_url(url):
raise ValueError(ERROR_MESSAGES.INVALID_URL) raise ValueError(ERROR_MESSAGES.INVALID_URL)
return WebBaseLoader( return SafeWebBaseLoader(
url, url,
verify_ssl=verify_ssl, verify_ssl=verify_ssl,
requests_per_second=RAG_WEB_SEARCH_CONCURRENT_REQUESTS, requests_per_second=RAG_WEB_SEARCH_CONCURRENT_REQUESTS,
@ -1237,6 +1238,28 @@ def reset(user=Depends(get_admin_user)) -> bool:
return True return True
class SafeWebBaseLoader(WebBaseLoader):
"""WebBaseLoader with enhanced error handling for URLs."""
def lazy_load(self) -> Iterator[Document]:
"""Lazy load text from the url(s) in web_path with error handling."""
for path in self.web_paths:
try:
soup = self._scrape(path, bs_kwargs=self.bs_kwargs)
text = soup.get_text(**self.bs_get_text_kwargs)
# Build metadata
metadata = {"source": path}
if title := soup.find("title"):
metadata["title"] = title.get_text()
if description := soup.find("meta", attrs={"name": "description"}):
metadata["description"] = description.get("content", "No description found.")
if html := soup.find("html"):
metadata["language"] = html.get("lang", "No language found.")
yield Document(page_content=text, metadata=metadata)
except Exception as e:
# Log the error and continue with the next URL
log.error(f"Error loading {path}: {e}")
if ENV == "dev": if ENV == "dev":