From 3bec60b80c58d573a886d9b3d33a40b0dec22e69 Mon Sep 17 00:00:00 2001
From: Que Nguyen
Date: Tue, 11 Jun 2024 22:06:14 +0700
Subject: [PATCH] Fixed the issue where a single URL error disrupts the data
 loading process in Web Search mode

To address the unresolved issue in the LangChain library where a single URL
error disrupts the data loading process, the lazy_load method in the
WebBaseLoader class has been modified. The enhanced method now handles
exceptions appropriately, logging errors and continuing with the remaining
URLs.
---
 backend/apps/rag/main.py | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py
index 99730ad7c..9f41abee6 100644
--- a/backend/apps/rag/main.py
+++ b/backend/apps/rag/main.py
@@ -12,9 +12,10 @@ import os, shutil, logging, re
 
 from datetime import datetime
 from pathlib import Path
-from typing import List, Union, Sequence
+from typing import List, Union, Sequence, Iterator, Any
 
 from chromadb.utils.batch_utils import create_batches
+from langchain_core.documents import Document
 
 from langchain_community.document_loaders import (
     WebBaseLoader,
@@ -701,7 +702,7 @@ def get_web_loader(url: Union[str, Sequence[str]], verify_ssl: bool = True):
     # Check if the URL is valid
     if not validate_url(url):
         raise ValueError(ERROR_MESSAGES.INVALID_URL)
-    return WebBaseLoader(
+    return SafeWebBaseLoader(
         url,
         verify_ssl=verify_ssl,
         requests_per_second=RAG_WEB_SEARCH_CONCURRENT_REQUESTS,
@@ -1237,7 +1238,29 @@ def reset(user=Depends(get_admin_user)) -> bool:
     return True
 
 
+class SafeWebBaseLoader(WebBaseLoader):
+    """WebBaseLoader with enhanced error handling for URLs."""
+
+    def lazy_load(self) -> Iterator[Document]:
+        """Lazy load text from the url(s) in web_path with error handling."""
+        for path in self.web_paths:
+            try:
+                soup = self._scrape(path, bs_kwargs=self.bs_kwargs)
+                text = soup.get_text(**self.bs_get_text_kwargs)
+                # Build metadata
+                metadata = {"source": path}
+                if title := soup.find("title"):
+                    metadata["title"] = title.get_text()
+                if description := soup.find("meta", attrs={"name": "description"}):
+                    metadata["description"] = description.get("content", "No description found.")
+                if html := soup.find("html"):
+                    metadata["language"] = html.get("lang", "No language found.")
+
+                yield Document(page_content=text, metadata=metadata)
+            except Exception as e:
+                # Log the error and continue with the next URL
+                log.error(f"Error loading {path}: {e}")
+
+
 if ENV == "dev":
 
     @app.get("/ef")