From 839ba22c90991dc4d72810aa6f6a527664db80c2 Mon Sep 17 00:00:00 2001 From: tth37 Date: Mon, 14 Apr 2025 01:49:05 +0800 Subject: [PATCH] feat: Backend for Self-Hosted/External Web Search/Loader Engines --- backend/open_webui/config.py | 23 ++++++++ backend/open_webui/main.py | 8 +++ .../open_webui/retrieval/loaders/external.py | 56 +++++++++++++++++++ backend/open_webui/retrieval/web/external.py | 45 +++++++++++++++ backend/open_webui/retrieval/web/utils.py | 8 +++ backend/open_webui/routers/retrieval.py | 9 +++ 6 files changed, 149 insertions(+) create mode 100644 backend/open_webui/retrieval/loaders/external.py create mode 100644 backend/open_webui/retrieval/web/external.py diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index bd822d06d..434b6403f 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -2236,6 +2236,29 @@ FIRECRAWL_API_BASE_URL = PersistentConfig( os.environ.get("FIRECRAWL_API_BASE_URL", "https://api.firecrawl.dev"), ) +EXTERNAL_WEB_SEARCH_URL = PersistentConfig( + "EXTERNAL_WEB_SEARCH_URL", + "rag.web.search.external_web_search_url", + os.environ.get("EXTERNAL_WEB_SEARCH_URL", ""), +) + +EXTERNAL_WEB_SEARCH_API_KEY = PersistentConfig( + "EXTERNAL_WEB_SEARCH_API_KEY", + "rag.web.search.external_web_search_api_key", + os.environ.get("EXTERNAL_WEB_SEARCH_API_KEY", ""), +) + +EXTERNAL_WEB_LOADER_URL = PersistentConfig( + "EXTERNAL_WEB_LOADER_URL", + "rag.web.loader.external_web_loader_url", + os.environ.get("EXTERNAL_WEB_LOADER_URL", ""), +) + +EXTERNAL_WEB_LOADER_API_KEY = PersistentConfig( + "EXTERNAL_WEB_LOADER_API_KEY", + "rag.web.loader.external_web_loader_api_key", + os.environ.get("EXTERNAL_WEB_LOADER_API_KEY", ""), +) #################################### # Images diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index 1d1efc5df..b18313f5c 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -245,6 +245,10 @@ from open_webui.config import ( ENABLE_GOOGLE_DRIVE_INTEGRATION, ENABLE_ONEDRIVE_INTEGRATION, UPLOAD_DIR, + EXTERNAL_WEB_SEARCH_URL, + EXTERNAL_WEB_SEARCH_API_KEY, + EXTERNAL_WEB_LOADER_URL, + EXTERNAL_WEB_LOADER_API_KEY, # WebUI WEBUI_AUTH, WEBUI_NAME, @@ -667,6 +671,10 @@ app.state.config.EXA_API_KEY = EXA_API_KEY app.state.config.PERPLEXITY_API_KEY = PERPLEXITY_API_KEY app.state.config.SOUGOU_API_SID = SOUGOU_API_SID app.state.config.SOUGOU_API_SK = SOUGOU_API_SK +app.state.config.EXTERNAL_WEB_SEARCH_URL = EXTERNAL_WEB_SEARCH_URL +app.state.config.EXTERNAL_WEB_SEARCH_API_KEY = EXTERNAL_WEB_SEARCH_API_KEY +app.state.config.EXTERNAL_WEB_LOADER_URL = EXTERNAL_WEB_LOADER_URL +app.state.config.EXTERNAL_WEB_LOADER_API_KEY = EXTERNAL_WEB_LOADER_API_KEY app.state.config.PLAYWRIGHT_WS_URL = PLAYWRIGHT_WS_URL diff --git a/backend/open_webui/retrieval/loaders/external.py b/backend/open_webui/retrieval/loaders/external.py new file mode 100644 index 000000000..8ff2e617e --- /dev/null +++ b/backend/open_webui/retrieval/loaders/external.py @@ -0,0 +1,56 @@ +import requests +import logging +from typing import Iterator, List, Union + +from langchain_core.document_loaders import BaseLoader +from langchain_core.documents import Document +from open_webui.env import SRC_LOG_LEVELS + +log = logging.getLogger(__name__) +log.setLevel(SRC_LOG_LEVELS["RAG"]) + + +class ExternalLoader(BaseLoader): + def __init__( + self, + web_paths: Union[str, List[str]], + external_url: str, + external_api_key: str, + continue_on_failure: bool = True, + **kwargs, + ) -> None: + if not web_paths: + raise ValueError("At least one URL must be provided.") + + self.external_url = external_url + self.external_api_key = external_api_key + self.urls = web_paths if isinstance(web_paths, list) else [web_paths] + self.continue_on_failure = continue_on_failure + + def lazy_load(self) -> Iterator[Document]: + batch_size = 20 + for i in range(0, len(self.urls), batch_size): + urls = self.urls[i : i + batch_size] + try: + response = requests.get( + self.external_url, + headers={ + "User-Agent": "Open WebUI (https://github.com/open-webui/open-webui) RAG Bot", + "Authorization": f"Bearer {self.external_api_key}", + }, + params={ + "urls": urls, + } + ) + response.raise_for_status() + results = response.json() + for result in results: + yield Document( + page_content=result.get("page_content", ""), + metadata=result.get("metadata", {}), + ) + except Exception as e: + if self.continue_on_failure: + log.error(f"Error extracting content from batch {urls}: {e}") + else: + raise e diff --git a/backend/open_webui/retrieval/web/external.py b/backend/open_webui/retrieval/web/external.py new file mode 100644 index 000000000..29e914f84 --- /dev/null +++ b/backend/open_webui/retrieval/web/external.py @@ -0,0 +1,45 @@ +import logging +from typing import Optional, List + +import requests +from open_webui.retrieval.web.main import SearchResult, get_filtered_results +from open_webui.env import SRC_LOG_LEVELS + +log = logging.getLogger(__name__) +log.setLevel(SRC_LOG_LEVELS["RAG"]) + + +def search_external( + external_url: str, + external_api_key: str, + query: str, + count: int, + filter_list: Optional[List[str]] = None, +) -> List[SearchResult]: + try: + response = requests.get( + external_url, + headers={ + "User-Agent": "Open WebUI (https://github.com/open-webui/open-webui) RAG Bot", + "Authorization": f"Bearer {external_api_key}", + }, + params={ + "query": query, + "count": count, + } + ) + response.raise_for_status() + results = response.json() + if filter_list: + results = get_filtered_results(results, filter_list) + return [ + SearchResult( + link=result.get("link"), + title=result.get("title"), + snippet=result.get("snippet"), + ) + for result in results[:count] + ] + except Exception as e: + log.error(f"Error in External search: {e}") + return [] diff --git a/backend/open_webui/retrieval/web/utils.py b/backend/open_webui/retrieval/web/utils.py index 718cfe52f..fc46d78c4 100644 --- a/backend/open_webui/retrieval/web/utils.py +++ b/backend/open_webui/retrieval/web/utils.py @@ -25,6 +25,7 @@ from langchain_community.document_loaders.firecrawl import FireCrawlLoader from langchain_community.document_loaders.base import BaseLoader from langchain_core.documents import Document from open_webui.retrieval.loaders.tavily import TavilyLoader +from open_webui.retrieval.loaders.external import ExternalLoader from open_webui.constants import ERROR_MESSAGES from open_webui.config import ( ENABLE_RAG_LOCAL_WEB_FETCH, @@ -35,6 +36,8 @@ from open_webui.config import ( FIRECRAWL_API_KEY, TAVILY_API_KEY, TAVILY_EXTRACT_DEPTH, + EXTERNAL_WEB_LOADER_URL, + EXTERNAL_WEB_LOADER_API_KEY, ) from open_webui.env import SRC_LOG_LEVELS @@ -619,6 +622,11 @@ def get_web_loader( web_loader_args["api_key"] = TAVILY_API_KEY.value web_loader_args["extract_depth"] = TAVILY_EXTRACT_DEPTH.value + if WEB_LOADER_ENGINE.value == "external": + WebLoaderClass = ExternalLoader + web_loader_args["external_url"] = EXTERNAL_WEB_LOADER_URL.value + web_loader_args["external_api_key"] = EXTERNAL_WEB_LOADER_API_KEY.value + if WebLoaderClass: web_loader = WebLoaderClass(**web_loader_args) diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index 8f89351ac..72d10245d 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -61,6 +61,7 @@ from open_webui.retrieval.web.bing import search_bing from open_webui.retrieval.web.exa import search_exa from open_webui.retrieval.web.perplexity import search_perplexity from open_webui.retrieval.web.sougou import search_sougou +from open_webui.retrieval.web.external import search_external from open_webui.retrieval.utils import ( get_embedding_function, @@ -1465,6 +1466,14 @@ def search_web(request: Request, engine: str, query: str) -> list[SearchResult]: raise Exception( "No SOUGOU_API_SID or SOUGOU_API_SK found in environment variables" ) + elif engine == "external": + return search_external( + request.app.state.config.EXTERNAL_WEB_SEARCH_URL, + request.app.state.config.EXTERNAL_WEB_SEARCH_API_KEY, + query, + request.app.state.config.WEB_SEARCH_RESULT_COUNT, + request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST, + ) else: raise Exception("No search engine API key found in environment variables")