From 0f73b9661653d13f52551a641e73e783e6cedc01 Mon Sep 17 00:00:00 2001 From: Stephen Smith Date: Sat, 26 Apr 2025 14:07:13 -0400 Subject: [PATCH] first pass at yacy support copied from searxng --- backend/open_webui/config.py | 6 ++ backend/open_webui/main.py | 2 + backend/open_webui/retrieval/web/yacy.py | 95 +++++++++++++++++++ backend/open_webui/routers/retrieval.py | 16 ++++ .../admin/Settings/WebSearch.svelte | 21 ++++ 5 files changed, 140 insertions(+) create mode 100644 backend/open_webui/retrieval/web/yacy.py diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index 3b40977f2..6574f2855 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -2087,6 +2087,12 @@ SEARXNG_QUERY_URL = PersistentConfig( os.getenv("SEARXNG_QUERY_URL", ""), ) +YACY_QUERY_URL = PersistentConfig( + "YACY_QUERY_URL", + "rag.web.search.yacy_query_url", + os.getenv("YACY_QUERY_URL", ""), +) + GOOGLE_PSE_API_KEY = PersistentConfig( "GOOGLE_PSE_API_KEY", "rag.web.search.google_pse_api_key", diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index 56ea17fa1..cb2194f47 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -219,6 +219,7 @@ from open_webui.config import ( SERPAPI_API_KEY, SERPAPI_ENGINE, SEARXNG_QUERY_URL, + YACY_QUERY_URL, SERPER_API_KEY, SERPLY_API_KEY, SERPSTACK_API_KEY, @@ -646,6 +647,7 @@ app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL = ( app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION = ENABLE_GOOGLE_DRIVE_INTEGRATION app.state.config.ENABLE_ONEDRIVE_INTEGRATION = ENABLE_ONEDRIVE_INTEGRATION app.state.config.SEARXNG_QUERY_URL = SEARXNG_QUERY_URL +app.state.config.YACY_QUERY_URL = YACY_QUERY_URL app.state.config.GOOGLE_PSE_API_KEY = GOOGLE_PSE_API_KEY app.state.config.GOOGLE_PSE_ENGINE_ID = GOOGLE_PSE_ENGINE_ID app.state.config.BRAVE_SEARCH_API_KEY = BRAVE_SEARCH_API_KEY diff --git a/backend/open_webui/retrieval/web/yacy.py b/backend/open_webui/retrieval/web/yacy.py new 
def _yacy_ranking(item: dict) -> float:
    """Return ``item["ranking"]`` as a float, or 0.0 when it is absent or non-numeric."""
    try:
        return float(item.get("ranking", 0))
    except (TypeError, ValueError):
        return 0.0


def _yacy_items(payload) -> list[dict]:
    """Return the ``items`` of the first entry in ``payload["channels"]``.

    A missing, null or empty ``channels``/``items`` value yields an empty list
    instead of raising, and entries that are not dicts are dropped.
    """
    if not isinstance(payload, dict):
        return []
    channels = payload.get("channels") or [{}]
    first_channel = channels[0] if isinstance(channels[0], dict) else {}
    items = first_channel.get("items") or []
    return [item for item in items if isinstance(item, dict)]


def search_yacy(
    query_url: str,
    query: str,
    count: int,
    filter_list: Optional[list[str]] = None,
    **kwargs,
) -> list[SearchResult]:
    """Query a YaCy instance and return its hits as ``SearchResult`` objects.

    The request is a GET against ``query_url`` with the parameters
    ``query=<query>``, ``resource=global`` and ``nav=all``. The response body is
    parsed as JSON and the hits are read from ``channels[0].items``.

    Args:
        query_url: URL of the YaCy search endpoint. Any query string already
            present on it is discarded; the search parameters are supplied by
            this function.
        query: The search terms.
        count: Maximum number of results to return.
        filter_list: Optional filter list handed to ``get_filtered_results``;
            skipped when empty or ``None``.

    Keyword Args:
        username: User name for HTTP digest authentication. Defaults to
            ``"admin"``. Pass ``None`` or ``""`` to send the request without
            authentication.
        password: Password for HTTP digest authentication. Defaults to
            ``"yacy"``.
        timeout: Timeout in seconds passed to ``requests.get``. Defaults to 30;
            pass ``None`` to wait indefinitely.

        Any other keyword argument is accepted for call-site compatibility and
        ignored.

    Returns:
        Up to ``count`` results, ordered by descending ``ranking``. Items that
        carry no ``link`` are skipped.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or the server answers with an HTTP error status.
    """
    # NOTE(review): these defaults preserve the credentials that used to be
    # hard-coded here (presumably YaCy's stock admin account - TODO confirm).
    # Callers should pass real credentials from configuration instead.
    username = kwargs.get("username", "admin")
    password = kwargs.get("password", "yacy")
    timeout = kwargs.get("timeout", 30)
    auth = HTTPDigestAuth(username, password or "") if username else None

    # NOTE(review): the result size is not sent to the server, so the server's
    # own page size caps the number of hits available before slicing to `count`.
    params = {
        "query": query,
        "resource": "global",
        "nav": "all",
    }

    # The search parameters are passed via `params`, so drop any query string
    # on the configured URL. The previous guard (`"" in query_url`) was always
    # true, which made this stripping unconditional already.
    query_url = query_url.split("?")[0]

    log.debug("searching %s", query_url)

    response = requests.get(
        query_url,
        auth=auth,
        headers={
            "User-Agent": "Open WebUI (https://github.com/open-webui/open-webui) RAG Bot",
            # NOTE(review): Accept is text/html although the body is parsed as
            # JSON below; kept unchanged - confirm against the YaCy endpoint.
            "Accept": "text/html",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "en-US,en;q=0.5",
            "Connection": "keep-alive",
        },
        params=params,
        # Without a timeout an unresponsive server would block the caller forever.
        timeout=timeout,
    )
    response.raise_for_status()  # Raise an exception for HTTP errors.

    # `ranking` is coerced to a number so that a mix of numeric strings,
    # numbers and missing values cannot break the sort.
    ranked = sorted(_yacy_items(response.json()), key=_yacy_ranking, reverse=True)
    if filter_list:
        ranked = get_filtered_results(ranked, filter_list)

    # Drop link-less items before slicing so they neither raise KeyError nor
    # use up one of the `count` slots.
    linked = [item for item in ranked if item.get("link")]
    return [
        SearchResult(
            link=item["link"],
            title=item.get("title"),
            snippet=item.get("description"),
        )
        for item in linked[:count]
    ]
YACY_QUERY_URL: Optional[str] = None GOOGLE_PSE_API_KEY: Optional[str] = None GOOGLE_PSE_ENGINE_ID: Optional[str] = None BRAVE_SEARCH_API_KEY: Optional[str] = None @@ -651,6 +654,7 @@ async def update_rag_config( form_data.web.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL ) request.app.state.config.SEARXNG_QUERY_URL = form_data.web.SEARXNG_QUERY_URL + request.app.state.config.YACY_QUERY_URL = form_data.web.YACY_QUERY_URL request.app.state.config.GOOGLE_PSE_API_KEY = form_data.web.GOOGLE_PSE_API_KEY request.app.state.config.GOOGLE_PSE_ENGINE_ID = ( form_data.web.GOOGLE_PSE_ENGINE_ID @@ -749,6 +753,7 @@ async def update_rag_config( "WEB_SEARCH_DOMAIN_FILTER_LIST": request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST, "BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL, "SEARXNG_QUERY_URL": request.app.state.config.SEARXNG_QUERY_URL, + "YACY_QUERY_URL": request.app.state.config.YACY_QUERY_URL, "GOOGLE_PSE_API_KEY": request.app.state.config.GOOGLE_PSE_API_KEY, "GOOGLE_PSE_ENGINE_ID": request.app.state.config.GOOGLE_PSE_ENGINE_ID, "BRAVE_SEARCH_API_KEY": request.app.state.config.BRAVE_SEARCH_API_KEY, @@ -1266,6 +1271,7 @@ def search_web(request: Request, engine: str, query: str) -> list[SearchResult]: """Search the web using a search engine and return the results as a list of SearchResult objects. 
Will look for a search engine API key in environment variables in the following order: - SEARXNG_QUERY_URL + - YACY_QUERY_URL - GOOGLE_PSE_API_KEY + GOOGLE_PSE_ENGINE_ID - BRAVE_SEARCH_API_KEY - KAGI_SEARCH_API_KEY @@ -1295,6 +1301,16 @@ def search_web(request: Request, engine: str, query: str) -> list[SearchResult]: ) else: raise Exception("No SEARXNG_QUERY_URL found in environment variables") + elif engine == "yacy": + if request.app.state.config.YACY_QUERY_URL: + return search_yacy( + request.app.state.config.YACY_QUERY_URL, + query, + request.app.state.config.WEB_SEARCH_RESULT_COUNT, + request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST, + ) + else: + raise Exception("No YACY_QUERY_URL found in environment variables") elif engine == "google_pse": if ( request.app.state.config.GOOGLE_PSE_API_KEY diff --git a/src/lib/components/admin/Settings/WebSearch.svelte b/src/lib/components/admin/Settings/WebSearch.svelte index d9771f835..d5ba2f079 100644 --- a/src/lib/components/admin/Settings/WebSearch.svelte +++ b/src/lib/components/admin/Settings/WebSearch.svelte @@ -14,6 +14,7 @@ let webSearchEngines = [ 'searxng', + 'yacy', 'google_pse', 'brave', 'kagi', @@ -143,6 +144,26 @@ + {:else if webConfig.WEB_SEARCH_ENGINE === 'yacy'} +
+
+
+ {$i18n.t('Yacy Query URL')} +
+ +
+
+ +
+
+
+
{:else if webConfig.WEB_SEARCH_ENGINE === 'google_pse'}