diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index 0ac92bd23..a99f0ef92 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -1970,6 +1970,12 @@ BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL = PersistentConfig( os.getenv("BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL", "False").lower() == "true", ) +BYPASS_WEB_SEARCH_RESULT_LINK_SCRAPE = PersistentConfig( + "BYPASS_WEB_SEARCH_RESULT_LINK_SCRAPE", + "rag.web.search.bypass_result_link_scrape", + os.getenv("BYPASS_WEB_SEARCH_RESULT_LINK_SCRAPE", "False").lower() == "true", +) + # You can provide a list of your own websites to filter after performing a web search. # This ensures the highest level of safety and reliability of the information sources. RAG_WEB_SEARCH_DOMAIN_FILTER_LIST = PersistentConfig( diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index bb78d9003..fc02780de 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -201,6 +201,7 @@ from open_webui.config import ( # Retrieval (Web Search) RAG_WEB_SEARCH_ENGINE, BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL, + BYPASS_WEB_SEARCH_RESULT_LINK_SCRAPE, RAG_WEB_SEARCH_RESULT_COUNT, RAG_WEB_SEARCH_CONCURRENT_REQUESTS, RAG_WEB_SEARCH_TRUST_ENV, @@ -612,6 +613,9 @@ app.state.config.RAG_WEB_SEARCH_ENGINE = RAG_WEB_SEARCH_ENGINE app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL = ( BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL ) +app.state.config.BYPASS_WEB_SEARCH_RESULT_LINK_SCRAPE = ( + BYPASS_WEB_SEARCH_RESULT_LINK_SCRAPE +) app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST = RAG_WEB_SEARCH_DOMAIN_FILTER_LIST app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION = ENABLE_GOOGLE_DRIVE_INTEGRATION diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index 2bd908606..623d99d69 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -382,6 +382,7 @@ async def get_rag_config(request: Request, 
user=Depends(get_admin_user)): "web": { "ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION": request.app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION, "BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL, + "BYPASS_WEB_SEARCH_RESULT_LINK_SCRAPE": request.app.state.config.BYPASS_WEB_SEARCH_RESULT_LINK_SCRAPE, "search": { "enabled": request.app.state.config.ENABLE_RAG_WEB_SEARCH, "drive": request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION, @@ -480,6 +481,7 @@ class WebConfig(BaseModel): search: WebSearchConfig ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION: Optional[bool] = None BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL: Optional[bool] = None + BYPASS_WEB_SEARCH_RESULT_LINK_SCRAPE: Optional[bool] = None class ConfigUpdateForm(BaseModel): @@ -577,6 +579,10 @@ async def update_rag_config( form_data.web.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL ) + request.app.state.config.BYPASS_WEB_SEARCH_RESULT_LINK_SCRAPE = ( + form_data.web.BYPASS_WEB_SEARCH_RESULT_LINK_SCRAPE + ) + request.app.state.config.SEARXNG_QUERY_URL = ( form_data.web.search.searxng_query_url ) @@ -1452,13 +1458,28 @@ async def process_web_search( ] urls = [result.link for result in web_results] - loader = get_web_loader( - urls, - verify_ssl=request.app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION, - requests_per_second=request.app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS, - trust_env=request.app.state.config.RAG_WEB_SEARCH_TRUST_ENV, - ) - docs = await loader.aload() + if request.app.state.config.BYPASS_WEB_SEARCH_RESULT_LINK_SCRAPE: + docs: List[Document] = [ + Document( + page_content=result.snippet, + metadata={ + "source": result.link, + "title": ( + result.title if result.title is not None else result.link + ), + }, + ) + for result in web_results + if result.snippet is not None and result.snippet != "" + ] + else: + loader = get_web_loader( + urls, + verify_ssl=request.app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION, + 
requests_per_second=request.app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS, + trust_env=request.app.state.config.RAG_WEB_SEARCH_TRUST_ENV, + ) + docs = await loader.aload() if request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL: return { diff --git a/backend/open_webui/test/apps/webui/routers/test_retrieval.py b/backend/open_webui/test/apps/webui/routers/test_retrieval.py new file mode 100644 index 000000000..a8ce3b329 --- /dev/null +++ b/backend/open_webui/test/apps/webui/routers/test_retrieval.py @@ -0,0 +1,115 @@ +from test.util.abstract_integration_test import AbstractPostgresTest +from test.util.mock_user import mock_webui_user +from unittest.mock import patch, MagicMock + +from langchain_core.documents import Document + + +class TestRetrieval(AbstractPostgresTest): + BASE_PATH = "/api/v1/retrieval" + + def setup_class(cls): + super().setup_class() + from open_webui.retrieval.web.main import SearchResult + + cls.searchresult = SearchResult + + @patch("open_webui.routers.retrieval.search_web") + @patch("open_webui.routers.retrieval.get_web_loader") + @patch("open_webui.routers.retrieval.get_config") + @patch("open_webui.routers.retrieval.run_in_threadpool") + def test_process_web_search_bypass_scrape( + self, + mock_run_in_threadpool, + mock_get_config, + mock_get_web_loader, + mock_search_web, + ): + # Setup mocks + mock_search_results = [ + self.searchresult( + link="https://example.com/1", + title="Example 1", + snippet="Example snippet 1", + ), + self.searchresult( + link="https://example.com/2", title=None, snippet="Example snippet 2" + ), + ] + mock_search_web.return_value = mock_search_results + + mock_config = MagicMock() + mock_config.BYPASS_WEB_SEARCH_RESULT_LINK_SCRAPE = True + mock_get_config.return_value = mock_config + + mock_run_in_threadpool.return_value = True + + # Execute function + with mock_webui_user(id="2"): + response = self.fast_api_client.post( + self.create_url("/process/web/search"), + json={ + "query": "test 
query", + "collection_name": "test_collection", + }, + ) + + # Assertions + assert response.status_code == 200 + result = response.json() + assert not mock_get_web_loader.called + assert result["status"] is True + assert result["collection_name"] == "test_collection" + assert result["loaded_count"] == 2 + assert result["docs"][0]["page_content"] == "Example snippet 1" + assert result["docs"][1]["metadata"]["title"] == "https://example.com/2" + + @patch("open_webui.routers.retrieval.search_web") + @patch("open_webui.routers.retrieval.get_web_loader") + @patch("open_webui.routers.retrieval.get_config") + @patch("open_webui.routers.retrieval.run_in_threadpool") + def test_process_web_search_with_scrape( + self, + mock_run_in_threadpool, + mock_get_config, + mock_get_web_loader, + mock_search_web, + ): + # Setup mocks + mock_search_results = [ + self.searchresult( + link="https://example.com/1", + title="Example 1", + snippet="Example snippet 1", + ), + ] + mock_search_web.return_value = mock_search_results + + mock_config = MagicMock() + mock_config.BYPASS_WEB_SEARCH_RESULT_LINK_SCRAPE = False + mock_get_config.return_value = mock_config + + mock_loader = MagicMock() + + # process_web_search awaits loader.aload(), so the stub must be awaitable + async def _aload(): + return [Document(page_content="Web page content")] + + mock_loader.aload = _aload + mock_get_web_loader.return_value = mock_loader + + mock_run_in_threadpool.return_value = True + + # Execute function + with mock_webui_user(id="2"): + response = self.fast_api_client.post( + self.create_url("/process/web/search"), + json={ + "query": "test query", + "collection_name": "test_collection", + }, + ) + + # Assertions + assert response.status_code == 200 + result = response.json() + assert mock_get_web_loader.called + assert result["status"] is True + assert result["collection_name"] == "test_collection" + assert result["loaded_count"] == 1 + assert result["docs"][0]["page_content"] == "Web page content" diff --git a/src/lib/components/admin/Settings/WebSearch.svelte b/src/lib/components/admin/Settings/WebSearch.svelte index
cec4cabe6..b565a9628 100644 --- a/src/lib/components/admin/Settings/WebSearch.svelte +++ b/src/lib/components/admin/Settings/WebSearch.svelte @@ -474,6 +474,23 @@ +
+
+ + {$i18n.t('Bypass Scrape Links of Web Search Result')} + +
+
+ + + +
+
+
{$i18n.t('Trust Proxy Environment')}