diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index 1e265f2ce..88fde8663 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -1865,6 +1865,12 @@ BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL = PersistentConfig( os.getenv("BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL", "False").lower() == "true", ) +BYPASS_WEB_SEARCH_RESULT_LINK_SCRAPE = PersistentConfig( + "BYPASS_WEB_SEARCH_RESULT_LINK_SCRAPE", + "rag.web.search.bypass_result_link_scrape", + os.getenv("BYPASS_WEB_SEARCH_RESULT_LINK_SCRAPE", "False").lower() == "true", +) + # You can provide a list of your own websites to filter after performing a web search. # This ensures the highest level of safety and reliability of the information sources. RAG_WEB_SEARCH_DOMAIN_FILTER_LIST = PersistentConfig( diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index 416460837..2cbdbb7f3 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -197,6 +197,7 @@ from open_webui.config import ( # Retrieval (Web Search) RAG_WEB_SEARCH_ENGINE, BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL, + BYPASS_WEB_SEARCH_RESULT_LINK_SCRAPE, RAG_WEB_SEARCH_RESULT_COUNT, RAG_WEB_SEARCH_CONCURRENT_REQUESTS, RAG_WEB_SEARCH_TRUST_ENV, @@ -581,6 +582,9 @@ app.state.config.RAG_WEB_SEARCH_ENGINE = RAG_WEB_SEARCH_ENGINE app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL = ( BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL ) +app.state.config.BYPASS_WEB_SEARCH_RESULT_LINK_SCRAPE = ( + BYPASS_WEB_SEARCH_RESULT_LINK_SCRAPE +) app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST = RAG_WEB_SEARCH_DOMAIN_FILTER_LIST app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION = ENABLE_GOOGLE_DRIVE_INTEGRATION diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index ac38c236e..b874bebd2 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -380,6 +380,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)): "web": { "ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION": request.app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION, "BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL, + "BYPASS_WEB_SEARCH_RESULT_LINK_SCRAPE": request.app.state.config.BYPASS_WEB_SEARCH_RESULT_LINK_SCRAPE, "search": { "enabled": request.app.state.config.ENABLE_RAG_WEB_SEARCH, "drive": request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION, @@ -477,6 +478,7 @@ class WebConfig(BaseModel): search: WebSearchConfig ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION: Optional[bool] = None BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL: Optional[bool] = None + BYPASS_WEB_SEARCH_RESULT_LINK_SCRAPE: Optional[bool] = None class ConfigUpdateForm(BaseModel): @@ -571,6 +573,10 @@ async def update_rag_config( form_data.web.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL ) + request.app.state.config.BYPASS_WEB_SEARCH_RESULT_LINK_SCRAPE = ( + form_data.web.BYPASS_WEB_SEARCH_RESULT_LINK_SCRAPE + ) + request.app.state.config.SEARXNG_QUERY_URL = ( form_data.web.search.searxng_query_url ) @@ -1438,13 +1444,28 @@ async def process_web_search( ] urls = [result.link for result in web_results] - loader = get_web_loader( - urls, - verify_ssl=request.app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION, - requests_per_second=request.app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS, - trust_env=request.app.state.config.RAG_WEB_SEARCH_TRUST_ENV, - ) - docs = await loader.aload() + if request.app.state.config.BYPASS_WEB_SEARCH_RESULT_LINK_SCRAPE: + docs: List[Document] = [ + Document( + page_content=result.snippet, + metadata={ + "source": result.link, + "title": ( + result.title if result.title is not None else result.link + ), + }, + ) + for result in web_results + if result.snippet is not None and result.snippet != "" + ] + else: + loader = get_web_loader( + urls, + verify_ssl=request.app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION, + requests_per_second=request.app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS, + trust_env=request.app.state.config.RAG_WEB_SEARCH_TRUST_ENV, + ) + docs = await loader.aload() if request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL: return { diff --git a/src/lib/components/admin/Settings/WebSearch.svelte b/src/lib/components/admin/Settings/WebSearch.svelte index 0ae56cae3..3b9a2ba9e 100644 --- a/src/lib/components/admin/Settings/WebSearch.svelte +++ b/src/lib/components/admin/Settings/WebSearch.svelte @@ -470,6 +470,23 @@ +
+
+ + {$i18n.t('Bypass Scrape Links of Web Search Result')} + +
+
+ + + +
+
+
{$i18n.t('Trust Proxy Environment')}