Merge pull request #4974 from SearchApi/add-search-api

feat: Add support for SearchApi as alternative to WebSearch
This commit is contained in:
Timothy Jaeryang Baek
2024-08-30 19:42:36 +02:00
committed by GitHub
7 changed files with 504 additions and 2 deletions

View File

@@ -76,6 +76,7 @@ from apps.rag.search.serply import search_serply
from apps.rag.search.duckduckgo import search_duckduckgo
from apps.rag.search.tavily import search_tavily
from apps.rag.search.jina_search import search_jina
from apps.rag.search.searchapi import search_searchapi
from utils.misc import (
calculate_sha256,
@@ -128,6 +129,8 @@ from config import (
SERPER_API_KEY,
SERPLY_API_KEY,
TAVILY_API_KEY,
SEARCHAPI_API_KEY,
SEARCHAPI_ENGINE,
RAG_WEB_SEARCH_RESULT_COUNT,
RAG_WEB_SEARCH_CONCURRENT_REQUESTS,
RAG_EMBEDDING_OPENAI_BATCH_SIZE,
@@ -189,6 +192,8 @@ app.state.config.SERPSTACK_HTTPS = SERPSTACK_HTTPS
app.state.config.SERPER_API_KEY = SERPER_API_KEY
app.state.config.SERPLY_API_KEY = SERPLY_API_KEY
app.state.config.TAVILY_API_KEY = TAVILY_API_KEY
app.state.config.SEARCHAPI_API_KEY = SEARCHAPI_API_KEY
app.state.config.SEARCHAPI_ENGINE = SEARCHAPI_ENGINE
app.state.config.RAG_WEB_SEARCH_RESULT_COUNT = RAG_WEB_SEARCH_RESULT_COUNT
app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS = RAG_WEB_SEARCH_CONCURRENT_REQUESTS
@@ -427,6 +432,8 @@ async def get_rag_config(user=Depends(get_admin_user)):
"serper_api_key": app.state.config.SERPER_API_KEY,
"serply_api_key": app.state.config.SERPLY_API_KEY,
"tavily_api_key": app.state.config.TAVILY_API_KEY,
"searchapi_api_key": app.state.config.SEARCHAPI_API_KEY,
"seaarchapi_engine": app.state.config.SEARCHAPI_ENGINE,
"result_count": app.state.config.RAG_WEB_SEARCH_RESULT_COUNT,
"concurrent_requests": app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS,
},
@@ -466,6 +473,8 @@ class WebSearchConfig(BaseModel):
serper_api_key: Optional[str] = None
serply_api_key: Optional[str] = None
tavily_api_key: Optional[str] = None
searchapi_api_key: Optional[str] = None
searchapi_engine: Optional[str] = None
result_count: Optional[int] = None
concurrent_requests: Optional[int] = None
@@ -529,6 +538,10 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
app.state.config.SERPER_API_KEY = form_data.web.search.serper_api_key
app.state.config.SERPLY_API_KEY = form_data.web.search.serply_api_key
app.state.config.TAVILY_API_KEY = form_data.web.search.tavily_api_key
app.state.config.SEARCHAPI_API_KEY = form_data.web.search.searchapi_api_key
app.state.config.SEARCHAPI_ENGINE = (
form_data.web.search.searchapi_engine
)
app.state.config.RAG_WEB_SEARCH_RESULT_COUNT = form_data.web.search.result_count
app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS = (
form_data.web.search.concurrent_requests
@@ -566,6 +579,8 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
"serpstack_https": app.state.config.SERPSTACK_HTTPS,
"serper_api_key": app.state.config.SERPER_API_KEY,
"serply_api_key": app.state.config.SERPLY_API_KEY,
"serachapi_api_key": app.state.config.SEARCHAPI_API_KEY,
"searchapi_engine": app.state.config.SEARCHAPI_ENGINE,
"tavily_api_key": app.state.config.TAVILY_API_KEY,
"result_count": app.state.config.RAG_WEB_SEARCH_RESULT_COUNT,
"concurrent_requests": app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS,
@@ -817,6 +832,7 @@ def search_web(engine: str, query: str) -> list[SearchResult]:
- SERPER_API_KEY
- SERPLY_API_KEY
- TAVILY_API_KEY
- SEARCHAPI_API_KEY + SEARCHAPI_ENGINE (by default `google`)
Args:
query (str): The query to search for
"""
@@ -904,6 +920,17 @@ def search_web(engine: str, query: str) -> list[SearchResult]:
)
else:
raise Exception("No TAVILY_API_KEY found in environment variables")
elif engine == "searchapi":
if app.state.config.SEARCHAPI_API_KEY:
return search_searchapi(
app.state.config.SEARCHAPI_API_KEY,
app.state.config.SEARCHAPI_ENGINE,
query,
app.state.config.RAG_WEB_SEARCH_RESULT_COUNT,
app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST,
)
else:
raise Exception("No SEARCHAPI_API_KEY found in environment variables")
elif engine == "jina":
return search_jina(query, app.state.config.RAG_WEB_SEARCH_RESULT_COUNT)
else:

View File

@@ -8,7 +8,8 @@ def get_filtered_results(results, filter_list):
return results
filtered_results = []
for result in results:
domain = urlparse(result["url"]).netloc
url = result.get("url") or result.get("link", "")
domain = urlparse(url).netloc
if any(domain.endswith(filtered_domain) for filtered_domain in filter_list):
filtered_results.append(result)
return filtered_results

View File

@@ -0,0 +1,50 @@
import json
import logging
from typing import Optional
import requests
from urllib.parse import urlencode
from apps.rag.search.main import SearchResult, get_filtered_results
from config import SRC_LOG_LEVELS
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
def search_searchapi(
api_key: str, engine: str, query: str, count: int, filter_list: Optional[list[str]] = None
) -> list[SearchResult]:
"""Search using searchapi.io's API and return the results as a list of SearchResult objects.
Args:
api_key (str): A searchapi.io API key
query (str): The query to search for
"""
url = "https://www.searchapi.io/api/v1/search"
engine = engine or "google"
payload = {
"engine": engine,
"q": query,
"api_key": api_key
}
url = f"{url}?{urlencode(payload)}"
response = requests.request("GET", url)
json_response = response.json()
log.info(f"results from searchapi search: {json_response}")
results = sorted(
json_response.get("organic_results", []), key=lambda x: x.get("position", 0)
)
if filter_list:
results = get_filtered_results(results, filter_list)
return [
SearchResult(
link=result["link"],
title=result["title"],
snippet=result["snippet"]
)
for result in results[:count]
]

File diff suppressed because one or more lines are too long