feat: prototype frontend web search integration

Jun Siang Cheah
2024-05-11 23:12:52 +08:00
parent 619c2f9c71
commit 2660a6e5b8
11 changed files with 305 additions and 18 deletions

View File

@@ -93,6 +93,7 @@ from config import (
     CHUNK_OVERLAP,
     RAG_TEMPLATE,
     ENABLE_RAG_LOCAL_WEB_FETCH,
+    RAG_WEB_SEARCH_CONCURRENT_REQUESTS,
 )
 from constants import ERROR_MESSAGES
@@ -538,18 +539,23 @@ def store_web(form_data: UrlForm, user=Depends(get_current_user)):
             detail=ERROR_MESSAGES.DEFAULT(e),
         )
 
 
 def get_web_loader(url: Union[str, Sequence[str]], verify_ssl: bool = True):
     # Check if the URL is valid
     if not validate_url(url):
         raise ValueError(ERROR_MESSAGES.INVALID_URL)
-    return WebBaseLoader(url, verify_ssl=verify_ssl)
+    return WebBaseLoader(
+        url,
+        verify_ssl=verify_ssl,
+        requests_per_second=RAG_WEB_SEARCH_CONCURRENT_REQUESTS,
+    )
 
 
 def validate_url(url: Union[str, Sequence[str]]):
     if isinstance(url, str):
         if isinstance(validators.url(url), validators.ValidationError):
             raise ValueError(ERROR_MESSAGES.INVALID_URL)
-        if not ENABLE_LOCAL_WEB_FETCH:
+        if not ENABLE_RAG_LOCAL_WEB_FETCH:
             # Local web fetch is disabled, filter out any URLs that resolve to private IP addresses
             parsed_url = urllib.parse.urlparse(url)
             # Get IPv4 and IPv6 addresses
@@ -593,7 +599,7 @@ def store_websearch(form_data: SearchForm, user=Depends(get_current_user)):
         )
     urls = [result.link for result in web_results]
     loader = get_web_loader(urls)
-    data = loader.load()
+    data = loader.aload()
 
     collection_name = form_data.collection_name
     if collection_name == "":

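Note: the loader change above means every batch of search-result URLs is now fetched through LangChain's WebBaseLoader with a concurrency throttle. The following is a minimal sketch, not the app's code, of how the two new pieces interact; it assumes the langchain_community package (and its bs4 dependency) is installed, and the hard-coded URLs and limit of 10 merely stand in for the configured RAG_WEB_SEARCH_CONCURRENT_REQUESTS.

# Minimal sketch only: a throttled WebBaseLoader like the one get_web_loader now builds.
from langchain_community.document_loaders import WebBaseLoader

urls = ["https://example.com", "https://example.org"]  # placeholder URLs

loader = WebBaseLoader(
    urls,
    verify_ssl=True,
    requests_per_second=10,  # cap on how many requests are issued per second
)

# aload() fetches the pages concurrently (load() walks them one by one); in the
# LangChain release this commit targets it returns the documents directly, while
# newer releases make aload() a coroutine that must be awaited.
docs = loader.aload()
for doc in docs:
    print(doc.metadata.get("source"), len(doc.page_content))
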
View File

@@ -3,7 +3,7 @@ import logging
 import requests
 
 from apps.rag.search.main import SearchResult
-from config import SRC_LOG_LEVELS, WEB_SEARCH_RESULT_COUNT
+from config import SRC_LOG_LEVELS, RAG_WEB_SEARCH_RESULT_COUNT
 
 log = logging.getLogger(__name__)
 log.setLevel(SRC_LOG_LEVELS["RAG"])
@@ -22,7 +22,7 @@ def search_brave(api_key: str, query: str) -> list[SearchResult]:
"Accept-Encoding": "gzip",
"X-Subscription-Token": api_key,
}
params = {"q": query, "count": WEB_SEARCH_RESULT_COUNT}
params = {"q": query, "count": RAG_WEB_SEARCH_RESULT_COUNT}
response = requests.get(url, headers=headers, params=params)
response.raise_for_status()
@@ -33,5 +33,5 @@ def search_brave(api_key: str, query: str) -> list[SearchResult]:
         SearchResult(
             link=result["url"], title=result.get("title"), snippet=result.get("snippet")
         )
-        for result in results[:WEB_SEARCH_RESULT_COUNT]
+        for result in results[:RAG_WEB_SEARCH_RESULT_COUNT]
     ]

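All five provider modules touched by this commit follow the same pattern: query the engine, wrap each hit in a SearchResult, and cap the list at the renamed RAG_WEB_SEARCH_RESULT_COUNT. Below is a self-contained sketch of that capping pattern; the SearchResult class here is only a stand-in mirroring apps.rag.search.main, and raw_results is fabricated placeholder data.

# Illustrative only: how the result-count cap is applied in each provider.
from typing import Optional
from pydantic import BaseModel

RAG_WEB_SEARCH_RESULT_COUNT = 10  # in the app this comes from config.py


class SearchResult(BaseModel):  # stand-in for apps.rag.search.main.SearchResult
    link: str
    title: Optional[str] = None
    snippet: Optional[str] = None


raw_results = [
    {"url": f"https://example.com/{i}", "title": f"Result {i}", "snippet": "..."}
    for i in range(25)
]

results = [
    SearchResult(link=r["url"], title=r.get("title"), snippet=r.get("snippet"))
    for r in raw_results[:RAG_WEB_SEARCH_RESULT_COUNT]
]
print(len(results))  # never more than RAG_WEB_SEARCH_RESULT_COUNT entries
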
View File

@@ -4,7 +4,7 @@ import logging
 import requests
 
 from apps.rag.search.main import SearchResult
-from config import SRC_LOG_LEVELS, WEB_SEARCH_RESULT_COUNT
+from config import SRC_LOG_LEVELS, RAG_WEB_SEARCH_RESULT_COUNT
 
 log = logging.getLogger(__name__)
 log.setLevel(SRC_LOG_LEVELS["RAG"])
@@ -27,7 +27,7 @@ def search_google_pse(
"cx": search_engine_id,
"q": query,
"key": api_key,
"num": WEB_SEARCH_RESULT_COUNT,
"num": RAG_WEB_SEARCH_RESULT_COUNT,
}
response = requests.request("GET", url, headers=headers, params=params)

View File

@@ -3,7 +3,7 @@ import logging
 import requests
 
 from apps.rag.search.main import SearchResult
-from config import SRC_LOG_LEVELS, WEB_SEARCH_RESULT_COUNT
+from config import SRC_LOG_LEVELS, RAG_WEB_SEARCH_RESULT_COUNT
 
 log = logging.getLogger(__name__)
 log.setLevel(SRC_LOG_LEVELS["RAG"])
@@ -40,5 +40,5 @@ def search_searxng(query_url: str, query: str) -> list[SearchResult]:
         SearchResult(
             link=result["url"], title=result.get("title"), snippet=result.get("content")
         )
-        for result in sorted_results[:WEB_SEARCH_RESULT_COUNT]
+        for result in sorted_results[:RAG_WEB_SEARCH_RESULT_COUNT]
     ]

View File

@@ -4,7 +4,7 @@ import logging
 import requests
 
 from apps.rag.search.main import SearchResult
-from config import SRC_LOG_LEVELS, WEB_SEARCH_RESULT_COUNT
+from config import SRC_LOG_LEVELS, RAG_WEB_SEARCH_RESULT_COUNT
 
 log = logging.getLogger(__name__)
 log.setLevel(SRC_LOG_LEVELS["RAG"])
@@ -35,5 +35,5 @@ def search_serper(api_key: str, query: str) -> list[SearchResult]:
             title=result.get("title"),
             snippet=result.get("description"),
         )
-        for result in results[:WEB_SEARCH_RESULT_COUNT]
+        for result in results[:RAG_WEB_SEARCH_RESULT_COUNT]
     ]

View File

@@ -4,7 +4,7 @@ import logging
 import requests
 
 from apps.rag.search.main import SearchResult
-from config import SRC_LOG_LEVELS, WEB_SEARCH_RESULT_COUNT
+from config import SRC_LOG_LEVELS, RAG_WEB_SEARCH_RESULT_COUNT
 
 log = logging.getLogger(__name__)
 log.setLevel(SRC_LOG_LEVELS["RAG"])
@@ -39,5 +39,5 @@ def search_serpstack(
         SearchResult(
             link=result["url"], title=result.get("title"), snippet=result.get("snippet")
         )
-        for result in results[:WEB_SEARCH_RESULT_COUNT]
+        for result in results[:RAG_WEB_SEARCH_RESULT_COUNT]
     ]

View File

@@ -549,7 +549,10 @@ BRAVE_SEARCH_API_KEY = os.getenv("BRAVE_SEARCH_API_KEY", "")
 SERPSTACK_API_KEY = os.getenv("SERPSTACK_API_KEY", "")
 SERPSTACK_HTTPS = os.getenv("SERPSTACK_HTTPS", "True").lower() == "true"
 SERPER_API_KEY = os.getenv("SERPER_API_KEY", "")
-WEB_SEARCH_RESULT_COUNT = int(os.getenv("WEB_SEARCH_RESULT_COUNT", "10"))
+RAG_WEB_SEARCH_RESULT_COUNT = int(os.getenv("RAG_WEB_SEARCH_RESULT_COUNT", "10"))
+RAG_WEB_SEARCH_CONCURRENT_REQUESTS = int(
+    os.getenv("RAG_WEB_SEARCH_CONCURRENT_REQUESTS", "10")
+)
 
 ####################################
 # Transcribe
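
Both new knobs are plain environment variables parsed as integers with a default of "10", so a non-numeric value raises at import time rather than mid-request. A quick sketch of that behavior follows; the variable names are the real ones from config.py, while the override values are arbitrary examples.

import os

# Example overrides only; unset variables fall back to "10" exactly as in config.py.
os.environ["RAG_WEB_SEARCH_RESULT_COUNT"] = "5"
os.environ["RAG_WEB_SEARCH_CONCURRENT_REQUESTS"] = "4"

RAG_WEB_SEARCH_RESULT_COUNT = int(os.getenv("RAG_WEB_SEARCH_RESULT_COUNT", "10"))
RAG_WEB_SEARCH_CONCURRENT_REQUESTS = int(
    os.getenv("RAG_WEB_SEARCH_CONCURRENT_REQUESTS", "10")
)

print(RAG_WEB_SEARCH_RESULT_COUNT, RAG_WEB_SEARCH_CONCURRENT_REQUESTS)  # 5 4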