feat: merge with main

Fabio Polito
2025-03-05 22:04:34 +00:00
372 changed files with 26027 additions and 10944 deletions

View File

@@ -4,6 +4,7 @@ import ftfy
import sys
from langchain_community.document_loaders import (
AzureAIDocumentIntelligenceLoader,
BSHTMLLoader,
CSVLoader,
Docx2txtLoader,
@@ -76,6 +77,7 @@ known_source_ext = [
"jsx",
"hs",
"lhs",
"json",
]
@@ -221,12 +223,33 @@ class Loader:
file_path=file_path,
mime_type=file_content_type,
)
elif self.engine == "docling":
elif self.engine == "docling" and self.kwargs.get("DOCLING_SERVER_URL"):
loader = DoclingLoader(
url=self.kwargs.get("DOCLING_SERVER_URL"),
file_path=file_path,
mime_type=file_content_type,
)
elif (
self.engine == "document_intelligence"
and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT") != ""
and self.kwargs.get("DOCUMENT_INTELLIGENCE_KEY") != ""
and (
file_ext in ["pdf", "xls", "xlsx", "docx", "ppt", "pptx"]
or file_content_type
in [
"application/vnd.ms-excel",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.ms-powerpoint",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
]
)
):
loader = AzureAIDocumentIntelligenceLoader(
file_path=file_path,
api_endpoint=self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT"),
api_key=self.kwargs.get("DOCUMENT_INTELLIGENCE_KEY"),
)
else:
if file_ext == "pdf":
loader = PyPDFLoader(
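
Note on the new guards above: "docling" is only used when DOCLING_SERVER_URL is set, and "document_intelligence" additionally requires both the endpoint and key plus a PDF/Office file type. The standalone sketch below mirrors that decision order; pick_engine, config and office_types are illustrative names, not part of the codebase.

def pick_engine(engine: str, config: dict, file_ext: str, content_type: str) -> str:
    # Mirrors the branch order above: docling, then document_intelligence, then the default loaders.
    office_types = {
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    }
    if engine == "docling" and config.get("DOCLING_SERVER_URL"):
        return "docling"
    if (
        engine == "document_intelligence"
        and config.get("DOCUMENT_INTELLIGENCE_ENDPOINT")
        and config.get("DOCUMENT_INTELLIGENCE_KEY")
        and (
            file_ext in {"pdf", "xls", "xlsx", "docx", "ppt", "pptx"}
            or content_type in office_types
        )
    ):
        return "document_intelligence"
    return "default"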

View File

@@ -1,13 +1,19 @@
import os
import logging
import torch
import numpy as np
from colbert.infra import ColBERTConfig
from colbert.modeling.checkpoint import Checkpoint
from open_webui.env import SRC_LOG_LEVELS
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
class ColBERT:
def __init__(self, name, **kwargs) -> None:
print("ColBERT: Loading model", name)
log.info("ColBERT: Loading model", name)
self.device = "cuda" if torch.cuda.is_available() else "cpu"
DOCKER = kwargs.get("env") == "docker"

View File

@@ -5,6 +5,7 @@ from typing import Optional, Union
import asyncio
import requests
import hashlib
from huggingface_hub import snapshot_download
from langchain.retrievers import ContextualCompressionRetriever, EnsembleRetriever
@@ -14,9 +15,16 @@ from langchain_core.documents import Document
from open_webui.config import VECTOR_DB
from open_webui.retrieval.vector.connector import VECTOR_DB_CLIENT
from open_webui.utils.misc import get_last_user_message
from open_webui.utils.misc import get_last_user_message, calculate_sha256_string
from open_webui.env import SRC_LOG_LEVELS, OFFLINE_MODE
from open_webui.models.users import UserModel
from open_webui.models.files import Files
from open_webui.env import (
SRC_LOG_LEVELS,
OFFLINE_MODE,
ENABLE_FORWARD_USER_INFO_HEADERS,
)
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
@@ -61,9 +69,7 @@ class VectorSearchRetriever(BaseRetriever):
def query_doc(
collection_name: str,
query_embedding: list[float],
k: int,
collection_name: str, query_embedding: list[float], k: int, user: UserModel = None
):
try:
result = VECTOR_DB_CLIENT.search(
@@ -77,7 +83,20 @@ def query_doc(
return result
except Exception as e:
print(e)
log.exception(f"Error querying doc {collection_name} with limit {k}: {e}")
raise e
def get_doc(collection_name: str, user: UserModel = None):
try:
result = VECTOR_DB_CLIENT.get(collection_name=collection_name)
if result:
log.info(f"query_doc:result {result.ids} {result.metadatas}")
return result
except Exception as e:
log.exception(f"Error getting doc {collection_name}: {e}")
raise e
@@ -134,47 +153,80 @@ def query_doc_with_hybrid_search(
raise e
def merge_and_sort_query_results(
query_results: list[dict], k: int, reverse: bool = False
) -> list[dict]:
def merge_get_results(get_results: list[dict]) -> dict:
# Initialize lists to store combined data
combined_distances = []
combined_documents = []
combined_metadatas = []
combined_ids = []
for data in query_results:
combined_distances.extend(data["distances"][0])
for data in get_results:
combined_documents.extend(data["documents"][0])
combined_metadatas.extend(data["metadatas"][0])
combined_ids.extend(data["ids"][0])
# Create a list of tuples (distance, document, metadata)
combined = list(zip(combined_distances, combined_documents, combined_metadatas))
# Create the output dictionary
result = {
"documents": [combined_documents],
"metadatas": [combined_metadatas],
"ids": [combined_ids],
}
return result
def merge_and_sort_query_results(
query_results: list[dict], k: int, reverse: bool = False
) -> dict:
# Initialize lists to store combined data
combined = []
seen_hashes = set() # To store unique document hashes
for data in query_results:
distances = data["distances"][0]
documents = data["documents"][0]
metadatas = data["metadatas"][0]
for distance, document, metadata in zip(distances, documents, metadatas):
if isinstance(document, str):
doc_hash = hashlib.md5(
document.encode()
).hexdigest() # Compute a hash for uniqueness
if doc_hash not in seen_hashes:
seen_hashes.add(doc_hash)
combined.append((distance, document, metadata))
# Sort the list based on distances
combined.sort(key=lambda x: x[0], reverse=reverse)
# We don't have anything :-(
if not combined:
sorted_distances = []
sorted_documents = []
sorted_metadatas = []
else:
# Unzip the sorted list
sorted_distances, sorted_documents, sorted_metadatas = zip(*combined)
# Slice to keep only the top k elements
sorted_distances, sorted_documents, sorted_metadatas = (
zip(*combined[:k]) if combined else ([], [], [])
)
# Slicing the lists to include only k elements
sorted_distances = list(sorted_distances)[:k]
sorted_documents = list(sorted_documents)[:k]
sorted_metadatas = list(sorted_metadatas)[:k]
# Create the output dictionary
result = {
"distances": [sorted_distances],
"documents": [sorted_documents],
"metadatas": [sorted_metadatas],
# Create and return the output dictionary
return {
"distances": [list(sorted_distances)],
"documents": [list(sorted_documents)],
"metadatas": [list(sorted_metadatas)],
}
return result
def get_all_items_from_collections(collection_names: list[str]) -> dict:
results = []
for collection_name in collection_names:
if collection_name:
try:
result = get_doc(collection_name=collection_name)
if result is not None:
results.append(result.model_dump())
except Exception as e:
log.exception(f"Error when querying the collection: {e}")
else:
pass
return merge_get_results(results)
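
To make the dedup-and-sort behaviour above concrete, here is a toy call, assuming the functions live in open_webui.retrieval.utils; the data is illustrative only.

from open_webui.retrieval.utils import merge_and_sort_query_results

query_results = [
    {"distances": [[0.2, 0.7]], "documents": [["doc A", "doc B"]], "metadatas": [[{"id": 1}, {"id": 2}]]},
    {"distances": [[0.1, 0.2]], "documents": [["doc C", "doc A"]], "metadatas": [[{"id": 3}, {"id": 1}]]},
]

merged = merge_and_sort_query_results(query_results, k=2)
# The second "doc A" is dropped by the md5 dedup, the rest is sorted by
# ascending distance, and only the top k=2 remain:
# merged["documents"] == [["doc C", "doc A"]]
# merged["distances"] == [[0.1, 0.2]]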
def query_collection(
@@ -259,29 +311,35 @@ def get_embedding_function(
embedding_batch_size,
):
if embedding_engine == "":
return lambda query: embedding_function.encode(query).tolist()
return lambda query, user=None: embedding_function.encode(query).tolist()
elif embedding_engine in ["ollama", "openai"]:
func = lambda query: generate_embeddings(
func = lambda query, user=None: generate_embeddings(
engine=embedding_engine,
model=embedding_model,
text=query,
url=url,
key=key,
user=user,
)
def generate_multiple(query, func):
def generate_multiple(query, user, func):
if isinstance(query, list):
embeddings = []
for i in range(0, len(query), embedding_batch_size):
embeddings.extend(func(query[i : i + embedding_batch_size]))
embeddings.extend(
func(query[i : i + embedding_batch_size], user=user)
)
return embeddings
else:
return func(query)
return func(query, user)
return lambda query: generate_multiple(query, func)
return lambda query, user=None: generate_multiple(query, user, func)
else:
raise ValueError(f"Unknown embedding engine: {embedding_engine}")
def get_sources_from_files(
request,
files,
queries,
embedding_function,
@@ -289,21 +347,81 @@ def get_sources_from_files(
reranking_function,
r,
hybrid_search,
full_context=False,
):
log.debug(f"files: {files} {queries} {embedding_function} {reranking_function}")
log.debug(
f"files: {files} {queries} {embedding_function} {reranking_function} {full_context}"
)
extracted_collections = []
relevant_contexts = []
for file in files:
if file.get("context") == "full":
context = None
if file.get("docs"):
# BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL
context = {
"documents": [[doc.get("content") for doc in file.get("docs")]],
"metadatas": [[doc.get("metadata") for doc in file.get("docs")]],
}
elif file.get("context") == "full":
# Manual Full Mode Toggle
context = {
"documents": [[file.get("file").get("data", {}).get("content")]],
"metadatas": [[{"file_id": file.get("id"), "name": file.get("name")}]],
}
else:
context = None
elif (
file.get("type") != "web_search"
and request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL
):
# BYPASS_EMBEDDING_AND_RETRIEVAL
if file.get("type") == "collection":
file_ids = file.get("data", {}).get("file_ids", [])
documents = []
metadatas = []
for file_id in file_ids:
file_object = Files.get_file_by_id(file_id)
if file_object:
documents.append(file_object.data.get("content", ""))
metadatas.append(
{
"file_id": file_id,
"name": file_object.filename,
"source": file_object.filename,
}
)
context = {
"documents": [documents],
"metadatas": [metadatas],
}
elif file.get("id"):
file_object = Files.get_file_by_id(file.get("id"))
if file_object:
context = {
"documents": [[file_object.data.get("content", "")]],
"metadatas": [
[
{
"file_id": file.get("id"),
"name": file_object.filename,
"source": file_object.filename,
}
]
],
}
elif file.get("file").get("data"):
context = {
"documents": [[file.get("file").get("data", {}).get("content")]],
"metadatas": [
[file.get("file").get("data", {}).get("metadata", {})]
],
}
else:
collection_names = []
if file.get("type") == "collection":
if file.get("legacy"):
@@ -323,42 +441,50 @@ def get_sources_from_files(
log.debug(f"skipping {file} as it has already been extracted")
continue
try:
context = None
if file.get("type") == "text":
context = file["content"]
else:
if hybrid_search:
try:
context = query_collection_with_hybrid_search(
if full_context:
try:
context = get_all_items_from_collections(collection_names)
except Exception as e:
log.exception(e)
else:
try:
context = None
if file.get("type") == "text":
context = file["content"]
else:
if hybrid_search:
try:
context = query_collection_with_hybrid_search(
collection_names=collection_names,
queries=queries,
embedding_function=embedding_function,
k=k,
reranking_function=reranking_function,
r=r,
)
except Exception as e:
log.debug(
"Error when using hybrid search, using"
" non hybrid search as fallback."
)
if (not hybrid_search) or (context is None):
context = query_collection(
collection_names=collection_names,
queries=queries,
embedding_function=embedding_function,
k=k,
reranking_function=reranking_function,
r=r,
)
except Exception as e:
log.debug(
"Error when using hybrid search, using"
" non hybrid search as fallback."
)
if (not hybrid_search) or (context is None):
context = query_collection(
collection_names=collection_names,
queries=queries,
embedding_function=embedding_function,
k=k,
)
except Exception as e:
log.exception(e)
except Exception as e:
log.exception(e)
extracted_collections.extend(collection_names)
if context:
if "data" in file:
del file["data"]
relevant_contexts.append({**context, "file": file})
sources = []
@@ -423,7 +549,11 @@ def get_model_path(model: str, update_model: bool = False):
def generate_openai_batch_embeddings(
model: str, texts: list[str], url: str = "https://api.openai.com/v1", key: str = ""
model: str,
texts: list[str],
url: str = "https://api.openai.com/v1",
key: str = "",
user: UserModel = None,
) -> Optional[list[list[float]]]:
try:
r = requests.post(
@@ -431,6 +561,16 @@ def generate_openai_batch_embeddings(
headers={
"Content-Type": "application/json",
"Authorization": f"Bearer {key}",
**(
{
"X-OpenWebUI-User-Name": user.name,
"X-OpenWebUI-User-Id": user.id,
"X-OpenWebUI-User-Email": user.email,
"X-OpenWebUI-User-Role": user.role,
}
if ENABLE_FORWARD_USER_INFO_HEADERS and user
else {}
),
},
json={"input": texts, "model": model},
)
@@ -441,12 +581,12 @@ def generate_openai_batch_embeddings(
else:
raise "Something went wrong :/"
except Exception as e:
print(e)
log.exception(f"Error generating openai batch embeddings: {e}")
return None
def generate_ollama_batch_embeddings(
model: str, texts: list[str], url: str, key: str = ""
model: str, texts: list[str], url: str, key: str = "", user: UserModel = None
) -> Optional[list[list[float]]]:
try:
r = requests.post(
@@ -454,6 +594,16 @@ def generate_ollama_batch_embeddings(
headers={
"Content-Type": "application/json",
"Authorization": f"Bearer {key}",
**(
{
"X-OpenWebUI-User-Name": user.name,
"X-OpenWebUI-User-Id": user.id,
"X-OpenWebUI-User-Email": user.email,
"X-OpenWebUI-User-Role": user.role,
}
if ENABLE_FORWARD_USER_INFO_HEADERS
else {}
),
},
json={"input": texts, "model": model},
)
@@ -465,29 +615,36 @@ def generate_ollama_batch_embeddings(
else:
raise "Something went wrong :/"
except Exception as e:
print(e)
log.exception(f"Error generating ollama batch embeddings: {e}")
return None
def generate_embeddings(engine: str, model: str, text: Union[str, list[str]], **kwargs):
url = kwargs.get("url", "")
key = kwargs.get("key", "")
user = kwargs.get("user")
if engine == "ollama":
if isinstance(text, list):
embeddings = generate_ollama_batch_embeddings(
**{"model": model, "texts": text, "url": url, "key": key}
**{"model": model, "texts": text, "url": url, "key": key, "user": user}
)
else:
embeddings = generate_ollama_batch_embeddings(
**{"model": model, "texts": [text], "url": url, "key": key}
**{
"model": model,
"texts": [text],
"url": url,
"key": key,
"user": user,
}
)
return embeddings[0] if isinstance(text, str) else embeddings
elif engine == "openai":
if isinstance(text, list):
embeddings = generate_openai_batch_embeddings(model, text, url, key)
embeddings = generate_openai_batch_embeddings(model, text, url, key, user)
else:
embeddings = generate_openai_batch_embeddings(model, [text], url, key)
embeddings = generate_openai_batch_embeddings(model, [text], url, key, user)
return embeddings[0] if isinstance(text, str) else embeddings
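
Both batch-embedding helpers now merge optional user headers into the request via conditional dict unpacking. A minimal sketch of that pattern, with stand-ins for ENABLE_FORWARD_USER_INFO_HEADERS and the authenticated UserModel:

from types import SimpleNamespace

ENABLE_FORWARD_USER_INFO_HEADERS = True
user = SimpleNamespace(name="Ada", id="u-1", email="ada@example.com", role="admin")

headers = {
    "Content-Type": "application/json",
    "Authorization": "Bearer sk-...",
    **(
        {
            "X-OpenWebUI-User-Name": user.name,
            "X-OpenWebUI-User-Id": user.id,
            "X-OpenWebUI-User-Email": user.email,
            "X-OpenWebUI-User-Role": user.role,
        }
        if ENABLE_FORWARD_USER_INFO_HEADERS and user
        else {}
    ),
}
# With the flag off (or no user), the same expression contributes nothing.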

View File

@@ -16,6 +16,10 @@ elif VECTOR_DB == "pgvector":
from open_webui.retrieval.vector.dbs.pgvector import PgvectorClient
VECTOR_DB_CLIENT = PgvectorClient()
elif VECTOR_DB == "elasticsearch":
from open_webui.retrieval.vector.dbs.elasticsearch import ElasticsearchClient
VECTOR_DB_CLIENT = ElasticsearchClient()
else:
from open_webui.retrieval.vector.dbs.chroma import ChromaClient

backend/open_webui/retrieval/vector/dbs/chroma.py (Normal file → Executable file)
View File

@@ -1,4 +1,5 @@
import chromadb
import logging
from chromadb import Settings
from chromadb.utils.batch_utils import create_batches
@@ -16,6 +17,10 @@ from open_webui.config import (
CHROMA_CLIENT_AUTH_PROVIDER,
CHROMA_CLIENT_AUTH_CREDENTIALS,
)
from open_webui.env import SRC_LOG_LEVELS
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
class ChromaClient:
@@ -102,8 +107,7 @@ class ChromaClient:
}
)
return None
except Exception as e:
print(e)
except:
return None
def get(self, collection_name: str) -> Optional[GetResult]:

View File

@@ -0,0 +1,274 @@
from elasticsearch import Elasticsearch, BadRequestError
from typing import Optional
import ssl
from elasticsearch.helpers import bulk, scan
from open_webui.retrieval.vector.main import VectorItem, SearchResult, GetResult
from open_webui.config import (
ELASTICSEARCH_URL,
ELASTICSEARCH_CA_CERTS,
ELASTICSEARCH_API_KEY,
ELASTICSEARCH_USERNAME,
ELASTICSEARCH_PASSWORD,
ELASTICSEARCH_CLOUD_ID,
SSL_ASSERT_FINGERPRINT,
)
class ElasticsearchClient:
"""
Important:
in order to reduce the number of indexes and since the embedding vector length is fixed, we avoid creating
an index for each file but store it as a text field, while separating documents into different indices
based on the embedding length.
"""
def __init__(self):
self.index_prefix = "open_webui_collections"
self.client = Elasticsearch(
hosts=[ELASTICSEARCH_URL],
ca_certs=ELASTICSEARCH_CA_CERTS,
api_key=ELASTICSEARCH_API_KEY,
cloud_id=ELASTICSEARCH_CLOUD_ID,
basic_auth=(
(ELASTICSEARCH_USERNAME, ELASTICSEARCH_PASSWORD)
if ELASTICSEARCH_USERNAME and ELASTICSEARCH_PASSWORD
else None
),
ssl_assert_fingerprint=SSL_ASSERT_FINGERPRINT,
)
# Status: works
def _get_index_name(self, dimension: int) -> str:
return f"{self.index_prefix}_d{str(dimension)}"
# Status: works
def _scan_result_to_get_result(self, result) -> GetResult:
if not result:
return None
ids = []
documents = []
metadatas = []
for hit in result:
ids.append(hit["_id"])
documents.append(hit["_source"].get("text"))
metadatas.append(hit["_source"].get("metadata"))
return GetResult(ids=[ids], documents=[documents], metadatas=[metadatas])
# Status: works
def _result_to_get_result(self, result) -> GetResult:
if not result["hits"]["hits"]:
return None
ids = []
documents = []
metadatas = []
for hit in result["hits"]["hits"]:
ids.append(hit["_id"])
documents.append(hit["_source"].get("text"))
metadatas.append(hit["_source"].get("metadata"))
return GetResult(ids=[ids], documents=[documents], metadatas=[metadatas])
# Status: works
def _result_to_search_result(self, result) -> SearchResult:
ids = []
distances = []
documents = []
metadatas = []
for hit in result["hits"]["hits"]:
ids.append(hit["_id"])
distances.append(hit["_score"])
documents.append(hit["_source"].get("text"))
metadatas.append(hit["_source"].get("metadata"))
return SearchResult(
ids=[ids],
distances=[distances],
documents=[documents],
metadatas=[metadatas],
)
# Status: works
def _create_index(self, dimension: int):
body = {
"mappings": {
"properties": {
"collection": {"type": "keyword"},
"id": {"type": "keyword"},
"vector": {
"type": "dense_vector",
"dims": dimension, # Adjust based on your vector dimensions
"index": True,
"similarity": "cosine",
},
"text": {"type": "text"},
"metadata": {"type": "object"},
}
}
}
self.client.indices.create(index=self._get_index_name(dimension), body=body)
# Status: works
def _create_batches(self, items: list[VectorItem], batch_size=100):
for i in range(0, len(items), batch_size):
yield items[i : min(i + batch_size, len(items))]
# Status: works
def has_collection(self, collection_name) -> bool:
query_body = {"query": {"bool": {"filter": []}}}
query_body["query"]["bool"]["filter"].append(
{"term": {"collection": collection_name}}
)
try:
result = self.client.count(index=f"{self.index_prefix}*", body=query_body)
return result.body["count"] > 0
except Exception as e:
return None
# @TODO: Make this delete a collection and not an index
def delete_colleciton(self, collection_name: str):
# TODO: fix this to include the dimension or a * prefix
# delete_collection here means delete a bunch of documents for an index.
# We are simply adapting to the norms of the other DBs.
self.client.indices.delete(index=self._get_collection_name(collection_name))
# Status: works
def search(
self, collection_name: str, vectors: list[list[float]], limit: int
) -> Optional[SearchResult]:
query = {
"size": limit,
"_source": ["text", "metadata"],
"query": {
"script_score": {
"query": {
"bool": {"filter": [{"term": {"collection": collection_name}}]}
},
"script": {
"source": "cosineSimilarity(params.vector, 'vector') + 1.0",
"params": {
"vector": vectors[0]
}, # Assuming single query vector
},
}
},
}
result = self.client.search(
index=self._get_index_name(len(vectors[0])), body=query
)
return self._result_to_search_result(result)
# Status: only tested halfway
def query(
self, collection_name: str, filter: dict, limit: Optional[int] = None
) -> Optional[GetResult]:
if not self.has_collection(collection_name):
return None
query_body = {
"query": {"bool": {"filter": []}},
"_source": ["text", "metadata"],
}
for field, value in filter.items():
query_body["query"]["bool"]["filter"].append({"term": {field: value}})
query_body["query"]["bool"]["filter"].append(
{"term": {"collection": collection_name}}
)
size = limit if limit else 10
try:
result = self.client.search(
index=f"{self.index_prefix}*",
body=query_body,
size=size,
)
return self._result_to_get_result(result)
except Exception as e:
return None
# Status: works
def _has_index(self, dimension: int):
return self.client.indices.exists(
index=self._get_index_name(dimension=dimension)
)
def get_or_create_index(self, dimension: int):
if not self._has_index(dimension=dimension):
self._create_index(dimension=dimension)
# Status: works
def get(self, collection_name: str) -> Optional[GetResult]:
# Get all the items in the collection.
query = {
"query": {"bool": {"filter": [{"term": {"collection": collection_name}}]}},
"_source": ["text", "metadata"],
}
results = list(scan(self.client, index=f"{self.index_prefix}*", query=query))
return self._scan_result_to_get_result(results)
# Status: works
def insert(self, collection_name: str, items: list[VectorItem]):
if not self._has_index(dimension=len(items[0]["vector"])):
self._create_index(dimension=len(items[0]["vector"]))
for batch in self._create_batches(items):
actions = [
{
"_index": self._get_index_name(dimension=len(items[0]["vector"])),
"_id": item["id"],
"_source": {
"collection": collection_name,
"vector": item["vector"],
"text": item["text"],
"metadata": item["metadata"],
},
}
for item in batch
]
bulk(self.client, actions)
# Status: should work
def upsert(self, collection_name: str, items: list[VectorItem]):
if not self._has_index(dimension=len(items[0]["vector"])):
self._create_index(collection_name, dimension=len(items[0]["vector"]))
for batch in self._create_batches(items):
actions = [
{
"_index": self._get_index_name(dimension=len(items[0]["vector"])),
"_id": item["id"],
"_source": {
"vector": item["vector"],
"text": item["text"],
"metadata": item["metadata"],
},
}
for item in batch
]
self.client.bulk(actions)
# TODO: This currently deletes by * which is not always supported in ElasticSearch.
# Need to read a bit before changing. Also, need to delete from a specific collection
def delete(self, collection_name: str, ids: list[str]):
# Assuming ID is unique across collections and indexes
actions = [
{"delete": {"_index": f"{self.index_prefix}*", "_id": id}} for id in ids
]
self.client.bulk(body=actions)
def reset(self):
indices = self.client.indices.get(index=f"{self.index_prefix}*")
for index in indices:
self.client.indices.delete(index=index)
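
A hedged end-to-end sketch of the client above, assuming a reachable Elasticsearch instance configured through the imported settings; the collection name and 3-dimensional vectors are toy values.

client = ElasticsearchClient()
client.insert(
    "my-notes",
    [
        {"id": "1", "vector": [0.1, 0.2, 0.3], "text": "hello", "metadata": {"source": "a.txt"}},
        {"id": "2", "vector": [0.9, 0.1, 0.0], "text": "world", "metadata": {"source": "b.txt"}},
    ],
)
# Once the index has refreshed, the cosine script_score search should rank the
# identical vector first:
hits = client.search("my-notes", vectors=[[0.1, 0.2, 0.3]], limit=1)
# hits.documents -> [["hello"]]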

View File

@@ -1,20 +1,28 @@
from pymilvus import MilvusClient as Client
from pymilvus import FieldSchema, DataType
import json
import logging
from typing import Optional
from open_webui.retrieval.vector.main import VectorItem, SearchResult, GetResult
from open_webui.config import (
MILVUS_URI,
MILVUS_DB,
MILVUS_TOKEN,
)
from open_webui.env import SRC_LOG_LEVELS
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
class MilvusClient:
def __init__(self):
self.collection_prefix = "open_webui"
self.client = Client(uri=MILVUS_URI, database=MILVUS_DB)
if MILVUS_TOKEN is None:
self.client = Client(uri=MILVUS_URI, db_name=MILVUS_DB)
else:
self.client = Client(uri=MILVUS_URI, db_name=MILVUS_DB, token=MILVUS_TOKEN)
def _result_to_get_result(self, result) -> GetResult:
ids = []
@@ -164,7 +172,7 @@ class MilvusClient:
try:
# Loop until there are no more items to fetch or the desired limit is reached
while remaining > 0:
print("remaining", remaining)
log.info(f"remaining: {remaining}")
current_fetch = min(
max_limit, remaining
) # Determine how many items to fetch in this iteration
@@ -191,10 +199,12 @@ class MilvusClient:
if results_count < current_fetch:
break
print(all_results)
log.debug(all_results)
return self._result_to_get_result([all_results])
except Exception as e:
print(e)
log.exception(
f"Error querying collection {collection_name} with limit {limit}: {e}"
)
return None
def get(self, collection_name: str) -> Optional[GetResult]:

View File

@@ -49,7 +49,7 @@ class OpenSearchClient:
ids=ids, distances=distances, documents=documents, metadatas=metadatas
)
def _create_index(self, index_name: str, dimension: int):
def _create_index(self, collection_name: str, dimension: int):
body = {
"mappings": {
"properties": {
@@ -72,24 +72,28 @@ class OpenSearchClient:
}
}
}
self.client.indices.create(index=f"{self.index_prefix}_{index_name}", body=body)
self.client.indices.create(
index=f"{self.index_prefix}_{collection_name}", body=body
)
def _create_batches(self, items: list[VectorItem], batch_size=100):
for i in range(0, len(items), batch_size):
yield items[i : i + batch_size]
def has_collection(self, index_name: str) -> bool:
def has_collection(self, collection_name: str) -> bool:
# has_collection here means has index.
# We are simply adapting to the norms of the other DBs.
return self.client.indices.exists(index=f"{self.index_prefix}_{index_name}")
return self.client.indices.exists(
index=f"{self.index_prefix}_{collection_name}"
)
def delete_colleciton(self, index_name: str):
def delete_colleciton(self, collection_name: str):
# delete_collection here means delete index.
# We are simply adapting to the norms of the other DBs.
self.client.indices.delete(index=f"{self.index_prefix}_{index_name}")
self.client.indices.delete(index=f"{self.index_prefix}_{collection_name}")
def search(
self, index_name: str, vectors: list[list[float]], limit: int
self, collection_name: str, vectors: list[list[float]], limit: int
) -> Optional[SearchResult]:
query = {
"size": limit,
@@ -108,26 +112,55 @@ class OpenSearchClient:
}
result = self.client.search(
index=f"{self.index_prefix}_{index_name}", body=query
index=f"{self.index_prefix}_{collection_name}", body=query
)
return self._result_to_search_result(result)
def get_or_create_index(self, index_name: str, dimension: int):
if not self.has_index(index_name):
self._create_index(index_name, dimension)
def query(
self, collection_name: str, filter: dict, limit: Optional[int] = None
) -> Optional[GetResult]:
if not self.has_collection(collection_name):
return None
def get(self, index_name: str) -> Optional[GetResult]:
query_body = {
"query": {"bool": {"filter": []}},
"_source": ["text", "metadata"],
}
for field, value in filter.items():
query_body["query"]["bool"]["filter"].append({"term": {field: value}})
size = limit if limit else 10
try:
result = self.client.search(
index=f"{self.index_prefix}_{collection_name}",
body=query_body,
size=size,
)
return self._result_to_get_result(result)
except Exception as e:
return None
def _create_index_if_not_exists(self, collection_name: str, dimension: int):
if not self.has_index(collection_name):
self._create_index(collection_name, dimension)
def get(self, collection_name: str) -> Optional[GetResult]:
query = {"query": {"match_all": {}}, "_source": ["text", "metadata"]}
result = self.client.search(
index=f"{self.index_prefix}_{index_name}", body=query
index=f"{self.index_prefix}_{collection_name}", body=query
)
return self._result_to_get_result(result)
def insert(self, index_name: str, items: list[VectorItem]):
if not self.has_index(index_name):
self._create_index(index_name, dimension=len(items[0]["vector"]))
def insert(self, collection_name: str, items: list[VectorItem]):
self._create_index_if_not_exists(
collection_name=collection_name, dimension=len(items[0]["vector"])
)
for batch in self._create_batches(items):
actions = [
@@ -145,15 +178,17 @@ class OpenSearchClient:
]
self.client.bulk(actions)
def upsert(self, index_name: str, items: list[VectorItem]):
if not self.has_index(index_name):
self._create_index(index_name, dimension=len(items[0]["vector"]))
def upsert(self, collection_name: str, items: list[VectorItem]):
self._create_index_if_not_exists(
collection_name=collection_name, dimension=len(items[0]["vector"])
)
for batch in self._create_batches(items):
actions = [
{
"index": {
"_id": item["id"],
"_index": f"{self.index_prefix}_{collection_name}",
"_source": {
"vector": item["vector"],
"text": item["text"],
@@ -165,9 +200,9 @@ class OpenSearchClient:
]
self.client.bulk(actions)
def delete(self, index_name: str, ids: list[str]):
def delete(self, collection_name: str, ids: list[str]):
actions = [
{"delete": {"_index": f"{self.index_prefix}_{index_name}", "_id": id}}
{"delete": {"_index": f"{self.index_prefix}_{collection_name}", "_id": id}}
for id in ids
]
self.client.bulk(body=actions)

View File

@@ -1,4 +1,5 @@
from typing import Optional, List, Dict, Any
import logging
from sqlalchemy import (
cast,
column,
@@ -24,9 +25,14 @@ from sqlalchemy.exc import NoSuchTableError
from open_webui.retrieval.vector.main import VectorItem, SearchResult, GetResult
from open_webui.config import PGVECTOR_DB_URL, PGVECTOR_INITIALIZE_MAX_VECTOR_LENGTH
from open_webui.env import SRC_LOG_LEVELS
VECTOR_LENGTH = PGVECTOR_INITIALIZE_MAX_VECTOR_LENGTH
Base = declarative_base()
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
class DocumentChunk(Base):
__tablename__ = "document_chunk"
@@ -82,10 +88,10 @@ class PgvectorClient:
)
)
self.session.commit()
print("Initialization complete.")
log.info("Initialization complete.")
except Exception as e:
self.session.rollback()
print(f"Error during initialization: {e}")
log.exception(f"Error during initialization: {e}")
raise
def check_vector_length(self) -> None:
@@ -150,12 +156,12 @@ class PgvectorClient:
new_items.append(new_chunk)
self.session.bulk_save_objects(new_items)
self.session.commit()
print(
log.info(
f"Inserted {len(new_items)} items into collection '{collection_name}'."
)
except Exception as e:
self.session.rollback()
print(f"Error during insert: {e}")
log.exception(f"Error during insert: {e}")
raise
def upsert(self, collection_name: str, items: List[VectorItem]) -> None:
@@ -184,10 +190,12 @@ class PgvectorClient:
)
self.session.add(new_chunk)
self.session.commit()
print(f"Upserted {len(items)} items into collection '{collection_name}'.")
log.info(
f"Upserted {len(items)} items into collection '{collection_name}'."
)
except Exception as e:
self.session.rollback()
print(f"Error during upsert: {e}")
log.exception(f"Error during upsert: {e}")
raise
def search(
@@ -278,7 +286,7 @@ class PgvectorClient:
ids=ids, distances=distances, documents=documents, metadatas=metadatas
)
except Exception as e:
print(f"Error during search: {e}")
log.exception(f"Error during search: {e}")
return None
def query(
@@ -310,7 +318,7 @@ class PgvectorClient:
metadatas=metadatas,
)
except Exception as e:
print(f"Error during query: {e}")
log.exception(f"Error during query: {e}")
return None
def get(
@@ -334,7 +342,7 @@ class PgvectorClient:
return GetResult(ids=ids, documents=documents, metadatas=metadatas)
except Exception as e:
print(f"Error during get: {e}")
log.exception(f"Error during get: {e}")
return None
def delete(
@@ -356,22 +364,22 @@ class PgvectorClient:
)
deleted = query.delete(synchronize_session=False)
self.session.commit()
print(f"Deleted {deleted} items from collection '{collection_name}'.")
log.info(f"Deleted {deleted} items from collection '{collection_name}'.")
except Exception as e:
self.session.rollback()
print(f"Error during delete: {e}")
log.exception(f"Error during delete: {e}")
raise
def reset(self) -> None:
try:
deleted = self.session.query(DocumentChunk).delete()
self.session.commit()
print(
log.info(
f"Reset complete. Deleted {deleted} items from 'document_chunk' table."
)
except Exception as e:
self.session.rollback()
print(f"Error during reset: {e}")
log.exception(f"Error during reset: {e}")
raise
def close(self) -> None:
@@ -387,9 +395,9 @@ class PgvectorClient:
)
return exists
except Exception as e:
print(f"Error checking collection existence: {e}")
log.exception(f"Error checking collection existence: {e}")
return False
def delete_collection(self, collection_name: str) -> None:
self.delete(collection_name)
print(f"Collection '{collection_name}' deleted.")
log.info(f"Collection '{collection_name}' deleted.")

View File

@@ -1,4 +1,5 @@
from typing import Optional
import logging
from qdrant_client import QdrantClient as Qclient
from qdrant_client.http.models import PointStruct
@@ -6,9 +7,13 @@ from qdrant_client.models import models
from open_webui.retrieval.vector.main import VectorItem, SearchResult, GetResult
from open_webui.config import QDRANT_URI, QDRANT_API_KEY
from open_webui.env import SRC_LOG_LEVELS
NO_LIMIT = 999999999
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
class QdrantClient:
def __init__(self):
@@ -49,7 +54,7 @@ class QdrantClient:
),
)
print(f"collection {collection_name_with_prefix} successfully created!")
log.info(f"collection {collection_name_with_prefix} successfully created!")
def _create_collection_if_not_exists(self, collection_name, dimension):
if not self.has_collection(collection_name=collection_name):
@@ -120,7 +125,7 @@ class QdrantClient:
)
return self._result_to_get_result(points.points)
except Exception as e:
print(e)
log.exception(f"Error querying a collection '{collection_name}': {e}")
return None
def get(self, collection_name: str) -> Optional[GetResult]:

View File

@@ -0,0 +1,65 @@
import logging
from typing import Optional
import requests
import json
from open_webui.retrieval.web.main import SearchResult, get_filtered_results
from open_webui.env import SRC_LOG_LEVELS
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
def _parse_response(response):
result = {}
if "data" in response:
data = response["data"]
if "webPages" in data:
webPages = data["webPages"]
if "value" in webPages:
result["webpage"] = [
{
"id": item.get("id", ""),
"name": item.get("name", ""),
"url": item.get("url", ""),
"snippet": item.get("snippet", ""),
"summary": item.get("summary", ""),
"siteName": item.get("siteName", ""),
"siteIcon": item.get("siteIcon", ""),
"datePublished": item.get("datePublished", "")
or item.get("dateLastCrawled", ""),
}
for item in webPages["value"]
]
return result
def search_bocha(
api_key: str, query: str, count: int, filter_list: Optional[list[str]] = None
) -> list[SearchResult]:
"""Search using Bocha's Search API and return the results as a list of SearchResult objects.
Args:
api_key (str): A Bocha Search API key
query (str): The query to search for
"""
url = "https://api.bochaai.com/v1/web-search?utm_source=ollama"
headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
payload = json.dumps(
{"query": query, "summary": True, "freshness": "noLimit", "count": count}
)
response = requests.post(url, headers=headers, data=payload, timeout=5)
response.raise_for_status()
results = _parse_response(response.json())
print(results)
if filter_list:
results = get_filtered_results(results, filter_list)
return [
SearchResult(
link=result["url"], title=result.get("name"), snippet=result.get("summary")
)
for result in results.get("webpage", [])[:count]
]
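
The parser above only reads data.webPages.value and falls back from datePublished to dateLastCrawled. A toy response to illustrate; the payload is made up:

sample = {
    "data": {
        "webPages": {
            "value": [
                {
                    "name": "Open WebUI",
                    "url": "https://openwebui.com",
                    "summary": "Self-hosted AI interface",
                    "dateLastCrawled": "2025-03-01",
                }
            ]
        }
    }
}

parsed = _parse_response(sample)
# parsed["webpage"][0]["datePublished"] == "2025-03-01" (fallback applied)
# parsed["webpage"][0]["snippet"] == "" (missing keys default to empty strings)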

View File

@@ -32,19 +32,15 @@ def search_duckduckgo(
# Convert the search results into a list
search_results = [r for r in ddgs_gen]
# Create an empty list to store the SearchResult objects
results = []
# Iterate over each search result
for result in search_results:
# Create a SearchResult object and append it to the results list
results.append(
SearchResult(
link=result["href"],
title=result.get("title"),
snippet=result.get("body"),
)
)
if filter_list:
results = get_filtered_results(results, filter_list)
search_results = get_filtered_results(search_results, filter_list)
# Return the list of search results
return results
return [
SearchResult(
link=result["href"],
title=result.get("title"),
snippet=result.get("body"),
)
for result in search_results
]

View File

@@ -0,0 +1,76 @@
import logging
from dataclasses import dataclass
from typing import Optional
import requests
from open_webui.env import SRC_LOG_LEVELS
from open_webui.retrieval.web.main import SearchResult
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
EXA_API_BASE = "https://api.exa.ai"
@dataclass
class ExaResult:
url: str
title: str
text: str
def search_exa(
api_key: str,
query: str,
count: int,
filter_list: Optional[list[str]] = None,
) -> list[SearchResult]:
"""Search using Exa Search API and return the results as a list of SearchResult objects.
Args:
api_key (str): An Exa Search API key
query (str): The query to search for
count (int): Number of results to return
filter_list (Optional[list[str]]): List of domains to filter results by
"""
log.info(f"Searching with Exa for query: {query}")
headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
payload = {
"query": query,
"numResults": count or 5,
"includeDomains": filter_list,
"contents": {"text": True, "highlights": True},
"type": "auto", # Use the auto search type (keyword or neural)
}
try:
response = requests.post(
f"{EXA_API_BASE}/search", headers=headers, json=payload
)
response.raise_for_status()
data = response.json()
results = []
for result in data["results"]:
results.append(
ExaResult(
url=result["url"],
title=result["title"],
text=result["text"],
)
)
log.info(f"Found {len(results)} results")
return [
SearchResult(
link=result.url,
title=result.title,
snippet=result.text,
)
for result in results
]
except Exception as e:
log.error(f"Error searching Exa: {e}")
return []

View File

@@ -17,34 +17,53 @@ def search_google_pse(
filter_list: Optional[list[str]] = None,
) -> list[SearchResult]:
"""Search using Google's Programmable Search Engine API and return the results as a list of SearchResult objects.
Handles pagination for counts greater than 10.
Args:
api_key (str): A Programmable Search Engine API key
search_engine_id (str): A Programmable Search Engine ID
query (str): The query to search for
count (int): The number of results to return (max 100, since PSE returns at most 10 results per page and allows at most 10 pages)
filter_list (Optional[list[str]], optional): A list of keywords to filter out from results. Defaults to None.
Returns:
list[SearchResult]: A list of SearchResult objects.
"""
url = "https://www.googleapis.com/customsearch/v1"
headers = {"Content-Type": "application/json"}
params = {
"cx": search_engine_id,
"q": query,
"key": api_key,
"num": count,
}
all_results = []
start_index = 1 # Google PSE start parameter is 1-based
response = requests.request("GET", url, headers=headers, params=params)
response.raise_for_status()
while count > 0:
num_results_this_page = min(count, 10) # Google PSE max results per page is 10
params = {
"cx": search_engine_id,
"q": query,
"key": api_key,
"num": num_results_this_page,
"start": start_index,
}
response = requests.request("GET", url, headers=headers, params=params)
response.raise_for_status()
json_response = response.json()
results = json_response.get("items", [])
if results: # check if results are returned. If not, no more pages to fetch.
all_results.extend(results)
count -= len(
results
) # Decrement count by the number of results fetched in this page.
start_index += 10 # Increment start index for the next page
else:
break # No more results from Google PSE, break the loop
json_response = response.json()
results = json_response.get("items", [])
if filter_list:
results = get_filtered_results(results, filter_list)
all_results = get_filtered_results(all_results, filter_list)
return [
SearchResult(
link=result["link"],
title=result.get("title"),
snippet=result.get("snippet"),
)
for result in results
for result in all_results
]
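
The pagination above caps each page at 10 results and advances the 1-based start offset by 10. Assuming every page comes back full, count=25 turns into three requests:

count, start_index = 25, 1
pages = []
while count > 0:
    num_results_this_page = min(count, 10)
    pages.append({"num": num_results_this_page, "start": start_index})
    count -= num_results_this_page  # the real loop subtracts len(results) actually returned
    start_index += 10

# pages == [{"num": 10, "start": 1}, {"num": 10, "start": 11}, {"num": 5, "start": 21}]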

View File

@@ -20,14 +20,23 @@ def search_jina(api_key: str, query: str, count: int) -> list[SearchResult]:
list[SearchResult]: A list of search results
"""
jina_search_endpoint = "https://s.jina.ai/"
headers = {"Accept": "application/json", "Authorization": f"Bearer {api_key}"}
url = str(URL(jina_search_endpoint + query))
response = requests.get(url, headers=headers)
headers = {
"Accept": "application/json",
"Content-Type": "application/json",
"Authorization": api_key,
"X-Retain-Images": "none",
}
payload = {"q": query, "count": count if count <= 10 else 10}
url = str(URL(jina_search_endpoint))
response = requests.post(url, headers=headers, json=payload)
response.raise_for_status()
data = response.json()
results = []
for result in data["data"][:count]:
for result in data["data"]:
results.append(
SearchResult(
link=result["url"],

View File

@@ -1,3 +1,5 @@
import validators
from typing import Optional
from urllib.parse import urlparse
@@ -10,6 +12,8 @@ def get_filtered_results(results, filter_list):
filtered_results = []
for result in results:
url = result.get("url") or result.get("link", "")
if not validators.url(url):
continue
domain = urlparse(url).netloc
if any(domain.endswith(filtered_domain) for filtered_domain in filter_list):
filtered_results.append(result)
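
With the validators check added above, malformed URLs are skipped entirely and the rest are kept only when their host ends with an allowed domain. An illustrative call; the domains and links are made up:

results = [
    {"link": "https://docs.example.com/a"},   # kept: host ends with example.com
    {"link": "https://other.test/b"},         # dropped: domain not in the list
    {"link": "not-a-url"},                    # dropped: fails validators.url()
]
filtered = get_filtered_results(results, ["example.com"])
# filtered == [{"link": "https://docs.example.com/a"}]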

View File

@@ -0,0 +1,87 @@
import logging
from typing import Optional, List
import requests
from open_webui.retrieval.web.main import SearchResult, get_filtered_results
from open_webui.env import SRC_LOG_LEVELS
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
def search_perplexity(
api_key: str,
query: str,
count: int,
filter_list: Optional[list[str]] = None,
) -> list[SearchResult]:
"""Search using Perplexity API and return the results as a list of SearchResult objects.
Args:
api_key (str): A Perplexity API key
query (str): The query to search for
count (int): Maximum number of results to return
"""
# Handle PersistentConfig object
if hasattr(api_key, "__str__"):
api_key = str(api_key)
try:
url = "https://api.perplexity.ai/chat/completions"
# Create payload for the API call
payload = {
"model": "sonar",
"messages": [
{
"role": "system",
"content": "You are a search assistant. Provide factual information with citations.",
},
{"role": "user", "content": query},
],
"temperature": 0.2, # Lower temperature for more factual responses
"stream": False,
}
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
}
# Make the API request
response = requests.request("POST", url, json=payload, headers=headers)
# Parse the JSON response
json_response = response.json()
# Extract citations from the response
citations = json_response.get("citations", [])
# Create search results from citations
results = []
for i, citation in enumerate(citations[:count]):
# Extract content from the response to use as snippet
content = ""
if "choices" in json_response and json_response["choices"]:
if i == 0:
content = json_response["choices"][0]["message"]["content"]
result = {"link": citation, "title": f"Source {i+1}", "snippet": content}
results.append(result)
if filter_list:
results = get_filtered_results(results, filter_list)
return [
SearchResult(
link=result["link"], title=result["title"], snippet=result["snippet"]
)
for result in results[:count]
]
except Exception as e:
log.error(f"Error searching with Perplexity API: {e}")
return []

View File

@@ -0,0 +1,48 @@
import logging
from typing import Optional
from urllib.parse import urlencode
import requests
from open_webui.retrieval.web.main import SearchResult, get_filtered_results
from open_webui.env import SRC_LOG_LEVELS
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
def search_serpapi(
api_key: str,
engine: str,
query: str,
count: int,
filter_list: Optional[list[str]] = None,
) -> list[SearchResult]:
"""Search using serpapi.com's API and return the results as a list of SearchResult objects.
Args:
api_key (str): A serpapi.com API key
query (str): The query to search for
"""
url = "https://serpapi.com/search"
engine = engine or "google"
payload = {"engine": engine, "q": query, "api_key": api_key}
url = f"{url}?{urlencode(payload)}"
response = requests.request("GET", url)
json_response = response.json()
log.info(f"results from serpapi search: {json_response}")
results = sorted(
json_response.get("organic_results", []), key=lambda x: x.get("position", 0)
)
if filter_list:
results = get_filtered_results(results, filter_list)
return [
SearchResult(
link=result["link"], title=result["title"], snippet=result["snippet"]
)
for result in results[:count]
]

View File

@@ -1,4 +1,5 @@
import logging
from typing import Optional
import requests
from open_webui.retrieval.web.main import SearchResult
@@ -8,7 +9,13 @@ log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
def search_tavily(api_key: str, query: str, count: int) -> list[SearchResult]:
def search_tavily(
api_key: str,
query: str,
count: int,
filter_list: Optional[list[str]] = None,
# **kwargs,
) -> list[SearchResult]:
"""Search using Tavily's Search API and return the results as a list of SearchResult objects.
Args:
@@ -20,7 +27,6 @@ def search_tavily(api_key: str, query: str, count: int) -> list[SearchResult]:
"""
url = "https://api.tavily.com/search"
data = {"query": query, "api_key": api_key}
response = requests.post(url, json=data)
response.raise_for_status()

View File

@@ -1,19 +1,38 @@
import socket
import urllib.parse
import validators
from typing import Union, Sequence, Iterator
from langchain_community.document_loaders import (
WebBaseLoader,
)
from langchain_core.documents import Document
from open_webui.constants import ERROR_MESSAGES
from open_webui.config import ENABLE_RAG_LOCAL_WEB_FETCH
from open_webui.env import SRC_LOG_LEVELS
import asyncio
import logging
import socket
import ssl
import urllib.parse
import urllib.request
from collections import defaultdict
from datetime import datetime, time, timedelta
from typing import (
Any,
AsyncIterator,
Dict,
Iterator,
List,
Optional,
Sequence,
Union,
Literal,
)
import aiohttp
import certifi
import validators
from langchain_community.document_loaders import PlaywrightURLLoader, WebBaseLoader
from langchain_community.document_loaders.firecrawl import FireCrawlLoader
from langchain_community.document_loaders.base import BaseLoader
from langchain_core.documents import Document
from open_webui.constants import ERROR_MESSAGES
from open_webui.config import (
ENABLE_RAG_LOCAL_WEB_FETCH,
PLAYWRIGHT_WS_URI,
RAG_WEB_LOADER_ENGINE,
FIRECRAWL_API_BASE_URL,
FIRECRAWL_API_KEY,
)
from open_webui.env import SRC_LOG_LEVELS
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
@@ -43,6 +62,17 @@ def validate_url(url: Union[str, Sequence[str]]):
return False
def safe_validate_urls(url: Sequence[str]) -> Sequence[str]:
valid_urls = []
for u in url:
try:
if validate_url(u):
valid_urls.append(u)
except ValueError:
continue
return valid_urls
def resolve_hostname(hostname):
# Get address information
addr_info = socket.getaddrinfo(hostname, None)
@@ -54,9 +84,381 @@ def resolve_hostname(hostname):
return ipv4_addresses, ipv6_addresses
def extract_metadata(soup, url):
metadata = {"source": url}
if title := soup.find("title"):
metadata["title"] = title.get_text()
if description := soup.find("meta", attrs={"name": "description"}):
metadata["description"] = description.get("content", "No description found.")
if html := soup.find("html"):
metadata["language"] = html.get("lang", "No language found.")
return metadata
def verify_ssl_cert(url: str) -> bool:
"""Verify SSL certificate for the given URL."""
if not url.startswith("https://"):
return True
try:
hostname = url.split("://")[-1].split("/")[0]
context = ssl.create_default_context(cafile=certifi.where())
with context.wrap_socket(ssl.socket(), server_hostname=hostname) as s:
s.connect((hostname, 443))
return True
except ssl.SSLError:
return False
except Exception as e:
log.warning(f"SSL verification failed for {url}: {str(e)}")
return False
class SafeFireCrawlLoader(BaseLoader):
def __init__(
self,
web_paths,
verify_ssl: bool = True,
trust_env: bool = False,
requests_per_second: Optional[float] = None,
continue_on_failure: bool = True,
api_key: Optional[str] = None,
api_url: Optional[str] = None,
mode: Literal["crawl", "scrape", "map"] = "crawl",
proxy: Optional[Dict[str, str]] = None,
params: Optional[Dict] = None,
):
"""Concurrent document loader for FireCrawl operations.
Executes multiple FireCrawlLoader instances concurrently using thread pooling
to improve bulk processing efficiency.
Args:
web_paths: List of URLs/paths to process.
verify_ssl: If True, verify SSL certificates.
trust_env: If True, use proxy settings from environment variables.
requests_per_second: Number of requests per second to limit to.
continue_on_failure (bool): If True, continue loading other URLs on failure.
api_key: API key for FireCrawl service. Defaults to None
(uses FIRE_CRAWL_API_KEY environment variable if not provided).
api_url: Base URL for FireCrawl API. Defaults to official API endpoint.
mode: Operation mode selection:
- 'crawl': Website crawling mode (default)
- 'scrape': Direct page scraping
- 'map': Site map generation
proxy: Proxy override settings for the FireCrawl API.
params: The parameters to pass to the Firecrawl API.
Examples include crawlerOptions.
For more details, visit: https://github.com/mendableai/firecrawl-py
"""
proxy_server = proxy.get("server") if proxy else None
if trust_env and not proxy_server:
env_proxies = urllib.request.getproxies()
env_proxy_server = env_proxies.get("https") or env_proxies.get("http")
if env_proxy_server:
if proxy:
proxy["server"] = env_proxy_server
else:
proxy = {"server": env_proxy_server}
self.web_paths = web_paths
self.verify_ssl = verify_ssl
self.requests_per_second = requests_per_second
self.last_request_time = None
self.trust_env = trust_env
self.continue_on_failure = continue_on_failure
self.api_key = api_key
self.api_url = api_url
self.mode = mode
self.params = params
def lazy_load(self) -> Iterator[Document]:
"""Load documents concurrently using FireCrawl."""
for url in self.web_paths:
try:
self._safe_process_url_sync(url)
loader = FireCrawlLoader(
url=url,
api_key=self.api_key,
api_url=self.api_url,
mode=self.mode,
params=self.params,
)
yield from loader.lazy_load()
except Exception as e:
if self.continue_on_failure:
log.exception(e, "Error loading %s", url)
continue
raise e
async def alazy_load(self):
"""Async version of lazy_load."""
for url in self.web_paths:
try:
await self._safe_process_url(url)
loader = FireCrawlLoader(
url=url,
api_key=self.api_key,
api_url=self.api_url,
mode=self.mode,
params=self.params,
)
async for document in loader.alazy_load():
yield document
except Exception as e:
if self.continue_on_failure:
log.exception(e, "Error loading %s", url)
continue
raise e
def _verify_ssl_cert(self, url: str) -> bool:
return verify_ssl_cert(url)
async def _wait_for_rate_limit(self):
"""Wait to respect the rate limit if specified."""
if self.requests_per_second and self.last_request_time:
min_interval = timedelta(seconds=1.0 / self.requests_per_second)
time_since_last = datetime.now() - self.last_request_time
if time_since_last < min_interval:
await asyncio.sleep((min_interval - time_since_last).total_seconds())
self.last_request_time = datetime.now()
def _sync_wait_for_rate_limit(self):
"""Synchronous version of rate limit wait."""
if self.requests_per_second and self.last_request_time:
min_interval = timedelta(seconds=1.0 / self.requests_per_second)
time_since_last = datetime.now() - self.last_request_time
if time_since_last < min_interval:
time.sleep((min_interval - time_since_last).total_seconds())
self.last_request_time = datetime.now()
async def _safe_process_url(self, url: str) -> bool:
"""Perform safety checks before processing a URL."""
if self.verify_ssl and not self._verify_ssl_cert(url):
raise ValueError(f"SSL certificate verification failed for {url}")
await self._wait_for_rate_limit()
return True
def _safe_process_url_sync(self, url: str) -> bool:
"""Synchronous version of safety checks."""
if self.verify_ssl and not self._verify_ssl_cert(url):
raise ValueError(f"SSL certificate verification failed for {url}")
self._sync_wait_for_rate_limit()
return True
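
The rate limiter shared by both safe loaders simply enforces a minimum spacing of 1 / requests_per_second between requests. The arithmetic, with illustrative numbers:

from datetime import timedelta

requests_per_second = 2
min_interval = timedelta(seconds=1.0 / requests_per_second)   # 0.5 s between requests
time_since_last = timedelta(seconds=0.2)                      # last request was 0.2 s ago
sleep_for = (min_interval - time_since_last).total_seconds()  # sleep ~0.3 s before proceeding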
class SafePlaywrightURLLoader(PlaywrightURLLoader):
"""Load HTML pages safely with Playwright, supporting SSL verification, rate limiting, and remote browser connection.
Attributes:
web_paths (List[str]): List of URLs to load.
verify_ssl (bool): If True, verify SSL certificates.
trust_env (bool): If True, use proxy settings from environment variables.
requests_per_second (Optional[float]): Number of requests per second to limit to.
continue_on_failure (bool): If True, continue loading other URLs on failure.
headless (bool): If True, the browser will run in headless mode.
proxy (dict): Proxy override settings for the Playwright session.
playwright_ws_url (Optional[str]): WebSocket endpoint URI for remote browser connection.
"""
def __init__(
self,
web_paths: List[str],
verify_ssl: bool = True,
trust_env: bool = False,
requests_per_second: Optional[float] = None,
continue_on_failure: bool = True,
headless: bool = True,
remove_selectors: Optional[List[str]] = None,
proxy: Optional[Dict[str, str]] = None,
playwright_ws_url: Optional[str] = None,
):
"""Initialize with additional safety parameters and remote browser support."""
proxy_server = proxy.get("server") if proxy else None
if trust_env and not proxy_server:
env_proxies = urllib.request.getproxies()
env_proxy_server = env_proxies.get("https") or env_proxies.get("http")
if env_proxy_server:
if proxy:
proxy["server"] = env_proxy_server
else:
proxy = {"server": env_proxy_server}
# We'll set headless to False if using playwright_ws_url since it's handled by the remote browser
super().__init__(
urls=web_paths,
continue_on_failure=continue_on_failure,
headless=headless if playwright_ws_url is None else False,
remove_selectors=remove_selectors,
proxy=proxy,
)
self.verify_ssl = verify_ssl
self.requests_per_second = requests_per_second
self.last_request_time = None
self.playwright_ws_url = playwright_ws_url
self.trust_env = trust_env
def lazy_load(self) -> Iterator[Document]:
"""Safely load URLs synchronously with support for remote browser."""
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
# Use remote browser if ws_endpoint is provided, otherwise use local browser
if self.playwright_ws_url:
browser = p.chromium.connect(self.playwright_ws_url)
else:
browser = p.chromium.launch(headless=self.headless, proxy=self.proxy)
for url in self.urls:
try:
self._safe_process_url_sync(url)
page = browser.new_page()
response = page.goto(url)
if response is None:
raise ValueError(f"page.goto() returned None for url {url}")
text = self.evaluator.evaluate(page, browser, response)
metadata = {"source": url}
yield Document(page_content=text, metadata=metadata)
except Exception as e:
if self.continue_on_failure:
log.exception(e, "Error loading %s", url)
continue
raise e
browser.close()
async def alazy_load(self) -> AsyncIterator[Document]:
"""Safely load URLs asynchronously with support for remote browser."""
from playwright.async_api import async_playwright
async with async_playwright() as p:
# Use remote browser if ws_endpoint is provided, otherwise use local browser
if self.playwright_ws_url:
browser = await p.chromium.connect(self.playwright_ws_url)
else:
browser = await p.chromium.launch(
headless=self.headless, proxy=self.proxy
)
for url in self.urls:
try:
await self._safe_process_url(url)
page = await browser.new_page()
response = await page.goto(url)
if response is None:
raise ValueError(f"page.goto() returned None for url {url}")
text = await self.evaluator.evaluate_async(page, browser, response)
metadata = {"source": url}
yield Document(page_content=text, metadata=metadata)
except Exception as e:
if self.continue_on_failure:
log.exception(e, "Error loading %s", url)
continue
raise e
await browser.close()
def _verify_ssl_cert(self, url: str) -> bool:
return verify_ssl_cert(url)
async def _wait_for_rate_limit(self):
"""Wait to respect the rate limit if specified."""
if self.requests_per_second and self.last_request_time:
min_interval = timedelta(seconds=1.0 / self.requests_per_second)
time_since_last = datetime.now() - self.last_request_time
if time_since_last < min_interval:
await asyncio.sleep((min_interval - time_since_last).total_seconds())
self.last_request_time = datetime.now()
def _sync_wait_for_rate_limit(self):
"""Synchronous version of rate limit wait."""
if self.requests_per_second and self.last_request_time:
min_interval = timedelta(seconds=1.0 / self.requests_per_second)
time_since_last = datetime.now() - self.last_request_time
if time_since_last < min_interval:
time.sleep((min_interval - time_since_last).total_seconds())
self.last_request_time = datetime.now()
async def _safe_process_url(self, url: str) -> bool:
"""Perform safety checks before processing a URL."""
if self.verify_ssl and not self._verify_ssl_cert(url):
raise ValueError(f"SSL certificate verification failed for {url}")
await self._wait_for_rate_limit()
return True
def _safe_process_url_sync(self, url: str) -> bool:
"""Synchronous version of safety checks."""
if self.verify_ssl and not self._verify_ssl_cert(url):
raise ValueError(f"SSL certificate verification failed for {url}")
self._sync_wait_for_rate_limit()
return True
class SafeWebBaseLoader(WebBaseLoader):
"""WebBaseLoader with enhanced error handling for URLs."""
def __init__(self, trust_env: bool = False, *args, **kwargs):
"""Initialize SafeWebBaseLoader
Args:
trust_env (bool, optional): set to True if using proxy to make web requests, for example
using http(s)_proxy environment variables. Defaults to False.
"""
super().__init__(*args, **kwargs)
self.trust_env = trust_env
async def _fetch(
self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
) -> str:
async with aiohttp.ClientSession(trust_env=self.trust_env) as session:
for i in range(retries):
try:
kwargs: Dict = dict(
headers=self.session.headers,
cookies=self.session.cookies.get_dict(),
)
if not self.session.verify:
kwargs["ssl"] = False
async with session.get(
url, **(self.requests_kwargs | kwargs)
) as response:
if self.raise_for_status:
response.raise_for_status()
return await response.text()
except aiohttp.ClientConnectionError as e:
if i == retries - 1:
raise
else:
log.warning(
f"Error fetching {url} with attempt "
f"{i + 1}/{retries}: {e}. Retrying..."
)
await asyncio.sleep(cooldown * backoff**i)
raise ValueError("retry count exceeded")
def _unpack_fetch_results(
self, results: Any, urls: List[str], parser: Union[str, None] = None
) -> List[Any]:
"""Unpack fetch results into BeautifulSoup objects."""
from bs4 import BeautifulSoup
final_results = []
for i, result in enumerate(results):
url = urls[i]
if parser is None:
if url.endswith(".xml"):
parser = "xml"
else:
parser = self.default_parser
self._check_parser(parser)
final_results.append(BeautifulSoup(result, parser, **self.bs_kwargs))
return final_results
async def ascrape_all(
self, urls: List[str], parser: Union[str, None] = None
) -> List[Any]:
"""Async fetch all urls, then return soups for all results."""
results = await self.fetch_all(urls)
return self._unpack_fetch_results(results, urls, parser=parser)
def lazy_load(self) -> Iterator[Document]:
"""Lazy load text from the url(s) in web_path with error handling."""
for path in self.web_paths:
@@ -65,33 +467,72 @@ class SafeWebBaseLoader(WebBaseLoader):
text = soup.get_text(**self.bs_get_text_kwargs)
# Build metadata
metadata = {"source": path}
if title := soup.find("title"):
metadata["title"] = title.get_text()
if description := soup.find("meta", attrs={"name": "description"}):
metadata["description"] = description.get(
"content", "No description found."
)
if html := soup.find("html"):
metadata["language"] = html.get("lang", "No language found.")
metadata = extract_metadata(soup, path)
yield Document(page_content=text, metadata=metadata)
except Exception as e:
# Log the error and continue with the next URL
log.error(f"Error loading {path}: {e}")
log.exception(e, "Error loading %s", path)
async def alazy_load(self) -> AsyncIterator[Document]:
"""Async lazy load text from the url(s) in web_path."""
results = await self.ascrape_all(self.web_paths)
for path, soup in zip(self.web_paths, results):
text = soup.get_text(**self.bs_get_text_kwargs)
metadata = {"source": path}
if title := soup.find("title"):
metadata["title"] = title.get_text()
if description := soup.find("meta", attrs={"name": "description"}):
metadata["description"] = description.get(
"content", "No description found."
)
if html := soup.find("html"):
metadata["language"] = html.get("lang", "No language found.")
yield Document(page_content=text, metadata=metadata)
async def aload(self) -> list[Document]:
"""Load data into Document objects."""
return [document async for document in self.alazy_load()]
RAG_WEB_LOADER_ENGINES = defaultdict(lambda: SafeWebBaseLoader)
RAG_WEB_LOADER_ENGINES["playwright"] = SafePlaywrightURLLoader
RAG_WEB_LOADER_ENGINES["safe_web"] = SafeWebBaseLoader
RAG_WEB_LOADER_ENGINES["firecrawl"] = SafeFireCrawlLoader
def get_web_loader(
urls: Union[str, Sequence[str]],
verify_ssl: bool = True,
requests_per_second: int = 2,
trust_env: bool = False,
):
# Check if the URL is valid
if not validate_url(urls):
raise ValueError(ERROR_MESSAGES.INVALID_URL)
return SafeWebBaseLoader(
urls,
verify_ssl=verify_ssl,
requests_per_second=requests_per_second,
continue_on_failure=True,
# Check if the URLs are valid
safe_urls = safe_validate_urls([urls] if isinstance(urls, str) else urls)
web_loader_args = {
"web_paths": safe_urls,
"verify_ssl": verify_ssl,
"requests_per_second": requests_per_second,
"continue_on_failure": True,
"trust_env": trust_env,
}
if PLAYWRIGHT_WS_URI.value:
web_loader_args["playwright_ws_url"] = PLAYWRIGHT_WS_URI.value
if RAG_WEB_LOADER_ENGINE.value == "firecrawl":
web_loader_args["api_key"] = FIRECRAWL_API_KEY.value
web_loader_args["api_url"] = FIRECRAWL_API_BASE_URL.value
# Create the appropriate WebLoader based on the configuration
WebLoaderClass = RAG_WEB_LOADER_ENGINES[RAG_WEB_LOADER_ENGINE.value]
web_loader = WebLoaderClass(**web_loader_args)
log.debug(
"Using RAG_WEB_LOADER_ENGINE %s for %s URLs",
web_loader.__class__.__name__,
len(safe_urls),
)
return web_loader
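
A hedged usage sketch of the new loader selection: with RAG_WEB_LOADER_ENGINE left at its default, the defaultdict resolves to SafeWebBaseLoader, and invalid URLs are dropped by safe_validate_urls rather than raising. The URLs below are illustrative.

loader = get_web_loader(
    ["https://example.com", "not a url"],
    verify_ssl=True,
    requests_per_second=2,
    trust_env=False,
)
docs = loader.load()  # list[Document] for the one URL that passed validation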