Merge remote-tracking branch 'origin/dev' into Anush008/main
Signed-off-by: Anush008 <anushshetty90@gmail.com>
This commit is contained in:
@@ -507,6 +507,7 @@ class MistralLoader:
|
||||
timeout=timeout,
|
||||
headers={"User-Agent": "OpenWebUI-MistralLoader/2.0"},
|
||||
raise_for_status=False, # We handle status codes manually
|
||||
trust_env=True,
|
||||
) as session:
|
||||
yield session
|
||||
|
||||
|
||||
@@ -460,20 +460,19 @@ def get_sources_from_files(
|
||||
)
|
||||
|
||||
extracted_collections = []
|
||||
relevant_contexts = []
|
||||
query_results = []
|
||||
|
||||
for file in files:
|
||||
|
||||
context = None
|
||||
query_result = None
|
||||
if file.get("docs"):
|
||||
# BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL
|
||||
context = {
|
||||
query_result = {
|
||||
"documents": [[doc.get("content") for doc in file.get("docs")]],
|
||||
"metadatas": [[doc.get("metadata") for doc in file.get("docs")]],
|
||||
}
|
||||
elif file.get("context") == "full":
|
||||
# Manual Full Mode Toggle
|
||||
context = {
|
||||
query_result = {
|
||||
"documents": [[file.get("file").get("data", {}).get("content")]],
|
||||
"metadatas": [[{"file_id": file.get("id"), "name": file.get("name")}]],
|
||||
}
|
||||
@@ -500,7 +499,7 @@ def get_sources_from_files(
|
||||
}
|
||||
)
|
||||
|
||||
context = {
|
||||
query_result = {
|
||||
"documents": [documents],
|
||||
"metadatas": [metadatas],
|
||||
}
|
||||
@@ -508,7 +507,7 @@ def get_sources_from_files(
|
||||
elif file.get("id"):
|
||||
file_object = Files.get_file_by_id(file.get("id"))
|
||||
if file_object:
|
||||
context = {
|
||||
query_result = {
|
||||
"documents": [[file_object.data.get("content", "")]],
|
||||
"metadatas": [
|
||||
[
|
||||
@@ -521,7 +520,7 @@ def get_sources_from_files(
|
||||
],
|
||||
}
|
||||
elif file.get("file").get("data"):
|
||||
context = {
|
||||
query_result = {
|
||||
"documents": [[file.get("file").get("data", {}).get("content")]],
|
||||
"metadatas": [
|
||||
[file.get("file").get("data", {}).get("metadata", {})]
|
||||
@@ -549,19 +548,27 @@ def get_sources_from_files(
|
||||
|
||||
if full_context:
|
||||
try:
|
||||
context = get_all_items_from_collections(collection_names)
|
||||
query_result = get_all_items_from_collections(collection_names)
|
||||
except Exception as e:
|
||||
log.exception(e)
|
||||
|
||||
else:
|
||||
try:
|
||||
context = None
|
||||
query_result = None
|
||||
if file.get("type") == "text":
|
||||
context = file["content"]
|
||||
# Not sure when this is used, but it seems to be a fallback
|
||||
query_result = {
|
||||
"documents": [
|
||||
[file.get("file").get("data", {}).get("content")]
|
||||
],
|
||||
"metadatas": [
|
||||
[file.get("file").get("data", {}).get("meta", {})]
|
||||
],
|
||||
}
|
||||
else:
|
||||
if hybrid_search:
|
||||
try:
|
||||
context = query_collection_with_hybrid_search(
|
||||
query_result = query_collection_with_hybrid_search(
|
||||
collection_names=collection_names,
|
||||
queries=queries,
|
||||
embedding_function=embedding_function,
|
||||
@@ -577,8 +584,8 @@ def get_sources_from_files(
|
||||
" non hybrid search as fallback."
|
||||
)
|
||||
|
||||
if (not hybrid_search) or (context is None):
|
||||
context = query_collection(
|
||||
if (not hybrid_search) or (query_result is None):
|
||||
query_result = query_collection(
|
||||
collection_names=collection_names,
|
||||
queries=queries,
|
||||
embedding_function=embedding_function,
|
||||
@@ -589,24 +596,24 @@ def get_sources_from_files(
|
||||
|
||||
extracted_collections.extend(collection_names)
|
||||
|
||||
if context:
|
||||
if query_result:
|
||||
if "data" in file:
|
||||
del file["data"]
|
||||
|
||||
relevant_contexts.append({**context, "file": file})
|
||||
query_results.append({**query_result, "file": file})
|
||||
|
||||
sources = []
|
||||
for context in relevant_contexts:
|
||||
for query_result in query_results:
|
||||
try:
|
||||
if "documents" in context:
|
||||
if "metadatas" in context:
|
||||
if "documents" in query_result:
|
||||
if "metadatas" in query_result:
|
||||
source = {
|
||||
"source": context["file"],
|
||||
"document": context["documents"][0],
|
||||
"metadata": context["metadatas"][0],
|
||||
"source": query_result["file"],
|
||||
"document": query_result["documents"][0],
|
||||
"metadata": query_result["metadatas"][0],
|
||||
}
|
||||
if "distances" in context and context["distances"]:
|
||||
source["distances"] = context["distances"][0]
|
||||
if "distances" in query_result and query_result["distances"]:
|
||||
source["distances"] = query_result["distances"][0]
|
||||
|
||||
sources.append(source)
|
||||
except Exception as e:
|
||||
|
||||
@@ -157,10 +157,10 @@ class OpenSearchClient(VectorDBBase):
|
||||
|
||||
for field, value in filter.items():
|
||||
query_body["query"]["bool"]["filter"].append(
|
||||
{"match": {"metadata." + str(field): value}}
|
||||
{"term": {"metadata." + str(field) + ".keyword": value}}
|
||||
)
|
||||
|
||||
size = limit if limit else 10
|
||||
size = limit if limit else 10000
|
||||
|
||||
try:
|
||||
result = self.client.search(
|
||||
@@ -206,6 +206,7 @@ class OpenSearchClient(VectorDBBase):
|
||||
for item in batch
|
||||
]
|
||||
bulk(self.client, actions)
|
||||
self.client.indices.refresh(self._get_index_name(collection_name))
|
||||
|
||||
def upsert(self, collection_name: str, items: list[VectorItem]):
|
||||
self._create_index_if_not_exists(
|
||||
@@ -228,6 +229,7 @@ class OpenSearchClient(VectorDBBase):
|
||||
for item in batch
|
||||
]
|
||||
bulk(self.client, actions)
|
||||
self.client.indices.refresh(self._get_index_name(collection_name))
|
||||
|
||||
def delete(
|
||||
self,
|
||||
@@ -251,11 +253,12 @@ class OpenSearchClient(VectorDBBase):
|
||||
}
|
||||
for field, value in filter.items():
|
||||
query_body["query"]["bool"]["filter"].append(
|
||||
{"match": {"metadata." + str(field): value}}
|
||||
{"term": {"metadata." + str(field) + ".keyword": value}}
|
||||
)
|
||||
self.client.delete_by_query(
|
||||
index=self._get_index_name(collection_name), body=query_body
|
||||
)
|
||||
self.client.indices.refresh(self._get_index_name(collection_name))
|
||||
|
||||
def reset(self):
|
||||
indices = self.client.indices.get(index=f"{self.index_prefix}_*")
|
||||
|
||||
@@ -18,6 +18,7 @@ from open_webui.config import (
|
||||
QDRANT_ON_DISK,
|
||||
QDRANT_GRPC_PORT,
|
||||
QDRANT_PREFER_GRPC,
|
||||
QDRANT_COLLECTION_PREFIX,
|
||||
)
|
||||
from open_webui.env import SRC_LOG_LEVELS
|
||||
|
||||
@@ -29,7 +30,7 @@ log.setLevel(SRC_LOG_LEVELS["RAG"])
|
||||
|
||||
class QdrantClient(VectorDBBase):
|
||||
def __init__(self):
|
||||
self.collection_prefix = "open-webui"
|
||||
self.collection_prefix = QDRANT_COLLECTION_PREFIX
|
||||
self.QDRANT_URI = QDRANT_URI
|
||||
self.QDRANT_API_KEY = QDRANT_API_KEY
|
||||
self.QDRANT_ON_DISK = QDRANT_ON_DISK
|
||||
@@ -86,6 +87,25 @@ class QdrantClient(VectorDBBase):
|
||||
),
|
||||
)
|
||||
|
||||
# Create payload indexes for efficient filtering
|
||||
self.client.create_payload_index(
|
||||
collection_name=collection_name_with_prefix,
|
||||
field_name="metadata.hash",
|
||||
field_schema=models.KeywordIndexParams(
|
||||
type=models.KeywordIndexType.KEYWORD,
|
||||
is_tenant=False,
|
||||
on_disk=self.QDRANT_ON_DISK,
|
||||
),
|
||||
)
|
||||
self.client.create_payload_index(
|
||||
collection_name=collection_name_with_prefix,
|
||||
field_name="metadata.file_id",
|
||||
field_schema=models.KeywordIndexParams(
|
||||
type=models.KeywordIndexType.KEYWORD,
|
||||
is_tenant=False,
|
||||
on_disk=self.QDRANT_ON_DISK,
|
||||
),
|
||||
)
|
||||
log.info(f"collection {collection_name_with_prefix} successfully created!")
|
||||
|
||||
def _create_collection_if_not_exists(self, collection_name, dimension):
|
||||
|
||||
@@ -9,6 +9,7 @@ from open_webui.config import (
|
||||
QDRANT_ON_DISK,
|
||||
QDRANT_PREFER_GRPC,
|
||||
QDRANT_URI,
|
||||
QDRANT_COLLECTION_PREFIX,
|
||||
)
|
||||
from open_webui.env import SRC_LOG_LEVELS
|
||||
from open_webui.retrieval.vector.main import (
|
||||
@@ -31,7 +32,7 @@ log.setLevel(SRC_LOG_LEVELS["RAG"])
|
||||
|
||||
class QdrantClient(VectorDBBase):
|
||||
def __init__(self):
|
||||
self.collection_prefix = "open-webui"
|
||||
self.collection_prefix = QDRANT_COLLECTION_PREFIX
|
||||
self.QDRANT_URI = QDRANT_URI
|
||||
self.QDRANT_API_KEY = QDRANT_API_KEY
|
||||
self.QDRANT_ON_DISK = QDRANT_ON_DISK
|
||||
|
||||
@@ -36,7 +36,9 @@ def search_brave(
|
||||
|
||||
return [
|
||||
SearchResult(
|
||||
link=result["url"], title=result.get("title"), snippet=result.get("snippet")
|
||||
link=result["url"],
|
||||
title=result.get("title"),
|
||||
snippet=result.get("description"),
|
||||
)
|
||||
for result in results[:count]
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user