From fb30b667e23a5cd839f8f8a16be64060ecd5fc0b Mon Sep 17 00:00:00 2001
From: David Tippett
Date: Tue, 29 Oct 2024 20:28:37 -0400
Subject: [PATCH 1/7] First pass at an OpenSearch connector

---
 .../open_webui/apps/rag/vector/connector.py   |  12 ++
 .../apps/retrieval/vector/dbs/opensearch.py   | 150 ++++++++++++++++++
 backend/open_webui/config.py                  |   7 +
 3 files changed, 169 insertions(+)
 create mode 100644 backend/open_webui/apps/rag/vector/connector.py
 create mode 100644 backend/open_webui/apps/retrieval/vector/dbs/opensearch.py

diff --git a/backend/open_webui/apps/rag/vector/connector.py b/backend/open_webui/apps/rag/vector/connector.py
new file mode 100644
index 000000000..9b9b6089e
--- /dev/null
+++ b/backend/open_webui/apps/rag/vector/connector.py
@@ -0,0 +1,12 @@
+from open_webui.apps.rag.vector.dbs.chroma import ChromaClient
+from open_webui.apps.rag.vector.dbs.milvus import MilvusClient
+
+
+from open_webui.config import VECTOR_DB
+
+if VECTOR_DB == "milvus":
+    VECTOR_DB_CLIENT = MilvusClient()
+elif VECTOR_DB == "opensearch":
+    VECTOR_DB_CLIENT = OpenSearchClient()
+else:
+    VECTOR_DB_CLIENT = ChromaClient()
diff --git a/backend/open_webui/apps/retrieval/vector/dbs/opensearch.py b/backend/open_webui/apps/retrieval/vector/dbs/opensearch.py
new file mode 100644
index 000000000..24ed1324a
--- /dev/null
+++ b/backend/open_webui/apps/retrieval/vector/dbs/opensearch.py
@@ -0,0 +1,150 @@
+from opensearchpy import OpenSearch
+from typing import Optional
+
+from open_webui.apps.rag.vector.main import VectorItem, SearchResult, GetResult
+from open_webui.config import (
+    OPENSEARCH_URI,  # Assuming you define OPENSEARCH_URI in config
+)
+
+class OpenSearchClient:
+    def __init__(self):
+        self.index_prefix = "open_webui"
+        self.client = OpenSearch(
+            hosts=[config["OPENSEARCH_URI"]],
+            use_ssl=OPENSEARCH_SSL,
+            verify_certs=OPENSEARCH_CERT_VERIFY,
+            http_auth=(OPENSEARCH_USERNAME,OPENSEARCH_PASSWORD),
+        )
+
+    def _result_to_get_result(self, result) -> GetResult:
+        ids = []
+        documents = []
+        metadatas = []
+
+        for hit in result['hits']['hits']:
+            ids.append(hit['_id'])
+            documents.append(hit['_source'].get("text"))
+            metadatas.append(hit['_source'].get("metadata"))
+
+        return GetResult(ids=ids, documents=documents, metadatas=metadatas)
+
+    def _result_to_search_result(self, result) -> SearchResult:
+        ids = []
+        distances = []
+        documents = []
+        metadatas = []
+
+        for hit in result['hits']['hits']:
+            ids.append(hit['_id'])
+            distances.append(hit['_score'])
+            documents.append(hit['_source'].get("text"))
+            metadatas.append(hit['_source'].get("metadata"))
+
+        return SearchResult(ids=ids, distances=distances, documents=documents, metadatas=metadatas)
+
+    def _create_index(self, index_name: str, dimension: int):
+        body = {
+            "mappings": {
+                "properties": {
+                    "id": {"type": "keyword"},
+                    "vector": {
+                        "type": "dense_vector",
+                        "dims": dimension,  # Adjust based on your vector dimensions
+                        "index": True,
+                        "similarity": "faiss",
+                        "method": {
+                            "name": "hnsw",
+                            "space_type": "ip",  # Use inner product to approximate cosine similarity
+                            "engine": "faiss",
+                            "ef_construction": 128,
+                            "m": 16
+                        }
+                    },
+                    "text": {"type": "text"},
+                    "metadata": {"type": "object"}
+                }
+            }
+        }
+        self.client.indices.create(index=f"{self.index_prefix}_{index_name}", body=body)
+
+    def _create_batches(self, items: list[VectorItem], batch_size=100):
+        for i in range(0, len(items), batch_size):
+            yield items[i:i + batch_size]
+
+    def has_collection(self, index_name: str) -> bool:
+        # has_collection here means has index.
+        # We are simply adapting to the norms of the other DBs.
+        return self.client.indices.exists(index=f"{self.index_prefix}_{index_name}")
+
+    def delete_collection(self, index_name: str):
+        # delete_collection here means delete index.
+        # We are simply adapting to the norms of the other DBs.
+        self.client.indices.delete(index=f"{self.index_prefix}_{index_name}")
+
+    def search(self, index_name: str, vectors: list[list[float]], limit: int) -> Optional[SearchResult]:
+        query = {
+            "size": limit,
+            "_source": ["text", "metadata"],
+            "query": {
+                "script_score": {
+                    "query": {"match_all": {}},
+                    "script": {
+                        "source": "cosineSimilarity(params.vector, 'vector') + 1.0",
+                        "params": {"vector": vectors[0]}  # Assuming single query vector
+                    }
+                }
+            }
+        }
+
+        result = self.client.search(
+            index=f"{self.index_prefix}_{index_name}",
+            body=query
+        )
+
+        return self._result_to_search_result(result)
+
+
+    def get_or_create_index(self, index_name: str, dimension: int):
+        if not self.has_index(index_name):
+            self._create_index(index_name, dimension)
+
+
+    def get(self, index_name: str) -> Optional[GetResult]:
+        query = {
+            "query": {"match_all": {}},
+            "_source": ["text", "metadata"]
+        }
+
+        result = self.client.search(index=f"{self.index_prefix}_{index_name}", body=query)
+        return self._result_to_get_result(result)
+
+    def insert(self, index_name: str, items: list[VectorItem]):
+        if not self.has_index(index_name):
+            self._create_index(index_name, dimension=len(items[0]["vector"]))
+
+        for batch in self._create_batches(items):
+            actions = [
+                {"index": {"_id": item["id"], "_source": {"vector": item["vector"], "text": item["text"], "metadata": item["metadata"]}}}
+                for item in batch
+            ]
+            self.client.bulk(actions)
+
+    def upsert(self, index_name: str, items: list[VectorItem]):
+        if not self.has_index(index_name):
+            self._create_index(index_name, dimension=len(items[0]["vector"]))
+
+        for batch in self._create_batches(items):
+            actions = [
+                {"index": {"_id": item["id"], "_source": {"vector": item["vector"], "text": item["text"], "metadata": item["metadata"]}}}
+                for item in batch
+            ]
+            self.client.bulk(actions)
+
+    def delete(self, index_name: str, ids: list[str]):
+        actions = [{"delete": {"_index": f"{self.index_prefix}_{index_name}", "_id": id}} for id in ids]
+        self.client.bulk(body=actions)
+
+    def reset(self):
+        indices = self.client.indices.get(index=f"{self.index_prefix}_*")
+        for index in indices:
+            self.client.indices.delete(index=index)
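Note (reviewer sketch, not part of the patch): the _create_index mapping above uses the Elasticsearch-style "dense_vector" type plus a "similarity" field, and search() calls the Painless cosineSimilarity() helper; as far as I know, OpenSearch ships neither of these. Its k-NN plugin expects a "knn_vector" field and typically scores exact vector queries through the "knn_score" scoring script (or a "knn" query against an HNSW index). The Python below is a minimal illustration of that shape, assuming opensearch-py 2.x against a local node; the index name, the dimension of 384, and the host URL are placeholders rather than values taken from the PR.

    from opensearchpy import OpenSearch

    client = OpenSearch(hosts=["https://localhost:9200"])  # placeholder dev node

    index_body = {
        "settings": {"index": {"knn": True}},  # enable the k-NN plugin for this index
        "mappings": {
            "properties": {
                "vector": {
                    "type": "knn_vector",              # OpenSearch vector field type
                    "dimension": 384,                  # must match the embedding size
                    "method": {
                        "name": "hnsw",
                        "space_type": "innerproduct",  # inner product, mirroring the patch's intent
                        "engine": "faiss",
                        "parameters": {"ef_construction": 128, "m": 16},
                    },
                },
                "text": {"type": "text"},
                "metadata": {"type": "object"},
            }
        },
    }
    client.indices.create(index="open_webui_example", body=index_body)

    # Exact scoring without the Elasticsearch-only cosineSimilarity() function.
    query = {
        "size": 10,
        "_source": ["text", "metadata"],
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    "source": "knn_score",
                    "lang": "knn",
                    "params": {
                        "field": "vector",
                        "query_value": [0.0] * 384,  # the query embedding
                        "space_type": "cosinesimil",
                    },
                },
            }
        },
    }
    result = client.search(index="open_webui_example", body=query)

Which space types each engine accepts and where the HNSW parameters live does vary across OpenSearch versions, so the exact values would need to be checked against the target cluster.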
diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py
index 06ac258f9..dd6215692 100644
--- a/backend/open_webui/config.py
+++ b/backend/open_webui/config.py
@@ -957,6 +957,13 @@ MILVUS_URI = os.environ.get("MILVUS_URI", f"{DATA_DIR}/vector_db/milvus.db")
 # Qdrant
 QDRANT_URI = os.environ.get("QDRANT_URI", None)
 
+# OpenSearch
+OPENSEARCH_URI = os.environ.get("OPENSEARCH_URI", "https://localhost:9200")
+OPENSEARCH_SSL = os.environ.get("OPENSEARCH_SSL", True)
+OPENSEARCH_CERT_VERIFY = os.environ.get("OPENSEARCH_CERT_VERIFY", False)
+OPENSEARCH_USERNAME = os.environ.get("OPENSEARCH_USERNAME", None)
+OPENSEARCH_PASSWORD = os.environ.get("OPENSEARCH_PASSWORD", None)
+
 ####################################
 # Information Retrieval (RAG)
 ####################################
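Note (illustrative sketch, not part of the patch): os.environ.get() returns a string whenever the variable is set, and non-empty strings such as "false" or "0" are truthy in Python, so exporting OPENSEARCH_SSL=false would still leave use_ssl enabled. A small helper along these lines is one common way to normalise boolean-ish settings; the function name is made up for the example.

    import os

    def env_bool(name: str, default: bool) -> bool:
        value = os.environ.get(name)
        if value is None:
            return default
        return value.strip().lower() in ("1", "true", "yes", "on")

    OPENSEARCH_SSL = env_bool("OPENSEARCH_SSL", True)
    OPENSEARCH_CERT_VERIFY = env_bool("OPENSEARCH_CERT_VERIFY", False)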
From a39ddfd4f1028be0a53c11e7ba1b54c6cc609801 Mon Sep 17 00:00:00 2001
From: David Tippett
Date: Wed, 30 Oct 2024 00:43:09 -0400
Subject: [PATCH 2/7] Adding OpenSearch Python dependency

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index b631e153a..4eddd0ac6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -49,6 +49,7 @@ dependencies = [
     "fake-useragent==1.5.1",
     "chromadb==0.5.9",
     "pymilvus==2.4.7",
+    "opensearch-py==2.7.1"
 
     "sentence-transformers==3.2.0",
     "colbert-ai==0.2.21",

From 6aec971ddbfbea1aa0f53ade0d2c79b3fc5be569 Mon Sep 17 00:00:00 2001
From: David Tippett
Date: Wed, 30 Oct 2024 00:45:55 -0400
Subject: [PATCH 3/7] Updating requirements.txt

---
 backend/requirements.txt | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/backend/requirements.txt b/backend/requirements.txt
index 217a711ab..46b9bc28d 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -40,9 +40,10 @@ langchain-community==0.2.12
 langchain-chroma==0.1.4
 
 fake-useragent==1.5.1
-chromadb==0.5.15
-pymilvus==2.4.7
+chromadb==0.5.5
+pymilvus==2.4.6
 qdrant-client~=1.12.0
+opensearch-py==2.7.1
 
 sentence-transformers==3.2.0
 colbert-ai==0.2.21

From ef7a9e24670baf20f1cd3a495bdf0bf17e2a80d2 Mon Sep 17 00:00:00 2001
From: David Tippett
Date: Fri, 1 Nov 2024 21:47:05 -0400
Subject: [PATCH 4/7] Update chromadb and pymilvus versions

Fixing a botched merge change.
---
 backend/requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backend/requirements.txt b/backend/requirements.txt
index 46b9bc28d..dcdbb1a14 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -40,8 +40,8 @@ langchain-community==0.2.12
 langchain-chroma==0.1.4
 
 fake-useragent==1.5.1
-chromadb==0.5.5
-pymilvus==2.4.6
+chromadb==0.5.15
+pymilvus==2.4.7
 qdrant-client~=1.12.0
 opensearch-py==2.7.1

From 16089ab9474bb0aa4d2e1d780ec323a8d575bdba Mon Sep 17 00:00:00 2001
From: David Tippett
Date: Mon, 4 Nov 2024 14:59:50 -0500
Subject: [PATCH 5/7] Fix formatting in pyproject.toml dependencies list

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 4eddd0ac6..19b8c2c29 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -49,7 +49,7 @@ dependencies = [
     "fake-useragent==1.5.1",
     "chromadb==0.5.9",
     "pymilvus==2.4.7",
-    "opensearch-py==2.7.1"
+    "opensearch-py==2.7.1",
 
     "sentence-transformers==3.2.0",
    "colbert-ai==0.2.21",
From afca48028f242ff58f3c4cb25e566a17f52ca8d0 Mon Sep 17 00:00:00 2001
From: David Tippett
Date: Mon, 4 Nov 2024 15:10:14 -0500
Subject: [PATCH 6/7] Moving new config under retrieval

---
 backend/open_webui/apps/rag/vector/connector.py   | 12 ------------
 .../open_webui/apps/retrieval/vector/connector.py |  4 ++++
 2 files changed, 4 insertions(+), 12 deletions(-)
 delete mode 100644 backend/open_webui/apps/rag/vector/connector.py

diff --git a/backend/open_webui/apps/rag/vector/connector.py b/backend/open_webui/apps/rag/vector/connector.py
deleted file mode 100644
index 9b9b6089e..000000000
--- a/backend/open_webui/apps/rag/vector/connector.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from open_webui.apps.rag.vector.dbs.chroma import ChromaClient
-from open_webui.apps.rag.vector.dbs.milvus import MilvusClient
-
-
-from open_webui.config import VECTOR_DB
-
-if VECTOR_DB == "milvus":
-    VECTOR_DB_CLIENT = MilvusClient()
-elif VECTOR_DB == "opensearch":
-    VECTOR_DB_CLIENT = OpenSearchClient()
-else:
-    VECTOR_DB_CLIENT = ChromaClient()
diff --git a/backend/open_webui/apps/retrieval/vector/connector.py b/backend/open_webui/apps/retrieval/vector/connector.py
index c7f00f5fd..b95b55783 100644
--- a/backend/open_webui/apps/retrieval/vector/connector.py
+++ b/backend/open_webui/apps/retrieval/vector/connector.py
@@ -8,6 +8,10 @@ elif VECTOR_DB == "qdrant":
     from open_webui.apps.retrieval.vector.dbs.qdrant import QdrantClient
 
     VECTOR_DB_CLIENT = QdrantClient()
+elif VECTOR_DB == "opensearch":
+    from open_webui.apps.retrieval.vector.dbs.opensearch import OpenSearchClient
+
+    VECTOR_DB_CLIENT = OpenSearchClient()
 else:
     from open_webui.apps.retrieval.vector.dbs.chroma import ChromaClient
 

From 705e3129b69b6530546ca490508671162b2ebf67 Mon Sep 17 00:00:00 2001
From: David Tippett
Date: Mon, 4 Nov 2024 15:14:53 -0500
Subject: [PATCH 7/7] Updating config format.

---
 .../apps/retrieval/vector/dbs/opensearch.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/backend/open_webui/apps/retrieval/vector/dbs/opensearch.py b/backend/open_webui/apps/retrieval/vector/dbs/opensearch.py
index 24ed1324a..00027dfb9 100644
--- a/backend/open_webui/apps/retrieval/vector/dbs/opensearch.py
+++ b/backend/open_webui/apps/retrieval/vector/dbs/opensearch.py
@@ -3,17 +3,21 @@ from typing import Optional
 
 from open_webui.apps.rag.vector.main import VectorItem, SearchResult, GetResult
 from open_webui.config import (
-    OPENSEARCH_URI,  # Assuming you define OPENSEARCH_URI in config
+    OPENSEARCH_URI,
+    OPENSEARCH_SSL,
+    OPENSEARCH_CERT_VERIFY,
+    OPENSEARCH_USERNAME,
+    OPENSEARCH_PASSWORD
 )
 
 class OpenSearchClient:
     def __init__(self):
         self.index_prefix = "open_webui"
         self.client = OpenSearch(
-            hosts=[config["OPENSEARCH_URI"]],
+            hosts=[OPENSEARCH_URI],
             use_ssl=OPENSEARCH_SSL,
             verify_certs=OPENSEARCH_CERT_VERIFY,
-            http_auth=(OPENSEARCH_USERNAME,OPENSEARCH_PASSWORD),
+            http_auth=(OPENSEARCH_USERNAME, OPENSEARCH_PASSWORD),
        )
 
     def _result_to_get_result(self, result) -> GetResult:
@@ -103,12 +107,10 @@ class OpenSearchClient:
         return self._result_to_search_result(result)
 
-
     def get_or_create_index(self, index_name: str, dimension: int):
         if not self.has_index(index_name):
             self._create_index(index_name, dimension)
 
-
     def get(self, index_name: str) -> Optional[GetResult]:
         query = {
             "query": {"match_all": {}},
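Closing note (sketch, not part of the series): once these patches are applied, setting VECTOR_DB=opensearch together with the OPENSEARCH_* variables makes the retrieval connector instantiate OpenSearchClient at import time. One detail worth double-checking in insert()/upsert() is the bulk request shape: the _bulk endpoint expects the action metadata and the document as separate entries rather than a nested "_source" key inside the action line. A typical way to build such a request with opensearch-py is sketched below; the index name, host, and sample document are placeholders, not values from the PR.

    from opensearchpy import OpenSearch, helpers

    client = OpenSearch(hosts=["https://localhost:9200"])  # placeholder dev node
    index = "open_webui_example"                            # placeholder index name

    items = [
        {"id": "doc-1", "vector": [0.1, 0.2, 0.3], "text": "hello", "metadata": {"source": "demo"}},
    ]

    # Raw bulk body: one action line followed by one document line per item.
    body = []
    for item in items:
        body.append({"index": {"_index": index, "_id": item["id"]}})
        body.append({"vector": item["vector"], "text": item["text"], "metadata": item["metadata"]})
    client.bulk(body=body)

    # Or let the bundled helper handle chunking of large batches.
    helpers.bulk(
        client,
        (
            {
                "_index": index,
                "_id": item["id"],
                "_source": {"vector": item["vector"], "text": item["text"], "metadata": item["metadata"]},
            }
            for item in items
        ),
    )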