Fix: Normalize all database distances to a score in [0, 1]

This commit is contained in:
Marko Henning
2025-03-25 16:46:14 +01:00
parent 8aa6dade41
commit 94d9d3d590
6 changed files with 22 additions and 25 deletions

View File

@@ -75,10 +75,16 @@ class ChromaClient:
n_results=limit,
)
# chromadb returns cosine distance, 2 (worst) -> 0 (best). Re-ordering to a score of 0 (worst) -> 1 (best)
# https://docs.trychroma.com/docs/collections/configure cosine equation
distances: list = result["distances"][0]
distances = [2 - dist for dist in distances]
distances = [[dist/2 for dist in distances]]
return SearchResult(
**{
"ids": result["ids"],
"distances": result["distances"],
"distances": distances,
"documents": result["documents"],
"metadatas": result["metadatas"],
}

View File

@@ -64,7 +64,10 @@ class MilvusClient:
for item in match:
_ids.append(item.get("id"))
_distances.append(item.get("distance"))
# normalize milvus score from [-1, 1] to [0, 1] range
# https://milvus.io/docs/de/metric.md
_dist = (item.get("distance") + 1.0)/2.0
_distances.append(_dist)
_documents.append(item.get("entity", {}).get("data", {}).get("text"))
_metadatas.append(item.get("entity", {}).get("metadata"))

View File

@@ -120,7 +120,7 @@ class OpenSearchClient:
"script_score": {
"query": {"match_all": {}},
"script": {
"source": "cosineSimilarity(params.query_value, doc[params.field]) + 1.0",
"source": "(cosineSimilarity(params.query_value, doc[params.field]) + 1.0) / 2.0",
"params": {
"field": "vector",
"query_value": vectors[0],

View File

@@ -278,7 +278,9 @@ class PgvectorClient:
for row in results:
qid = int(row.qid)
ids[qid].append(row.id)
distances[qid].append(row.distance)
# normalize and re-order the pgvector distance from [2, 0] to a [0, 1] score range
# https://github.com/pgvector/pgvector?tab=readme-ov-file#querying
distances[qid].append((2.0 - row.distance)/2.0)
documents[qid].append(row.text)
metadatas[qid].append(row.vmetadata)

View File

@@ -99,7 +99,8 @@ class QdrantClient:
ids=get_result.ids,
documents=get_result.documents,
metadatas=get_result.metadatas,
distances=[[point.score for point in query_response.points]],
# qdrant distance is [-1, 1], normalize to [0, 1]
distances=[[(point.score + 1.0)/2.0 for point in query_response.points]],
)
def query(self, collection_name: str, filter: dict, limit: Optional[int] = None):