Merge pull request #10272 from mkhludnev/dedupe-rag-docs

fix: dedupe results from multiple queries
This commit is contained in:
Timothy Jaeryang Baek 2025-02-18 19:34:33 -08:00 committed by GitHub
commit 8f7528a0bc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -138,37 +138,44 @@ def query_doc_with_hybrid_search(
def merge_and_sort_query_results( def merge_and_sort_query_results(
query_results: list[dict], k: int, reverse: bool = False query_results: list[dict], k: int, reverse: bool = False
) -> list[dict]: ) -> list[dict]:
# Initialize lists to store combined data # Initialize lists to store combined data
combined_distances = [] combined_distances = []
combined_documents = [] combined_documents = []
combined_metadatas = [] combined_metadatas = []
combined_ids = []
for data in query_results: for data in query_results:
combined_distances.extend(data["distances"][0]) combined_distances.extend(data["distances"][0])
combined_documents.extend(data["documents"][0]) combined_documents.extend(data["documents"][0])
combined_metadatas.extend(data["metadatas"][0]) combined_metadatas.extend(data["metadatas"][0])
# DISTINCT(chunk_id, file_id) - in case the ids (chunk_ids) become ordinals
combined_ids.extend([id + meta["file_id"] for id, meta in zip(data["ids"][0], data["metadatas"][0])])
# Create a list of tuples (distance, document, metadata) # Create a list of tuples (distance, document, metadata, ids)
combined = list(zip(combined_distances, combined_documents, combined_metadatas)) combined = list(zip(combined_distances, combined_documents, combined_metadatas, combined_ids))
# Sort the list based on distances # Sort the list based on distances
combined.sort(key=lambda x: x[0], reverse=reverse) combined.sort(key=lambda x: x[0], reverse=reverse)
# We don't have anything :-( sorted_distances = []
if not combined: sorted_documents = []
sorted_distances = [] sorted_metadatas = []
sorted_documents = [] # Otherwise we don't have anything :-(
sorted_metadatas = [] if combined:
else:
# Unzip the sorted list # Unzip the sorted list
sorted_distances, sorted_documents, sorted_metadatas = zip(*combined) all_distances, all_documents, all_metadatas, all_ids = zip(*combined)
seen_ids = set()
# Slicing the lists to include only k elements # Slicing the lists to include only k elements
sorted_distances = list(sorted_distances)[:k] for index, id in enumerate(all_ids):
sorted_documents = list(sorted_documents)[:k] if id not in seen_ids:
sorted_metadatas = list(sorted_metadatas)[:k] sorted_distances.append(all_distances[index])
sorted_documents.append(all_documents[index])
sorted_metadatas.append(all_metadatas[index])
seen_ids.add(id)
if len(sorted_distances) >= k:
break
# Create the output dictionary # Create the output dictionary
result = { result = {