mirror of
https://github.com/open-webui/open-webui
synced 2025-04-29 18:51:03 +00:00
Merge pull request #10272 from mkhludnev/dedupe-rag-docs
fix: dedupe results from multiple queries
This commit is contained in:
commit
8f7528a0bc
@ -138,37 +138,44 @@ def query_doc_with_hybrid_search(
|
|||||||
|
|
||||||
|
|
||||||
def merge_and_sort_query_results(
|
def merge_and_sort_query_results(
|
||||||
query_results: list[dict], k: int, reverse: bool = False
|
query_results: list[dict], k: int, reverse: bool = False
|
||||||
) -> list[dict]:
|
) -> list[dict]:
|
||||||
# Initialize lists to store combined data
|
# Initialize lists to store combined data
|
||||||
combined_distances = []
|
combined_distances = []
|
||||||
combined_documents = []
|
combined_documents = []
|
||||||
combined_metadatas = []
|
combined_metadatas = []
|
||||||
|
combined_ids = []
|
||||||
|
|
||||||
for data in query_results:
|
for data in query_results:
|
||||||
combined_distances.extend(data["distances"][0])
|
combined_distances.extend(data["distances"][0])
|
||||||
combined_documents.extend(data["documents"][0])
|
combined_documents.extend(data["documents"][0])
|
||||||
combined_metadatas.extend(data["metadatas"][0])
|
combined_metadatas.extend(data["metadatas"][0])
|
||||||
|
# DISTINCT(chunk_id, file_id) - in case ids (chunk_ids) become ordinals
|
||||||
|
combined_ids.extend([id + meta["file_id"] for id, meta in zip(data["ids"][0], data["metadatas"][0])])
|
||||||
|
|
||||||
# Create a list of tuples (distance, document, metadata)
|
# Create a list of tuples (distance, document, metadata, ids)
|
||||||
combined = list(zip(combined_distances, combined_documents, combined_metadatas))
|
combined = list(zip(combined_distances, combined_documents, combined_metadatas, combined_ids))
|
||||||
|
|
||||||
# Sort the list based on distances
|
# Sort the list based on distances
|
||||||
combined.sort(key=lambda x: x[0], reverse=reverse)
|
combined.sort(key=lambda x: x[0], reverse=reverse)
|
||||||
|
|
||||||
# We don't have anything :-(
|
sorted_distances = []
|
||||||
if not combined:
|
sorted_documents = []
|
||||||
sorted_distances = []
|
sorted_metadatas = []
|
||||||
sorted_documents = []
|
# The lists above stay empty if we don't have anything :-(
|
||||||
sorted_metadatas = []
|
if combined:
|
||||||
else:
|
|
||||||
# Unzip the sorted list
|
# Unzip the sorted list
|
||||||
sorted_distances, sorted_documents, sorted_metadatas = zip(*combined)
|
all_distances, all_documents, all_metadatas, all_ids = zip(*combined)
|
||||||
|
seen_ids = set()
|
||||||
# Slicing the lists to include only k elements
|
# Slicing the lists to include only k elements
|
||||||
sorted_distances = list(sorted_distances)[:k]
|
for index, id in enumerate(all_ids):
|
||||||
sorted_documents = list(sorted_documents)[:k]
|
if id not in seen_ids:
|
||||||
sorted_metadatas = list(sorted_metadatas)[:k]
|
sorted_distances.append(all_distances[index])
|
||||||
|
sorted_documents.append(all_documents[index])
|
||||||
|
sorted_metadatas.append(all_metadatas[index])
|
||||||
|
seen_ids.add(id)
|
||||||
|
if len(sorted_distances) >= k:
|
||||||
|
break
|
||||||
|
|
||||||
# Create the output dictionary
|
# Create the output dictionary
|
||||||
result = {
|
result = {
|
||||||
|
Loading…
Reference in New Issue
Block a user