refac: dedup

This commit is contained in:
Timothy Jaeryang Baek 2025-02-26 23:51:39 -08:00
parent 0a4dbf7cf0
commit ce7cf62a55

View File

@@ -5,6 +5,7 @@ from typing import Optional, Union
import asyncio import asyncio
import requests import requests
import hashlib
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from langchain.retrievers import ContextualCompressionRetriever, EnsembleRetriever from langchain.retrievers import ContextualCompressionRetriever, EnsembleRetriever
@@ -175,46 +176,41 @@ def merge_get_results(get_results: list[dict]) -> dict:
def merge_and_sort_query_results(
    query_results: list[dict], k: int, reverse: bool = False
) -> dict:
    """Merge several query results into one deduplicated, ranked top-k result.

    Each entry of ``query_results`` is read through its ``"distances"``,
    ``"documents"`` and ``"metadatas"`` keys; only the first element
    (``[0]``) of each of those values is used, and the three lists are
    walked in lockstep.

    Documents with identical text are collapsed to a single entry. When the
    same text occurs more than once, the best-ranked occurrence under the
    requested ordering is kept (smallest distance by default, largest when
    ``reverse`` is true), so the outcome does not depend on the order of
    ``query_results``. Entries whose document is not a ``str`` are skipped.

    Args:
        query_results: Result dicts to merge.
        k: Maximum number of entries to return. Values below zero are
            treated as zero.
        reverse: Sort by descending distance instead of ascending.

    Returns:
        A dict with keys ``"distances"``, ``"documents"`` and
        ``"metadatas"``, each holding a single inner list of at most ``k``
        items in ranked order.
    """
    # Keyed by the document text itself: a dict already hashes its keys, so a
    # separate digest adds cost (and collision risk) without adding value.
    best_by_document: dict[str, tuple] = {}

    for data in query_results:
        distances = data["distances"][0]
        documents = data["documents"][0]
        metadatas = data["metadatas"][0]

        for distance, document, metadata in zip(distances, documents, metadatas):
            if not isinstance(document, str):
                continue

            current = best_by_document.get(document)
            if current is None:
                is_better = True
            elif reverse:
                is_better = distance > current[0]
            else:
                is_better = distance < current[0]

            if is_better:
                best_by_document[document] = (distance, document, metadata)

    # sorted() is stable, so equally scored entries keep first-seen order.
    ranked = sorted(
        best_by_document.values(), key=lambda item: item[0], reverse=reverse
    )

    # Clamp k so that 0 yields an empty result and a negative value cannot
    # slice from the tail of the ranking.
    top = ranked[: max(k, 0)]

    # Build the columns directly; unpacking zip(*top) fails when top is empty.
    return {
        "distances": [[item[0] for item in top]],
        "documents": [[item[1] for item in top]],
        "metadatas": [[item[2] for item in top]],
    }
def get_all_items_from_collections(collection_names: list[str]) -> dict: def get_all_items_from_collections(collection_names: list[str]) -> dict:
results = [] results = []