mirror of https://github.com/open-webui/open-webui (synced 2025-03-03 19:07:21 +00:00)
refac: dedup

parent 0a4dbf7cf0
commit ce7cf62a55
@@ -5,6 +5,7 @@ from typing import Optional, Union
 
 import asyncio
 import requests
+import hashlib
 
 from huggingface_hub import snapshot_download
 from langchain.retrievers import ContextualCompressionRetriever, EnsembleRetriever
@@ -175,46 +176,41 @@ def merge_get_results(get_results: list[dict]) -> dict:
 
 def merge_and_sort_query_results(
     query_results: list[dict], k: int, reverse: bool = False
-) -> list[dict]:
+) -> dict:
     # Initialize lists to store combined data
-    combined_distances = []
-    combined_documents = []
-    combined_metadatas = []
+    combined = []
+    seen_hashes = set()  # To store unique document hashes
 
     for data in query_results:
-        combined_distances.extend(data["distances"][0])
-        combined_documents.extend(data["documents"][0])
-        combined_metadatas.extend(data["metadatas"][0])
+        distances = data["distances"][0]
+        documents = data["documents"][0]
+        metadatas = data["metadatas"][0]
 
-    # Create a list of tuples (distance, document, metadata)
-    combined = list(zip(combined_distances, combined_documents, combined_metadatas))
+        for distance, document, metadata in zip(distances, documents, metadatas):
+            if isinstance(document, str):
+                doc_hash = hashlib.md5(
+                    document.encode()
+                ).hexdigest()  # Compute a hash for uniqueness
+
+                if doc_hash not in seen_hashes:
+                    seen_hashes.add(doc_hash)
+                    combined.append((distance, document, metadata))
 
     # Sort the list based on distances
     combined.sort(key=lambda x: x[0], reverse=reverse)
 
-    # We don't have anything :-(
-    if not combined:
-        sorted_distances = []
-        sorted_documents = []
-        sorted_metadatas = []
-    else:
-        # Unzip the sorted list
-        sorted_distances, sorted_documents, sorted_metadatas = zip(*combined)
-
-        # Slicing the lists to include only k elements
-        sorted_distances = list(sorted_distances)[:k]
-        sorted_documents = list(sorted_documents)[:k]
-        sorted_metadatas = list(sorted_metadatas)[:k]
+    # Slice to keep only the top k elements
+    sorted_distances, sorted_documents, sorted_metadatas = (
+        zip(*combined[:k]) if combined else ([], [], [])
+    )
 
-    # Create the output dictionary
-    result = {
-        "distances": [sorted_distances],
-        "documents": [sorted_documents],
-        "metadatas": [sorted_metadatas],
+    # Create and return the output dictionary
+    return {
+        "distances": [list(sorted_distances)],
+        "documents": [list(sorted_documents)],
+        "metadatas": [list(sorted_metadatas)],
     }
-
-    return result
 
 
 def get_all_items_from_collections(collection_names: list[str]) -> dict:
     results = []
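
For context on what the dedup buys, below is a minimal standalone sketch: it copies the refactored helper out of the diff and calls it with two invented query results that share one document. The sample inputs and the flat script layout are illustrative assumptions, not part of the commit.

import hashlib


# Copy of the refactored helper from the diff above, lifted out of the module
# so its behaviour can be exercised on its own.
def merge_and_sort_query_results(query_results, k, reverse=False):
    combined = []
    seen_hashes = set()  # Unique document hashes seen so far

    for data in query_results:
        distances = data["distances"][0]
        documents = data["documents"][0]
        metadatas = data["metadatas"][0]

        for distance, document, metadata in zip(distances, documents, metadatas):
            if isinstance(document, str):
                doc_hash = hashlib.md5(document.encode()).hexdigest()
                if doc_hash not in seen_hashes:
                    seen_hashes.add(doc_hash)
                    combined.append((distance, document, metadata))

    combined.sort(key=lambda x: x[0], reverse=reverse)
    sorted_distances, sorted_documents, sorted_metadatas = (
        zip(*combined[:k]) if combined else ([], [], [])
    )
    return {
        "distances": [list(sorted_distances)],
        "documents": [list(sorted_documents)],
        "metadatas": [list(sorted_metadatas)],
    }


# Invented results from two collections that both return "alpha"; before this
# commit the duplicate would survive the merge.
results_a = {
    "distances": [[0.1, 0.4]],
    "documents": [["alpha", "beta"]],
    "metadatas": [[{"source": "a"}, {"source": "a"}]],
}
results_b = {
    "distances": [[0.2, 0.3]],
    "documents": [["alpha", "gamma"]],
    "metadatas": [[{"source": "b"}, {"source": "b"}]],
}

merged = merge_and_sort_query_results([results_a, results_b], k=3)
print(merged["documents"])  # [['alpha', 'gamma', 'beta']] -- 'alpha' kept once

Note that the hash check keeps the first occurrence of a document encountered in the input list, not necessarily the one with the smallest distance, so the duplicate's distance (0.2 here) is simply discarded.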