revert: faulty dedup code

2025-04-27 17:51:31 +00:00 · 2025-02-20 11:02:45 -08:00 · 2025-02-20 11:02:45 -08:00 · 93d486d50e
commit 93d486d50e
parent c882aacc23
2 changed files with 16 additions and 29 deletions
--- a/backend/open_webui/config.py
+++ b/backend/open_webui/config.py
@@ -1714,7 +1714,7 @@ Respond to the user query using the provided context, incorporating inline citat
 - Respond in the same language as the user's query.
 - If the context is unreadable or of poor quality, inform the user and provide the best possible answer.
 - If the answer isn't present in the context but you possess the knowledge, explain this to the user and provide the answer using your own understanding.
- **Only include inline citations using [source_id] when a <source_id> tag is explicitly provided in the context.**  
+- **Only include inline citations using [source_id] (e.g., [1], [2]) when a `<source_id>` tag is explicitly provided in the context.**
 - Do not cite if the <source_id> tag is not provided in the context.  
 - Do not use XML tags in your response.
 - Ensure citations are concise and directly related to the information provided.
--- a/backend/open_webui/retrieval/utils.py
+++ b/backend/open_webui/retrieval/utils.py
@@ -14,7 +14,8 @@ from langchain_core.documents import Document

 from open_webui.config import VECTOR_DB
 from open_webui.retrieval.vector.connector import VECTOR_DB_CLIENT
-from open_webui.utils.misc import get_last_user_message
+from open_webui.utils.misc import get_last_user_message, calculate_sha256_string
+
 from open_webui.models.users import UserModel

 from open_webui.env import (
@@ -178,45 +179,31 @@ def merge_and_sort_query_results(
    combined_distances = []
    combined_documents = []
    combined_metadatas = []
-    combined_ids = []

    for data in query_results:
        combined_distances.extend(data["distances"][0])
        combined_documents.extend(data["documents"][0])
        combined_metadatas.extend(data["metadatas"][0])
-        # DISTINCT(chunk_id,file_id) - in case if id (chunk_ids) become ordinals
-        combined_ids.extend(
-            [
-                f"{id}-{meta['file_id']}"
-                for id, meta in zip(data["ids"][0], data["metadatas"][0])
-            ]
-        )

-    # Create a list of tuples (distance, document, metadata, ids)
-    combined = list(
-        zip(combined_distances, combined_documents, combined_metadatas, combined_ids)
-    )
+    # Create a list of tuples (distance, document, metadata)
+    combined = list(zip(combined_distances, combined_documents, combined_metadatas))

    # Sort the list based on distances
    combined.sort(key=lambda x: x[0], reverse=reverse)

-    sorted_distances = []
-    sorted_documents = []
-    sorted_metadatas = []
-    # Otherwise we don't have anything :-(
-    if combined:
+    # We don't have anything :-(
+    if not combined:
+        sorted_distances = []
+        sorted_documents = []
+        sorted_metadatas = []
+    else:
        # Unzip the sorted list
-        all_distances, all_documents, all_metadatas, all_ids = zip(*combined)
-        seen_ids = set()
+        sorted_distances, sorted_documents, sorted_metadatas = zip(*combined)
+
        # Slicing the lists to include only k elements
-        for index, id in enumerate(all_ids):
-            if id not in seen_ids:
-                sorted_distances.append(all_distances[index])
-                sorted_documents.append(all_documents[index])
-                sorted_metadatas.append(all_metadatas[index])
-                seen_ids.add(id)
-                if len(sorted_distances) >= k:
-                    break
+        sorted_distances = list(sorted_distances)[:k]
+        sorted_documents = list(sorted_documents)[:k]
+        sorted_metadatas = list(sorted_metadatas)[:k]

    # Create the output dictionary
    result = {