From 6d764ee55ef6c685fd64b089200de76a49844970 Mon Sep 17 00:00:00 2001 From: "Timothy J. Baek" Date: Sun, 29 Sep 2024 22:52:27 +0200 Subject: [PATCH] feat: retrieval whole document mode --- backend/open_webui/apps/retrieval/utils.py | 87 ++++++++++--------- .../components/chat/Controls/Controls.svelte | 1 + src/lib/components/chat/MessageInput.svelte | 1 + src/lib/components/common/FileItem.svelte | 4 +- .../components/common/FileItemModal.svelte | 86 +++++++++++++----- 5 files changed, 112 insertions(+), 67 deletions(-) diff --git a/backend/open_webui/apps/retrieval/utils.py b/backend/open_webui/apps/retrieval/utils.py index 1fa30e6a0..6b12f76a1 100644 --- a/backend/open_webui/apps/retrieval/utils.py +++ b/backend/open_webui/apps/retrieval/utils.py @@ -317,58 +317,63 @@ def get_rag_context( relevant_contexts = [] for file in files: - context = None - - collection_names = ( - file["collection_names"] - if file["type"] == "collection" - else [file["collection_name"]] if file["collection_name"] else [] - ) - - collection_names = set(collection_names).difference(extracted_collections) - if not collection_names: - log.debug(f"skipping {file} as it has already been extracted") - continue - - try: + if file.get("context") == "full": + context = { + "documents": [[file["content"]]], + "metadatas": [[{"file_id": file["id"], "name": file["name"]}]], + } + else: context = None - if file["type"] == "text": - context = file["content"] - else: - if hybrid_search: - try: - context = query_collection_with_hybrid_search( + + collection_names = ( + file["collection_names"] + if file["type"] == "collection" + else [file["collection_name"]] if file["collection_name"] else [] + ) + + collection_names = set(collection_names).difference(extracted_collections) + if not collection_names: + log.debug(f"skipping {file} as it has already been extracted") + continue + + try: + context = None + if file["type"] == "text": + context = file["content"] + else: + if hybrid_search: + try: + context = query_collection_with_hybrid_search( + collection_names=collection_names, + query=query, + embedding_function=embedding_function, + k=k, + reranking_function=reranking_function, + r=r, + ) + except Exception as e: + log.debug( + "Error when using hybrid search, using" + " non hybrid search as fallback." + ) + + if (not hybrid_search) or (context is None): + context = query_collection( collection_names=collection_names, query=query, embedding_function=embedding_function, k=k, - reranking_function=reranking_function, - r=r, - ) - except Exception as e: - log.debug( - "Error when using hybrid search, using" - " non hybrid search as fallback." ) + except Exception as e: + log.exception(e) - if (not hybrid_search) or (context is None): - context = query_collection( - collection_names=collection_names, - query=query, - embedding_function=embedding_function, - k=k, - ) - except Exception as e: - log.exception(e) + extracted_collections.extend(collection_names) if context: - relevant_contexts.append({**context, "source": file}) - - extracted_collections.extend(collection_names) + relevant_contexts.append({**context, "file": file}) contexts = [] citations = [] - for context in relevant_contexts: try: if "documents" in context: @@ -381,7 +386,7 @@ def get_rag_context( if "metadatas" in context: citations.append( { - "source": context["source"], + "source": context["file"], "document": context["documents"][0], "metadata": context["metadatas"][0], } diff --git a/src/lib/components/chat/Controls/Controls.svelte b/src/lib/components/chat/Controls/Controls.svelte index d1246266f..f5807b9b8 100644 --- a/src/lib/components/chat/Controls/Controls.svelte +++ b/src/lib/components/chat/Controls/Controls.svelte @@ -36,6 +36,7 @@ { files.splice(fileIdx, 1); files = files; diff --git a/src/lib/components/common/FileItem.svelte b/src/lib/components/common/FileItem.svelte index 2dea203d5..51fb44f2b 100644 --- a/src/lib/components/common/FileItem.svelte +++ b/src/lib/components/common/FileItem.svelte @@ -15,7 +15,7 @@ export let status = 'processed'; export let file = null; - export let enableModal = true; + export let edit = false; export let name: string; export let type: string; @@ -25,7 +25,7 @@ {#if file} - + {/if}
diff --git a/src/lib/components/common/FileItemModal.svelte b/src/lib/components/common/FileItemModal.svelte index c124a45c7..70755885a 100644 --- a/src/lib/components/common/FileItemModal.svelte +++ b/src/lib/components/common/FileItemModal.svelte @@ -7,57 +7,95 @@ import Modal from './Modal.svelte'; import XMark from '../icons/XMark.svelte'; import Info from '../icons/Info.svelte'; + import Switch from './Switch.svelte'; + import Tooltip from './Tooltip.svelte'; export let file; export let show = false; + export let edit = false; + + let enableFullContent = false; + onMount(() => { console.log(file); + + if (file?.context === 'full') { + enableFullContent = true; + } });
-
-
-
- - {file?.name ?? 'File'} - +
+
+
-
+ +
+
+ +
+
+
{#if file.size} -
{formatFileSize(file.size)}
+
{formatFileSize(file.size)}
• {/if} {#if file.content} -
{getLineCount(file.content)} extracted lines
+
{getLineCount(file.content)} extracted lines
-
+
Formatting may be inconsistent from source.
{/if}
-
-
-
- + {#if edit} +
+ +
+ {#if enableFullContent} + Use Entire Document + {:else} + Use Focused Retrieval + {/if} + { + file.context = e.detail ? 'full' : undefined; + }} + /> +
+
+
+ {/if} +