feat: retrieval whole document mode

This commit is contained in:
Timothy J. Baek 2024-09-29 22:52:27 +02:00
parent 1d8b3b8c51
commit 6d764ee55e
5 changed files with 112 additions and 67 deletions

View File

@@ -317,58 +317,63 @@ def get_rag_context(
relevant_contexts = [] relevant_contexts = []
for file in files: for file in files:
context = None if file.get("context") == "full":
context = {
collection_names = ( "documents": [[file["content"]]],
file["collection_names"] "metadatas": [[{"file_id": file["id"], "name": file["name"]}]],
if file["type"] == "collection" }
else [file["collection_name"]] if file["collection_name"] else [] else:
)
collection_names = set(collection_names).difference(extracted_collections)
if not collection_names:
log.debug(f"skipping {file} as it has already been extracted")
continue
try:
context = None context = None
if file["type"] == "text":
context = file["content"] collection_names = (
else: file["collection_names"]
if hybrid_search: if file["type"] == "collection"
try: else [file["collection_name"]] if file["collection_name"] else []
context = query_collection_with_hybrid_search( )
collection_names = set(collection_names).difference(extracted_collections)
if not collection_names:
log.debug(f"skipping {file} as it has already been extracted")
continue
try:
context = None
if file["type"] == "text":
context = file["content"]
else:
if hybrid_search:
try:
context = query_collection_with_hybrid_search(
collection_names=collection_names,
query=query,
embedding_function=embedding_function,
k=k,
reranking_function=reranking_function,
r=r,
)
except Exception as e:
log.debug(
"Error when using hybrid search, using"
" non hybrid search as fallback."
)
if (not hybrid_search) or (context is None):
context = query_collection(
collection_names=collection_names, collection_names=collection_names,
query=query, query=query,
embedding_function=embedding_function, embedding_function=embedding_function,
k=k, k=k,
reranking_function=reranking_function,
r=r,
)
except Exception as e:
log.debug(
"Error when using hybrid search, using"
" non hybrid search as fallback."
) )
except Exception as e:
log.exception(e)
if (not hybrid_search) or (context is None): extracted_collections.extend(collection_names)
context = query_collection(
collection_names=collection_names,
query=query,
embedding_function=embedding_function,
k=k,
)
except Exception as e:
log.exception(e)
if context: if context:
relevant_contexts.append({**context, "source": file}) relevant_contexts.append({**context, "file": file})
extracted_collections.extend(collection_names)
contexts = [] contexts = []
citations = [] citations = []
for context in relevant_contexts: for context in relevant_contexts:
try: try:
if "documents" in context: if "documents" in context:
@@ -381,7 +386,7 @@ def get_rag_context(
if "metadatas" in context: if "metadatas" in context:
citations.append( citations.append(
{ {
"source": context["source"], "source": context["file"],
"document": context["documents"][0], "document": context["documents"][0],
"metadata": context["metadatas"][0], "metadata": context["metadatas"][0],
} }

View File

@@ -36,6 +36,7 @@
<FileItem <FileItem
className="w-full" className="w-full"
{file} {file}
edit={true}
url={`${file?.url}`} url={`${file?.url}`}
name={file.name} name={file.name}
type={file.type} type={file.type}

View File

@@ -459,6 +459,7 @@
size={file?.size} size={file?.size}
status={file.status} status={file.status}
dismissible={true} dismissible={true}
edit={true}
on:dismiss={() => { on:dismiss={() => {
files.splice(fileIdx, 1); files.splice(fileIdx, 1);
files = files; files = files;

View File

@@ -15,7 +15,7 @@
export let status = 'processed'; export let status = 'processed';
export let file = null; export let file = null;
export let enableModal = true; export let edit = false;
export let name: string; export let name: string;
export let type: string; export let type: string;
@@ -25,7 +25,7 @@
</script> </script>
{#if file} {#if file}
<FileItemModal bind:show={showModal} bind:file /> <FileItemModal bind:show={showModal} bind:file {edit} />
{/if} {/if}
<div class="relative group"> <div class="relative group">

View File

@@ -7,57 +7,95 @@
import Modal from './Modal.svelte'; import Modal from './Modal.svelte';
import XMark from '../icons/XMark.svelte'; import XMark from '../icons/XMark.svelte';
import Info from '../icons/Info.svelte'; import Info from '../icons/Info.svelte';
import Switch from './Switch.svelte';
import Tooltip from './Tooltip.svelte';
export let file; export let file;
export let show = false; export let show = false;
export let edit = false;
let enableFullContent = false;
onMount(() => { onMount(() => {
console.log(file); console.log(file);
if (file?.context === 'full') {
enableFullContent = true;
}
}); });
</script> </script>
<Modal bind:show size="md"> <Modal bind:show size="md">
<div class="font-primary px-6 py-5 w-full flex flex-col justify-center dark:text-gray-400"> <div class="font-primary px-6 py-5 w-full flex flex-col justify-center dark:text-gray-400">
<div class="flex items-start justify-between pb-2"> <div class=" pb-2">
<div> <div class="flex items-start justify-between">
<div class=" font-medium text-lg dark:text-gray-100"> <div>
<a <div class=" font-medium text-lg dark:text-gray-100">
href={file.url ? (file.type === 'file' ? `${file.url}/content` : `${file.url}`) : '#'} <a
target="_blank" href={file.url ? (file.type === 'file' ? `${file.url}/content` : `${file.url}`) : '#'}
class="hover:underline line-clamp-1" target="_blank"
> class="hover:underline line-clamp-1"
{file?.name ?? 'File'} >
</a> {file?.name ?? 'File'}
</a>
</div>
</div> </div>
<div> <div>
<div class=" flex text-sm gap-1 text-gray-500"> <button
on:click={() => {
show = false;
}}
>
<XMark />
</button>
</div>
</div>
<div>
<div class="flex flex-col md:flex-row gap-1 justify-between w-full">
<div class=" flex flex-wrap text-sm gap-1 text-gray-500">
{#if file.size} {#if file.size}
<div class="capitalize">{formatFileSize(file.size)}</div> <div class="capitalize shrink-0">{formatFileSize(file.size)}</div>
{/if} {/if}
{#if file.content} {#if file.content}
<div class="capitalize">{getLineCount(file.content)} extracted lines</div> <div class="capitalize shrink-0">{getLineCount(file.content)} extracted lines</div>
<div class="flex items-center gap-1"> <div class="flex items-center gap-1 shrink-0">
<Info /> <Info />
Formatting may be inconsistent from source. Formatting may be inconsistent from source.
</div> </div>
{/if} {/if}
</div> </div>
</div>
</div>
<div> {#if edit}
<button <div>
on:click={() => { <Tooltip
show = false; content={enableFullContent
}} ? 'Inject the entire document as context for comprehensive processing.'
> : 'Default to segmented retrieval for focused and relevant content extraction.'}
<XMark /> >
</button> <div class="flex items-center gap-1.5 text-xs">
{#if enableFullContent}
Use Entire Document
{:else}
Use Focused Retrieval
{/if}
<Switch
bind:state={enableFullContent}
on:change={(e) => {
file.context = e.detail ? 'full' : undefined;
}}
/>
</div>
</Tooltip>
</div>
{/if}
</div>
</div> </div>
</div> </div>