feat: retrieval whole document mode

This commit is contained in:
Timothy J. Baek 2024-09-29 22:52:27 +02:00
parent 1d8b3b8c51
commit 6d764ee55e
5 changed files with 112 additions and 67 deletions

View File

@ -317,58 +317,63 @@ def get_rag_context(
relevant_contexts = []
for file in files:
context = None
collection_names = (
file["collection_names"]
if file["type"] == "collection"
else [file["collection_name"]] if file["collection_name"] else []
)
collection_names = set(collection_names).difference(extracted_collections)
if not collection_names:
log.debug(f"skipping {file} as it has already been extracted")
continue
try:
if file.get("context") == "full":
context = {
"documents": [[file["content"]]],
"metadatas": [[{"file_id": file["id"], "name": file["name"]}]],
}
else:
context = None
if file["type"] == "text":
context = file["content"]
else:
if hybrid_search:
try:
context = query_collection_with_hybrid_search(
collection_names = (
file["collection_names"]
if file["type"] == "collection"
else [file["collection_name"]] if file["collection_name"] else []
)
collection_names = set(collection_names).difference(extracted_collections)
if not collection_names:
log.debug(f"skipping {file} as it has already been extracted")
continue
try:
context = None
if file["type"] == "text":
context = file["content"]
else:
if hybrid_search:
try:
context = query_collection_with_hybrid_search(
collection_names=collection_names,
query=query,
embedding_function=embedding_function,
k=k,
reranking_function=reranking_function,
r=r,
)
except Exception as e:
log.debug(
"Error when using hybrid search, using"
" non hybrid search as fallback."
)
if (not hybrid_search) or (context is None):
context = query_collection(
collection_names=collection_names,
query=query,
embedding_function=embedding_function,
k=k,
reranking_function=reranking_function,
r=r,
)
except Exception as e:
log.debug(
"Error when using hybrid search, using"
" non hybrid search as fallback."
)
except Exception as e:
log.exception(e)
if (not hybrid_search) or (context is None):
context = query_collection(
collection_names=collection_names,
query=query,
embedding_function=embedding_function,
k=k,
)
except Exception as e:
log.exception(e)
extracted_collections.extend(collection_names)
if context:
relevant_contexts.append({**context, "source": file})
extracted_collections.extend(collection_names)
relevant_contexts.append({**context, "file": file})
contexts = []
citations = []
for context in relevant_contexts:
try:
if "documents" in context:
@ -381,7 +386,7 @@ def get_rag_context(
if "metadatas" in context:
citations.append(
{
"source": context["source"],
"source": context["file"],
"document": context["documents"][0],
"metadata": context["metadatas"][0],
}

View File

@ -36,6 +36,7 @@
<FileItem
className="w-full"
{file}
edit={true}
url={`${file?.url}`}
name={file.name}
type={file.type}

View File

@ -459,6 +459,7 @@
size={file?.size}
status={file.status}
dismissible={true}
edit={true}
on:dismiss={() => {
files.splice(fileIdx, 1);
files = files;

View File

@ -15,7 +15,7 @@
export let status = 'processed';
export let file = null;
export let enableModal = true;
export let edit = false;
export let name: string;
export let type: string;
@ -25,7 +25,7 @@
</script>
{#if file}
<FileItemModal bind:show={showModal} bind:file />
<FileItemModal bind:show={showModal} bind:file {edit} />
{/if}
<div class="relative group">

View File

@ -7,57 +7,95 @@
import Modal from './Modal.svelte';
import XMark from '../icons/XMark.svelte';
import Info from '../icons/Info.svelte';
import Switch from './Switch.svelte';
import Tooltip from './Tooltip.svelte';
export let file;
export let show = false;
export let edit = false;
let enableFullContent = false;
onMount(() => {
console.log(file);
if (file?.context === 'full') {
enableFullContent = true;
}
});
</script>
<Modal bind:show size="md">
<div class="font-primary px-6 py-5 w-full flex flex-col justify-center dark:text-gray-400">
<div class="flex items-start justify-between pb-2">
<div>
<div class=" font-medium text-lg dark:text-gray-100">
<a
href={file.url ? (file.type === 'file' ? `${file.url}/content` : `${file.url}`) : '#'}
target="_blank"
class="hover:underline line-clamp-1"
>
{file?.name ?? 'File'}
</a>
<div class=" pb-2">
<div class="flex items-start justify-between">
<div>
<div class=" font-medium text-lg dark:text-gray-100">
<a
href={file.url ? (file.type === 'file' ? `${file.url}/content` : `${file.url}`) : '#'}
target="_blank"
class="hover:underline line-clamp-1"
>
{file?.name ?? 'File'}
</a>
</div>
</div>
<div>
<div class=" flex text-sm gap-1 text-gray-500">
<button
on:click={() => {
show = false;
}}
>
<XMark />
</button>
</div>
</div>
<div>
<div class="flex flex-col md:flex-row gap-1 justify-between w-full">
<div class=" flex flex-wrap text-sm gap-1 text-gray-500">
{#if file.size}
<div class="capitalize">{formatFileSize(file.size)}</div>
<div class="capitalize shrink-0">{formatFileSize(file.size)}</div>
{/if}
{#if file.content}
<div class="capitalize">{getLineCount(file.content)} extracted lines</div>
<div class="capitalize shrink-0">{getLineCount(file.content)} extracted lines</div>
<div class="flex items-center gap-1">
<div class="flex items-center gap-1 shrink-0">
<Info />
Formatting may be inconsistent from source.
</div>
{/if}
</div>
</div>
</div>
<div>
<button
on:click={() => {
show = false;
}}
>
<XMark />
</button>
{#if edit}
<div>
<Tooltip
content={enableFullContent
? 'Inject the entire document as context for comprehensive processing.'
: 'Default to segmented retrieval for focused and relevant content extraction.'}
>
<div class="flex items-center gap-1.5 text-xs">
{#if enableFullContent}
Use Entire Document
{:else}
Use Focused Retrieval
{/if}
<Switch
bind:state={enableFullContent}
on:change={(e) => {
file.context = e.detail ? 'full' : undefined;
}}
/>
</div>
</Tooltip>
</div>
{/if}
</div>
</div>
</div>