refac: documents file handling

This commit is contained in:
Timothy J. Baek 2024-07-15 13:05:38 +02:00
parent 5e8a74ef74
commit dbc352f01b
4 changed files with 49 additions and 12 deletions

View File

@ -930,7 +930,9 @@ def store_web_search(form_data: SearchForm, user=Depends(get_verified_user)):
) )
def store_data_in_vector_db(data, collection_name, overwrite: bool = False) -> bool: def store_data_in_vector_db(
data, collection_name, metadata: Optional[dict] = None, overwrite: bool = False
) -> bool:
text_splitter = RecursiveCharacterTextSplitter( text_splitter = RecursiveCharacterTextSplitter(
chunk_size=app.state.config.CHUNK_SIZE, chunk_size=app.state.config.CHUNK_SIZE,
@ -942,7 +944,7 @@ def store_data_in_vector_db(data, collection_name, overwrite: bool = False) -> b
if len(docs) > 0: if len(docs) > 0:
log.info(f"store_data_in_vector_db {docs}") log.info(f"store_data_in_vector_db {docs}")
return store_docs_in_vector_db(docs, collection_name, overwrite), None return store_docs_in_vector_db(docs, collection_name, metadata, overwrite), None
else: else:
raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT) raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT)
@ -956,14 +958,16 @@ def store_text_in_vector_db(
add_start_index=True, add_start_index=True,
) )
docs = text_splitter.create_documents([text], metadatas=[metadata]) docs = text_splitter.create_documents([text], metadatas=[metadata])
return store_docs_in_vector_db(docs, collection_name, overwrite) return store_docs_in_vector_db(docs, collection_name, overwrite=overwrite)
def store_docs_in_vector_db(docs, collection_name, overwrite: bool = False) -> bool: def store_docs_in_vector_db(
docs, collection_name, metadata: Optional[dict] = None, overwrite: bool = False
) -> bool:
log.info(f"store_docs_in_vector_db {docs} {collection_name}") log.info(f"store_docs_in_vector_db {docs} {collection_name}")
texts = [doc.page_content for doc in docs] texts = [doc.page_content for doc in docs]
metadatas = [doc.metadata for doc in docs] metadatas = [{**doc.metadata, **(metadata if metadata else {})} for doc in docs]
# ChromaDB does not like datetime formats # ChromaDB does not like datetime formats
# for meta-data so convert them to string. # for meta-data so convert them to string.
@ -1237,13 +1241,21 @@ def process_doc(
data = loader.load() data = loader.load()
try: try:
result = store_data_in_vector_db(data, collection_name) result = store_data_in_vector_db(
data,
collection_name,
{
"file_id": form_data.file_id,
"name": file.meta.get("name", file.filename),
},
)
if result: if result:
return { return {
"status": True, "status": True,
"collection_name": collection_name, "collection_name": collection_name,
"known_type": known_type, "known_type": known_type,
"filename": file.meta.get("name", file.filename),
} }
except Exception as e: except Exception as e:
raise HTTPException( raise HTTPException(

View File

@ -58,6 +58,7 @@ def upload_file(file: UploadFile = File(...), user=Depends(get_verified_user)):
# replace filename with uuid # replace filename with uuid
id = str(uuid.uuid4()) id = str(uuid.uuid4())
name = filename
filename = f"{id}_{filename}" filename = f"{id}_{filename}"
file_path = f"{UPLOAD_DIR}/{filename}" file_path = f"{UPLOAD_DIR}/{filename}"
@ -73,6 +74,7 @@ def upload_file(file: UploadFile = File(...), user=Depends(get_verified_user)):
"id": id, "id": id,
"filename": filename, "filename": filename,
"meta": { "meta": {
"name": name,
"content_type": file.content_type, "content_type": file.content_type,
"size": len(contents), "size": len(contents),
"path": file_path, "path": file_path,

View File

@ -57,12 +57,12 @@
{#if document.source?.name} {#if document.source?.name}
<div class="text-sm dark:text-gray-400"> <div class="text-sm dark:text-gray-400">
<a <a
href={document?.source?.url href={document?.metadata?.file_id
? `${document?.source?.url}/content` ? `/api/v1/files/${document?.metadata?.file_id}/content`
: document.source.name} : document.source.name}
target="_blank" target="_blank"
> >
{document.source.name} {document?.metadata?.name ?? document.source.name}
</a> </a>
</div> </div>
{:else} {:else}

View File

@ -8,14 +8,16 @@
import { createNewDoc, deleteDocByName, getDocs } from '$lib/apis/documents'; import { createNewDoc, deleteDocByName, getDocs } from '$lib/apis/documents';
import { SUPPORTED_FILE_TYPE, SUPPORTED_FILE_EXTENSIONS } from '$lib/constants'; import { SUPPORTED_FILE_TYPE, SUPPORTED_FILE_EXTENSIONS } from '$lib/constants';
import { uploadDocToVectorDB } from '$lib/apis/rag'; import { processDocToVectorDB, uploadDocToVectorDB } from '$lib/apis/rag';
import { transformFileName } from '$lib/utils'; import { blobToFile, transformFileName } from '$lib/utils';
import Checkbox from '$lib/components/common/Checkbox.svelte'; import Checkbox from '$lib/components/common/Checkbox.svelte';
import EditDocModal from '$lib/components/documents/EditDocModal.svelte'; import EditDocModal from '$lib/components/documents/EditDocModal.svelte';
import AddFilesPlaceholder from '$lib/components/AddFilesPlaceholder.svelte'; import AddFilesPlaceholder from '$lib/components/AddFilesPlaceholder.svelte';
import AddDocModal from '$lib/components/documents/AddDocModal.svelte'; import AddDocModal from '$lib/components/documents/AddDocModal.svelte';
import { transcribeAudio } from '$lib/apis/audio';
import { uploadFile } from '$lib/apis/files';
const i18n = getContext('i18n'); const i18n = getContext('i18n');
@ -50,7 +52,28 @@
}; };
const uploadDoc = async (file) => { const uploadDoc = async (file) => {
const res = await uploadDocToVectorDB(localStorage.token, '', file).catch((error) => { console.log(file);
// Check if the file is an audio file and transcribe/convert it to text file
if (['audio/mpeg', 'audio/wav'].includes(file['type'])) {
const transcribeRes = await transcribeAudio(localStorage.token, file).catch((error) => {
toast.error(error);
return null;
});
if (transcribeRes) {
console.log(transcribeRes);
const blob = new Blob([transcribeRes.text], { type: 'text/plain' });
file = blobToFile(blob, `${file.name}.txt`);
}
}
// Upload the file to the server
const uploadedFile = await uploadFile(localStorage.token, file).catch((error) => {
toast.error(error);
return null;
});
const res = await processDocToVectorDB(localStorage.token, uploadedFile.id).catch((error) => {
toast.error(error); toast.error(error);
return null; return null;
}); });