From c1ec604f21b1ae7a1eacf8f8fe34bbaf18c04131 Mon Sep 17 00:00:00 2001 From: "Timothy J. Baek" Date: Tue, 9 Jan 2024 15:24:53 -0800 Subject: [PATCH] feat: rag md support --- backend/apps/rag/main.py | 15 +++++++++++++++ backend/requirements.txt | 1 + src/lib/components/chat/MessageInput.svelte | 6 +++++- src/lib/constants.ts | 1 + 4 files changed, 22 insertions(+), 1 deletion(-) diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py index 0d6cc732e..a4776691f 100644 --- a/backend/apps/rag/main.py +++ b/backend/apps/rag/main.py @@ -19,6 +19,8 @@ from langchain_community.document_loaders import ( PyPDFLoader, CSVLoader, Docx2txtLoader, + UnstructuredWordDocumentLoader, + UnstructuredMarkdownLoader, ) from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.vectorstores import Chroma @@ -140,17 +142,27 @@ def store_doc( ): # "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm" + print(file.content_type) if file.content_type not in [ "application/pdf", "text/plain", "text/csv", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "application/octet-stream", ]: raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED, ) + if file.content_type == "application/octet-stream" and file.filename.split(".")[ + -1 + ] not in ["md"]: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED, + ) + try: filename = file.filename file_path = f"{UPLOAD_DIR}/{filename}" @@ -175,6 +187,9 @@ def store_doc( loader = TextLoader(file_path) elif file.content_type == "text/csv": loader = CSVLoader(file_path) + elif file.content_type == "application/octet-stream": + if file.filename.split(".")[-1] == "md": + loader = UnstructuredMarkdownLoader(file_path) data = loader.load() result = store_data_in_vector_db(data, collection_name) diff --git a/backend/requirements.txt b/backend/requirements.txt index 06af0efc6..829a1ca61 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -22,6 +22,7 @@ chromadb sentence_transformers pypdf docx2txt +unstructured PyJWT pyjwt[crypto] diff --git a/src/lib/components/chat/MessageInput.svelte b/src/lib/components/chat/MessageInput.svelte index 72782da99..4df0ebb12 100644 --- a/src/lib/components/chat/MessageInput.svelte +++ b/src/lib/components/chat/MessageInput.svelte @@ -149,9 +149,13 @@ if (inputFiles && inputFiles.length > 0) { const file = inputFiles[0]; + console.log(file, file.name.split('.').at(-1)); if (['image/gif', 'image/jpeg', 'image/png'].includes(file['type'])) { reader.readAsDataURL(file); - } else if (SUPPORTED_FILE_TYPE.includes(file['type'])) { + } else if ( + SUPPORTED_FILE_TYPE.includes(file['type']) || + ['md'].includes(file.name.split('.').at(-1)) + ) { uploadDoc(file); } else { toast.error(`Unsupported File Type '${file['type']}'.`); diff --git a/src/lib/constants.ts b/src/lib/constants.ts index 260e675ec..5d77834b3 100644 --- a/src/lib/constants.ts +++ b/src/lib/constants.ts @@ -14,6 +14,7 @@ export const REQUIRED_OLLAMA_VERSION = '0.1.16'; export const SUPPORTED_FILE_TYPE = [ 'application/pdf', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'text/markdown', 'text/plain', 'text/csv' ];