From c1ec604f21b1ae7a1eacf8f8fe34bbaf18c04131 Mon Sep 17 00:00:00 2001
From: "Timothy J. Baek" <timothyjrbeck@gmail.com>
Date: Tue, 9 Jan 2024 15:24:53 -0800
Subject: [PATCH] feat: add Markdown (.md) document support to RAG upload

---
 backend/apps/rag/main.py                    | 15 +++++++++++++++
 backend/requirements.txt                    |  1 +
 src/lib/components/chat/MessageInput.svelte |  6 +++++-
 src/lib/constants.ts                        |  1 +
 4 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py
index 0d6cc732e..a4776691f 100644
--- a/backend/apps/rag/main.py
+++ b/backend/apps/rag/main.py
@@ -19,6 +19,8 @@ from langchain_community.document_loaders import (
     PyPDFLoader,
     CSVLoader,
     Docx2txtLoader,
+    UnstructuredWordDocumentLoader,
+    UnstructuredMarkdownLoader,
 )
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import Chroma
@@ -140,17 +142,27 @@ def store_doc(
 ):
     # "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm"
 
     if file.content_type not in [
         "application/pdf",
         "text/plain",
         "text/csv",
         "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        "text/markdown",
+        "application/octet-stream",
     ]:
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
             detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
         )
 
+    # A generic binary type is only accepted when the file name marks Markdown.
+    is_md = (file.filename or "").lower().endswith(".md")
+    if file.content_type == "application/octet-stream" and not is_md:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
+        )
+
     try:
         filename = file.filename
         file_path = f"{UPLOAD_DIR}/{filename}"
@@ -175,6 +187,9 @@ def store_doc(
             loader = TextLoader(file_path)
         elif file.content_type == "text/csv":
             loader = CSVLoader(file_path)
+        elif file.content_type in ("text/markdown", "application/octet-stream"):
+            # Non-Markdown octet-stream uploads were already rejected above.
+            loader = UnstructuredMarkdownLoader(file_path)
 
         data = loader.load()
         result = store_data_in_vector_db(data, collection_name)
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 06af0efc6..829a1ca61 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -22,6 +22,7 @@ chromadb
 sentence_transformers
 pypdf
 docx2txt
+unstructured
 
 PyJWT
 pyjwt[crypto]
diff --git a/src/lib/components/chat/MessageInput.svelte b/src/lib/components/chat/MessageInput.svelte
index 72782da99..4df0ebb12 100644
--- a/src/lib/components/chat/MessageInput.svelte
+++ b/src/lib/components/chat/MessageInput.svelte
@@ -149,9 +149,13 @@
 
 				if (inputFiles && inputFiles.length > 0) {
 					const file = inputFiles[0];
+					const fileExt = file.name.split('.').at(-1);
 					if (['image/gif', 'image/jpeg', 'image/png'].includes(file['type'])) {
 						reader.readAsDataURL(file);
-					} else if (SUPPORTED_FILE_TYPE.includes(file['type'])) {
+					} else if (
+						SUPPORTED_FILE_TYPE.includes(file['type']) ||
+						['md'].includes(fileExt)
+					) {
 						uploadDoc(file);
 					} else {
 						toast.error(`Unsupported File Type '${file['type']}'.`);
diff --git a/src/lib/constants.ts b/src/lib/constants.ts
index 260e675ec..5d77834b3 100644
--- a/src/lib/constants.ts
+++ b/src/lib/constants.ts
@@ -14,6 +14,7 @@ export const REQUIRED_OLLAMA_VERSION = '0.1.16';
 export const SUPPORTED_FILE_TYPE = [
 	'application/pdf',
 	'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+	'text/markdown',
 	'text/plain',
 	'text/csv'
 ];