diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py index 4ceae2a80..e6bb02a40 100644 --- a/backend/apps/rag/main.py +++ b/backend/apps/rag/main.py @@ -23,6 +23,7 @@ from langchain_community.document_loaders import ( UnstructuredMarkdownLoader, UnstructuredXMLLoader, UnstructuredRSTLoader, + UnstructuredExcelLoader, ) from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.vectorstores import Chroma @@ -157,6 +158,9 @@ def store_doc( ] docx_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document" known_doc_ext=["doc","docx"] + excel_types=["application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"] + known_excel_ext=["xls", "xlsx"] + file_ext=file.filename.split(".")[-1].lower() known_type=True @@ -179,6 +183,8 @@ def store_doc( loader = Docx2txtLoader(file_path) elif file_ext=="csv": loader = CSVLoader(file_path) + elif (file.content_type in excel_types or file_ext in known_excel_ext): + loader = UnstructuredExcelLoader(file_path) elif file_ext=="rst": loader = UnstructuredRSTLoader(file_path, mode="elements") elif file_ext in text_xml: diff --git a/backend/requirements.txt b/backend/requirements.txt index 76a208249..07ea0ea3f 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -28,4 +28,9 @@ markdown PyJWT pyjwt[crypto] -black \ No newline at end of file +black + +pandas +openpyxl +pyxlsb +xlrd \ No newline at end of file