From 8bfda730d9dd99eae2147bcb8207efe09016d165 Mon Sep 17 00:00:00 2001 From: Marclass Date: Tue, 23 Jan 2024 14:03:22 -0700 Subject: [PATCH 1/3] add excel document support --- backend/apps/rag/main.py | 6 ++++++ backend/requirements.txt | 7 ++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py index 4ceae2a80..e6bb02a40 100644 --- a/backend/apps/rag/main.py +++ b/backend/apps/rag/main.py @@ -23,6 +23,7 @@ from langchain_community.document_loaders import ( UnstructuredMarkdownLoader, UnstructuredXMLLoader, UnstructuredRSTLoader, + UnstructuredExcelLoader, ) from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.vectorstores import Chroma @@ -157,6 +158,9 @@ def store_doc( ] docx_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document" known_doc_ext=["doc","docx"] + excel_types=["application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"] + known_excel_ext=["xls", "xlsx"] + file_ext=file.filename.split(".")[-1].lower() known_type=True @@ -179,6 +183,8 @@ def store_doc( loader = Docx2txtLoader(file_path) elif file_ext=="csv": loader = CSVLoader(file_path) + elif (file.content_type in excel_types or file_ext in known_excel_ext): + loader = UnstructuredExcelLoader(file_path) elif file_ext=="rst": loader = UnstructuredRSTLoader(file_path, mode="elements") elif file_ext in text_xml: diff --git a/backend/requirements.txt b/backend/requirements.txt index 76a208249..07ea0ea3f 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -28,4 +28,9 @@ markdown PyJWT pyjwt[crypto] -black \ No newline at end of file +black + +pandas +openpyxl +pyxlsb +xlrd \ No newline at end of file From 1e932d91cb527d00382362ee146007c8dff15ac5 Mon Sep 17 00:00:00 2001 From: Marclass Date: Tue, 23 Jan 2024 14:04:40 -0700 Subject: [PATCH 2/3] Update constants.ts add excel file ext --- src/lib/constants.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib/constants.ts b/src/lib/constants.ts index 5e7363137..b00d62804 100644 --- a/src/lib/constants.ts +++ b/src/lib/constants.ts @@ -30,7 +30,7 @@ export const SUPPORTED_FILE_EXTENSIONS = [ 'pl', 'pm', 'r', 'dart', 'dockerfile', 'env', 'php', 'hs', 'hsc', 'lua', 'nginxconf', 'conf', 'm', 'mm', 'plsql', 'perl', 'rb', 'rs', 'db2', 'scala', 'bash', 'swift', 'vue', 'svelte', - 'doc','docx', 'pdf', 'csv', 'txt' + 'doc','docx', 'pdf', 'csv', 'txt', 'xls', 'xlsx' ]; // Source: https://kit.svelte.dev/docs/modules#$env-static-public From 4e468dc58cd71392d92539230743bde4256598bc Mon Sep 17 00:00:00 2001 From: "Timothy J. Baek" Date: Thu, 25 Jan 2024 00:24:49 -0800 Subject: [PATCH 3/3] refac --- backend/apps/rag/main.py | 125 ++++++++++++++++++++++++++------------- 1 file changed, 83 insertions(+), 42 deletions(-) diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py index ffa73d000..6da870ea7 100644 --- a/backend/apps/rag/main.py +++ b/backend/apps/rag/main.py @@ -138,6 +138,87 @@ def store_web(form_data: StoreWebForm, user=Depends(get_current_user)): ) +def get_loader(file, file_path): + file_ext = file.filename.split(".")[-1].lower() + known_type = True + + known_source_ext = [ + "go", + "py", + "java", + "sh", + "bat", + "ps1", + "cmd", + "js", + "ts", + "css", + "cpp", + "hpp", + "h", + "c", + "cs", + "sql", + "log", + "ini", + "pl", + "pm", + "r", + "dart", + "dockerfile", + "env", + "php", + "hs", + "hsc", + "lua", + "nginxconf", + "conf", + "m", + "mm", + "plsql", + "perl", + "rb", + "rs", + "db2", + "scala", + "bash", + "swift", + "vue", + "svelte", + ] + + if file_ext == "pdf": + loader = PyPDFLoader(file_path) + elif file_ext == "csv": + loader = CSVLoader(file_path) + elif file_ext == "rst": + loader = UnstructuredRSTLoader(file_path, mode="elements") + elif file_ext == "xml": + loader = UnstructuredXMLLoader(file_path) + elif file_ext == "md": + loader = UnstructuredMarkdownLoader(file_path) + elif file.content_type == "application/epub+zip": + loader = UnstructuredEPubLoader(file_path) + elif ( + file.content_type + == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + or file_ext in ["doc", "docx"] + ): + loader = Docx2txtLoader(file_path) + elif file.content_type in [ + "application/vnd.ms-excel", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ] or file_ext in ["xls", "xlsx"]: + loader = UnstructuredExcelLoader(file_path) + elif file_ext in known_source_ext or file.content_type.find("text/") >= 0: + loader = TextLoader(file_path) + else: + loader = TextLoader(file_path) + known_type = False + + return loader, known_type + + @app.post("/doc") def store_doc( collection_name: Optional[str] = Form(None), @@ -147,24 +228,6 @@ def store_doc( # "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm" print(file.content_type) - - text_xml=["xml"] - octet_markdown=["md"] - known_source_ext=[ - "go", "py", "java", "sh", "bat", "ps1", "cmd", "js", "ts", - "css", "cpp", "hpp","h", "c", "cs", "sql", "log", "ini", - "pl", "pm", "r", "dart", "dockerfile", "env", "php", "hs", - "hsc", "lua", "nginxconf", "conf", "m", "mm", "plsql", "perl", - "rb", "rs", "db2", "scala", "bash", "swift", "vue", "svelte" - ] - docx_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document" - known_doc_ext=["doc","docx"] - excel_types=["application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"] - known_excel_ext=["xls", "xlsx"] - - file_ext=file.filename.split(".")[-1].lower() - known_type=True - try: filename = file.filename file_path = f"{UPLOAD_DIR}/{filename}" @@ -178,29 +241,7 @@ def store_doc( collection_name = calculate_sha256(f)[:63] f.close() - if file_ext=="pdf": - loader = PyPDFLoader(file_path) - elif (file.content_type ==docx_type or file_ext in known_doc_ext): - loader = Docx2txtLoader(file_path) - elif file_ext=="csv": - loader = CSVLoader(file_path) - elif (file.content_type in excel_types or file_ext in known_excel_ext): - loader = UnstructuredExcelLoader(file_path) - elif file_ext=="rst": - loader = UnstructuredRSTLoader(file_path, mode="elements") - elif file_ext in text_xml: - loader=UnstructuredXMLLoader(file_path) - elif file_ext in known_source_ext or file.content_type.find("text/")>=0: - loader = TextLoader(file_path) - elif file_ext in octet_markdown: - loader = UnstructuredMarkdownLoader(file_path) - elif file.content_type == "application/epub+zip": - loader = UnstructuredEPubLoader(file_path) - else: - loader = TextLoader(file_path) - known_type=False - - + loader, known_type = get_loader(file, file_path) data = loader.load() result = store_data_in_vector_db(data, collection_name) @@ -209,7 +250,7 @@ def store_doc( "status": True, "collection_name": collection_name, "filename": filename, - "known_type":known_type, + "known_type": known_type, } else: raise HTTPException(