From 322db31dc9d40a54aeb9f98cd3b818a6ad08b8be Mon Sep 17 00:00:00 2001 From: "Timothy J. Baek" Date: Mon, 20 May 2024 07:22:43 -0700 Subject: [PATCH] fix: rag --- backend/apps/rag/main.py | 6 ++++++ backend/requirements.txt | 1 + src/lib/constants.ts | 4 +++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py index 9a1a0c13e..f08d81a3b 100644 --- a/backend/apps/rag/main.py +++ b/backend/apps/rag/main.py @@ -28,6 +28,7 @@ from langchain_community.document_loaders import ( UnstructuredXMLLoader, UnstructuredRSTLoader, UnstructuredExcelLoader, + UnstructuredPowerPointLoader, YoutubeLoader, ) from langchain.text_splitter import RecursiveCharacterTextSplitter @@ -768,6 +769,11 @@ def get_loader(filename: str, file_content_type: str, file_path: str): "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", ] or file_ext in ["xls", "xlsx"]: loader = UnstructuredExcelLoader(file_path) + elif file_content_type in [ + "application/vnd.ms-powerpoint", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + ] or file_ext in ["ppt", "pptx"]: + loader = UnstructuredPowerPointLoader(file_path) elif file_ext in known_source_ext or ( file_content_type and file_content_type.find("text/") >= 0 ): diff --git a/backend/requirements.txt b/backend/requirements.txt index c8b699447..a82da1966 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -35,6 +35,7 @@ chromadb==0.4.24 sentence-transformers==2.7.0 pypdf==4.2.0 docx2txt==0.8 +python-pptx==0.6.23 unstructured==0.11.8 Markdown==3.6 pypandoc==1.13 diff --git a/src/lib/constants.ts b/src/lib/constants.ts index 62ac2b8c3..a305bea7c 100644 --- a/src/lib/constants.ts +++ b/src/lib/constants.ts @@ -86,7 +86,9 @@ export const SUPPORTED_FILE_EXTENSIONS = [ 'csv', 'txt', 'xls', - 'xlsx' + 'xlsx', + 'pptx', + 'ppt' ]; // Source: https://kit.svelte.dev/docs/modules#$env-static-public