From 784a6ec85e18b9b798fb7292acf9015beae7fada Mon Sep 17 00:00:00 2001 From: Doug Danat Date: Mon, 25 Mar 2024 09:50:53 +0100 Subject: [PATCH] include html langchain loader for RAG --- backend/apps/rag/main.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py index 48ca61666..163f1b0fa 100644 --- a/backend/apps/rag/main.py +++ b/backend/apps/rag/main.py @@ -21,6 +21,7 @@ from langchain_community.document_loaders import ( TextLoader, PyPDFLoader, CSVLoader, + UnstructuredHTMLLoader, Docx2txtLoader, UnstructuredEPubLoader, UnstructuredWordDocumentLoader, @@ -402,6 +403,8 @@ def get_loader(filename: str, file_content_type: str, file_path: str): loader = UnstructuredRSTLoader(file_path, mode="elements") elif file_ext == "xml": loader = UnstructuredXMLLoader(file_path) + elif file_ext in ["htm", "html"]: + loader = UnstructuredHTMLLoader(file_path) elif file_ext == "md": loader = UnstructuredMarkdownLoader(file_path) elif file_content_type == "application/epub+zip":