From 4ecc1c06d3ae3f79d1daac556e6743d09787db75 Mon Sep 17 00:00:00 2001 From: mindspawn Date: Fri, 7 Jun 2024 21:18:04 -0700 Subject: [PATCH 1/6] Update main.py --- backend/apps/rag/main.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py index d405ef0b4..9df46051a 100644 --- a/backend/apps/rag/main.py +++ b/backend/apps/rag/main.py @@ -9,6 +9,7 @@ from fastapi import ( ) from fastapi.middleware.cors import CORSMiddleware import os, shutil, logging, re +from datetime import datetime from pathlib import Path from typing import List, Union, Sequence @@ -30,6 +31,7 @@ from langchain_community.document_loaders import ( UnstructuredExcelLoader, UnstructuredPowerPointLoader, YoutubeLoader, + OutlookMessageLoader, ) from langchain.text_splitter import RecursiveCharacterTextSplitter @@ -879,6 +881,13 @@ def store_docs_in_vector_db(docs, collection_name, overwrite: bool = False) -> b texts = [doc.page_content for doc in docs] metadatas = [doc.metadata for doc in docs] + # ChromaDB does not like datetime formats + # for meta-data so convert them to string. + for metadata in metadatas: + for key, value in metadata.items(): + if isinstance(value, datetime): + metadata[key] = str(value) + try: if overwrite: for collection in CHROMA_CLIENT.list_collections(): @@ -965,6 +974,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str): "swift", "vue", "svelte", + "msg" ] if file_ext == "pdf": @@ -999,6 +1009,8 @@ def get_loader(filename: str, file_content_type: str, file_path: str): "application/vnd.openxmlformats-officedocument.presentationml.presentation", ] or file_ext in ["ppt", "pptx"]: loader = UnstructuredPowerPointLoader(file_path) + elif file_ext == "msg": + loader = OutlookMessageLoader(file_path) elif file_ext in known_source_ext or ( file_content_type and file_content_type.find("text/") >= 0 ): From f69bc57fed98b4c573836e2a31c354173fc08108 Mon Sep 17 00:00:00 2001 From: mindspawn Date: Fri, 7 Jun 2024 21:18:35 -0700 Subject: [PATCH 2/6] Update requirements.txt --- backend/requirements.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/requirements.txt b/backend/requirements.txt index 7a3668428..57ce2cccf 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -56,4 +56,6 @@ PyJWT[crypto]==2.8.0 black==24.4.2 langfuse==2.33.0 youtube-transcript-api==0.6.2 -pytube==15.0.0 \ No newline at end of file +pytube==15.0.0 + +extract_msg From cff8534f330b3ed5c81436e438d447fe8e4d4187 Mon Sep 17 00:00:00 2001 From: mindspawn Date: Fri, 7 Jun 2024 21:19:46 -0700 Subject: [PATCH 3/6] Update constants.ts --- src/lib/constants.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lib/constants.ts b/src/lib/constants.ts index 163309802..5fc1e7b19 100644 --- a/src/lib/constants.ts +++ b/src/lib/constants.ts @@ -89,7 +89,8 @@ export const SUPPORTED_FILE_EXTENSIONS = [ 'xls', 'xlsx', 'pptx', - 'ppt' + 'ppt', + 'msg' ]; // Source: https://kit.svelte.dev/docs/modules#$env-static-public From d5a2a8a88023d249b2e7e2fbb2f7b8c4e814bd14 Mon Sep 17 00:00:00 2001 From: mindspawn Date: Fri, 7 Jun 2024 21:20:59 -0700 Subject: [PATCH 4/6] Update Dockerfile --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 2bd05e6ba..498499aa1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -133,7 +133,8 @@ RUN pip3 install uv && \ fi; \ chown -R $UID:$GID /app/backend/data/ - +# Dependency for email message extraction +RUN pip3 install extract_msg # copy embedding weight from build # RUN mkdir -p /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2 From 6f9148ac4ca12830b00c1cc475b9c7be5bfd4960 Mon Sep 17 00:00:00 2001 From: mindspawn Date: Fri, 7 Jun 2024 21:41:30 -0700 Subject: [PATCH 5/6] Update main.py --- backend/apps/rag/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py index 9df46051a..8816321b3 100644 --- a/backend/apps/rag/main.py +++ b/backend/apps/rag/main.py @@ -974,7 +974,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str): "swift", "vue", "svelte", - "msg" + "msg", ] if file_ext == "pdf": From 2412f31ed95fbfa28919ab789b676c7bcfda787a Mon Sep 17 00:00:00 2001 From: mindspawn Date: Fri, 7 Jun 2024 21:47:50 -0700 Subject: [PATCH 6/6] Update Dockerfile to revert duplicated pip install --- Dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 498499aa1..2bd05e6ba 100644 --- a/Dockerfile +++ b/Dockerfile @@ -133,8 +133,7 @@ RUN pip3 install uv && \ fi; \ chown -R $UID:$GID /app/backend/data/ -# Dependency for email message extraction -RUN pip3 install extract_msg + # copy embedding weight from build # RUN mkdir -p /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2