Merge pull request #2923 from mindspawn/outlook-msg

Support Outlook Message File Format
This commit is contained in:
Timothy Jaeryang Baek 2024-06-07 21:50:03 -07:00 committed by GitHub
commit dbde628141
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 17 additions and 2 deletions

View File

@ -9,6 +9,7 @@ from fastapi import (
)
from fastapi.middleware.cors import CORSMiddleware
import os, shutil, logging, re
from datetime import datetime
from pathlib import Path
from typing import List, Union, Sequence
@ -30,6 +31,7 @@ from langchain_community.document_loaders import (
UnstructuredExcelLoader,
UnstructuredPowerPointLoader,
YoutubeLoader,
OutlookMessageLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
@ -879,6 +881,13 @@ def store_docs_in_vector_db(docs, collection_name, overwrite: bool = False) -> b
texts = [doc.page_content for doc in docs]
metadatas = [doc.metadata for doc in docs]
# ChromaDB does not accept datetime objects as metadata values,
# so convert any datetime value to its string representation.
for metadata in metadatas:
for key, value in metadata.items():
if isinstance(value, datetime):
metadata[key] = str(value)
try:
if overwrite:
for collection in CHROMA_CLIENT.list_collections():
@ -965,6 +974,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
"swift",
"vue",
"svelte",
"msg",
]
if file_ext == "pdf":
@ -999,6 +1009,8 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
] or file_ext in ["ppt", "pptx"]:
loader = UnstructuredPowerPointLoader(file_path)
elif file_ext == "msg":
loader = OutlookMessageLoader(file_path)
elif file_ext in known_source_ext or (
file_content_type and file_content_type.find("text/") >= 0
):

View File

@ -57,3 +57,5 @@ black==24.4.2
langfuse==2.33.0
youtube-transcript-api==0.6.2
pytube==15.0.0
extract_msg

View File

@ -89,7 +89,8 @@ export const SUPPORTED_FILE_EXTENSIONS = [
'xls',
'xlsx',
'pptx',
'ppt'
'ppt',
'msg'
];
// Source: https://kit.svelte.dev/docs/modules#$env-static-public