Allow any file to be used for RAG.

Changed RAG parser to prefer file extensions over MIME content types. If the type of file is not recognized, assume it's a text file.
This commit is contained in:
Marclass 2024-01-18 20:41:14 -07:00
parent 6070e6bcd1
commit aa1d386042
3 changed files with 27 additions and 43 deletions

View File

@ -144,37 +144,21 @@ def store_doc(
# "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm"
print(file.content_type)
if file.content_type not in [
"application/pdf",
"text/plain",
"text/csv",
"text/xml",
"text/x-python",
"text/css",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/octet-stream",
"application/x-javascript",
]:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
)
text_xml=["text/xml"]
text_xml=["xml"]
octet_markdown=["md"]
octet_plain=[
known_source_ext=[
"go", "py", "java", "sh", "bat", "ps1", "cmd", "js",
"css", "cpp", "hpp","h", "c", "cs", "sql", "log", "ini",
"pl" "pm", "r", "dart", "dockerfile", "env", "php", "hs",
"hsc", "lua", "nginxconf", "conf", "m", "mm", "plsql", "perl",
"rb", "rs", "db2", "scala", "bash", "swift", "vue", "svelte"
]
docx_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
known_doc_ext=["doc","docx"]
file_ext=file.filename.split(".")[-1].lower()
if file.content_type == "application/octet-stream" and file_ext not in (octet_markdown + octet_plain):
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
)
known_type=True
try:
filename = file.filename
file_path = f"{UPLOAD_DIR}/{filename}"
@ -188,27 +172,22 @@ def store_doc(
collection_name = calculate_sha256(f)[:63]
f.close()
if file.content_type == "application/pdf":
if file_ext=="pdf":
loader = PyPDFLoader(file_path)
elif (
file.content_type
== "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
):
elif (file.content_type ==docx_type or file_ext in known_doc_ext):
loader = Docx2txtLoader(file_path)
elif file.content_type == "text/csv":
elif file_ext=="csv":
loader = CSVLoader(file_path)
elif file.content_type in text_xml:
elif file_ext in text_xml:
loader=UnstructuredXMLLoader(file_path)
elif file.content_type == "text/plain" or file.content_type.find("text/")>=0:
elif file_ext in known_source_ext or file.content_type.find("text/")>=0:
loader = TextLoader(file_path)
elif file.content_type == "application/octet-stream":
if file_ext in octet_markdown:
loader = UnstructuredMarkdownLoader(file_path)
if file_ext in octet_plain:
loader = TextLoader(file_path)
elif file.content_type == "application/x-javascript":
elif file_ext in octet_markdown:
loader = UnstructuredMarkdownLoader(file_path)
else:
loader = TextLoader(file_path)
known_type=False
data = loader.load()
result = store_data_in_vector_db(data, collection_name)
@ -218,6 +197,7 @@ def store_doc(
"status": True,
"collection_name": collection_name,
"filename": filename,
"known_type":known_type,
}
else:
raise HTTPException(

View File

@ -173,7 +173,8 @@
) {
uploadDoc(file);
} else {
toast.error(`Unsupported File Type '${file['type']}'.`);
toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
uploadDoc(file);
}
} else {
toast.error(`File not found.`);
@ -308,8 +309,9 @@
uploadDoc(file);
filesInputElement.value = '';
} else {
toast.error(`Unsupported File Type '${file['type']}'.`);
inputFiles = null;
toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
uploadDoc(file);
filesInputElement.value = '';
}
} else {
toast.error(`File not found.`);

View File

@ -73,7 +73,8 @@
) {
uploadDoc(file);
} else {
toast.error(`Unsupported File Type '${file['type']}'.`);
toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
uploadDoc(file);
}
} else {
toast.error(`File not found.`);
@ -153,7 +154,8 @@
) {
uploadDoc(file);
} else {
toast.error(`Unsupported File Type '${file['type']}'.`);
toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
uploadDoc(file);
}
inputFiles = null;