refac: process docs dir

This commit is contained in:
Timothy J. Baek 2024-10-04 17:22:00 -07:00
parent 9ad5ffb8c1
commit a6c797d4c2
5 changed files with 79 additions and 95 deletions

View File

@ -44,7 +44,6 @@ from open_webui.apps.retrieval.utils import (
query_doc_with_hybrid_search,
)
from open_webui.apps.webui.models.documents import DocumentForm, Documents
from open_webui.apps.webui.models.files import Files
from open_webui.config import (
BRAVE_SEARCH_API_KEY,
@ -1100,68 +1099,6 @@ def process_web_search(form_data: SearchForm, user=Depends(get_verified_user)):
)
@app.get("/process/dir")
def process_docs_dir(user=Depends(get_admin_user)):
    """Walk DOCS_DIR, embed every visible file and register it as a document.

    For each regular file whose name does not start with ".", the file is
    loaded through ``Loader``, its chunks are saved to the vector DB under a
    collection named after the first 63 characters of the file's SHA-256
    digest, and a ``Documents`` row is inserted when none exists yet for the
    sanitized filename.

    Failures are logged per file and never abort the scan.

    Args:
        user: Admin user resolved by the ``get_admin_user`` dependency; its
            ``id`` is recorded as the owner of newly inserted documents.

    Returns:
        Always ``True``.
    """
    for entry in Path(DOCS_DIR).rglob("./**/*"):
        try:
            # Skip directories and dot-files.
            if not entry.is_file() or entry.name.startswith("."):
                continue

            # NOTE(review): presumably the folder names below the docs root,
            # used as tags -- confirm against extract_folders_after_data_docs.
            tags = extract_folders_after_data_docs(entry)
            filename = entry.name
            # guess_type returns (type, encoding); only the type is needed.
            mime_type, _ = mimetypes.guess_type(entry)

            with open(entry, "rb") as handle:
                collection_name = calculate_sha256(handle)[:63]

            loader = Loader(
                engine=app.state.config.CONTENT_EXTRACTION_ENGINE,
                TIKA_SERVER_URL=app.state.config.TIKA_SERVER_URL,
                PDF_EXTRACT_IMAGES=app.state.config.PDF_EXTRACT_IMAGES,
            )
            docs = loader.load(filename, mime_type, str(entry))

            try:
                if not save_docs_to_vector_db(docs, collection_name):
                    continue

                sanitized_filename = sanitize_filename(filename)
                if Documents.get_doc_by_name(sanitized_filename) is not None:
                    # Already registered; nothing more to do for this file.
                    continue

                if len(tags):
                    content = json.dumps({"tags": [{"name": tag} for tag in tags]})
                else:
                    content = "{}"

                Documents.insert_new_doc(
                    user.id,
                    DocumentForm(
                        name=sanitized_filename,
                        title=filename,
                        collection_name=collection_name,
                        filename=filename,
                        content=content,
                    ),
                )
            except Exception as e:
                log.exception(e)
        except Exception as e:
            log.exception(e)

    return True
class QueryDocForm(BaseModel):
collection_name: str
query: str

View File

@ -5,17 +5,21 @@ import uuid
from pathlib import Path
from typing import Optional
from pydantic import BaseModel
import mimetypes
from open_webui.apps.webui.models.files import FileForm, FileModel, Files
from open_webui.apps.retrieval.main import process_file, ProcessFileForm
from open_webui.config import UPLOAD_DIR
from open_webui.constants import ERROR_MESSAGES
from open_webui.config import UPLOAD_DIR, DOCS_DIR
from open_webui.env import SRC_LOG_LEVELS
from open_webui.constants import ERROR_MESSAGES
from fastapi import APIRouter, Depends, File, HTTPException, UploadFile, status
from fastapi.responses import FileResponse, StreamingResponse
from open_webui.utils.utils import get_admin_user, get_verified_user
log = logging.getLogger(__name__)
@ -86,6 +90,51 @@ def upload_file(file: UploadFile = File(...), user=Depends(get_verified_user)):
)
@router.post("/upload/dir")
def upload_dir(user=Depends(get_admin_user)):
    """Register every visible file under DOCS_DIR as a file record and process it.

    Each regular file whose name does not start with "." gets a fresh UUID,
    is inserted through ``Files.insert_new_file`` with metadata pointing at
    its on-disk location, and is then handed to ``process_file``.

    Failures are logged per file and never abort the scan.

    Args:
        user: Admin user resolved by the ``get_admin_user`` dependency; its
            ``id`` is recorded as the owner of the inserted file records.

    Returns:
        Always ``True``.
    """
    for path in Path(DOCS_DIR).rglob("./**/*"):
        if path.is_file() and not path.name.startswith("."):
            try:
                log.debug(f"Processing file from path: {path}")

                filename = path.name
                # guess_type returns a (type, encoding) tuple; only the MIME
                # type string (or None when unknown) belongs in the metadata.
                content_type, _ = mimetypes.guess_type(path)

                # Record id for this file; the original filename is kept as-is.
                file_id = str(uuid.uuid4())

                file = Files.insert_new_file(
                    user.id,
                    FileForm(
                        **{
                            "id": file_id,
                            "filename": filename,
                            "meta": {
                                "name": filename,
                                "content_type": content_type,
                                # Ask the filesystem for the size instead of
                                # reading the whole file into memory.
                                "size": path.stat().st_size,
                                "path": str(path),
                            },
                        }
                    ),
                )

                try:
                    process_file(ProcessFileForm(file_id=file_id))
                    log.debug(f"File processed: {path}, {file.id}")
                except Exception as e:
                    log.exception(e)
                    log.error(f"Error processing file: {file.id}")
            except Exception as e:
                log.exception(e)

    return True
############################
# List Files
############################

View File

@ -30,6 +30,32 @@ export const uploadFile = async (token: string, file: File) => {
return res;
};
/**
 * Sends a POST to `/files/upload/dir` on the WebUI API.
 *
 * @param token - Bearer token sent in the `authorization` header.
 * @returns The parsed JSON body on success, or `null` when the request failed
 *   without a usable `detail` value.
 * @throws The `detail` field of the failure when it is truthy.
 */
export const uploadDir = async (token: string) => {
	try {
		const response = await fetch(`${WEBUI_API_BASE_URL}/files/upload/dir`, {
			method: 'POST',
			headers: {
				Accept: 'application/json',
				authorization: `Bearer ${token}`
			}
		});

		// The body is parsed in both cases: it is the result on success and
		// the error payload otherwise.
		const body = await response.json();
		if (!response.ok) throw body;
		return body;
	} catch (err: unknown) {
		// Only failures carrying a truthy `detail` are surfaced to the caller.
		const detail = (err as { detail?: unknown }).detail;
		if (detail) {
			throw detail;
		}
		return null;
	}
};
export const getFiles = async (token: string = '') => {
let error = null;

View File

@ -342,32 +342,6 @@ export const processFile = async (
return res;
};
/**
 * Sends a GET to `/process/dir` on the retrieval API.
 *
 * @param token - Bearer token sent in the `authorization` header.
 * @returns The parsed JSON body on success, or `null` when the request failed
 *   without a usable `detail` value.
 * @throws The `detail` field of the failure when it is truthy.
 */
export const processDocsDir = async (token: string) => {
	const endpoint = `${RETRIEVAL_API_BASE_URL}/process/dir`;
	const headers = {
		Accept: 'application/json',
		authorization: `Bearer ${token}`
	};

	try {
		const response = await fetch(endpoint, { method: 'GET', headers });

		// Parse once: the body is either the result or the error payload.
		const payload = await response.json();
		if (response.ok) {
			return payload;
		}
		throw payload;
	} catch (failure: unknown) {
		// Failures without a truthy `detail` resolve to null instead of throwing.
		const reason = (failure as { detail?: unknown }).detail;
		if (reason) {
			throw reason;
		}
		return null;
	}
};
export const processYoutubeVideo = async (token: string, url: string) => {
let error = null;

View File

@ -7,7 +7,6 @@
import {
getQuerySettings,
processDocsDir,
updateQuerySettings,
resetVectorDB,
getEmbeddingConfig,
@ -21,7 +20,7 @@
import { knowledge, models } from '$lib/stores';
import { getKnowledgeItems } from '$lib/apis/knowledge';
import { deleteAllFiles, deleteFileById } from '$lib/apis/files';
import { uploadDir, deleteAllFiles, deleteFileById } from '$lib/apis/files';
import ResetUploadDirConfirmDialog from '$lib/components/common/ConfirmDialog.svelte';
import ResetVectorDBConfirmDialog from '$lib/components/common/ConfirmDialog.svelte';
@ -65,11 +64,10 @@
// Triggers a scan via uploadDir and, when it reports success, refreshes the
// knowledge store and shows a confirmation toast.
const scanHandler = async () => {
	scanDirLoading = true;

	let res;
	try {
		res = await uploadDir(localStorage.token);
	} finally {
		// Clear the loading flag even when the request throws, so the UI
		// does not stay stuck in the "scanning" state.
		scanDirLoading = false;
	}

	if (res) {
		await knowledge.set(await getKnowledgeItems(localStorage.token));
		toast.success($i18n.t('Scan complete!'));
	}
};