refac: process docs dir

This commit is contained in:
Timothy J. Baek 2024-10-04 17:22:00 -07:00
parent 9ad5ffb8c1
commit a6c797d4c2
5 changed files with 79 additions and 95 deletions
backend/open_webui/apps
retrieval
webui/routers
src/lib
apis
files
retrieval
components/admin/Settings

View File

@ -44,7 +44,6 @@ from open_webui.apps.retrieval.utils import (
query_doc_with_hybrid_search, query_doc_with_hybrid_search,
) )
from open_webui.apps.webui.models.documents import DocumentForm, Documents
from open_webui.apps.webui.models.files import Files from open_webui.apps.webui.models.files import Files
from open_webui.config import ( from open_webui.config import (
BRAVE_SEARCH_API_KEY, BRAVE_SEARCH_API_KEY,
@ -1100,68 +1099,6 @@ def process_web_search(form_data: SearchForm, user=Depends(get_verified_user)):
) )
@app.get("/process/dir")
def process_docs_dir(user=Depends(get_admin_user)):
    """Index every visible file under ``DOCS_DIR`` into the vector database.

    For each regular file whose name does not start with a dot, the file is
    loaded through ``Loader``, saved with ``save_docs_to_vector_db`` under a
    collection named after the first 63 characters of ``calculate_sha256`` of
    the file handle, and a ``Documents`` row is inserted when none exists yet
    for the sanitized filename. Failures are logged and never abort the scan.

    Args:
        user: Resolved by the ``get_admin_user`` dependency; its ``id`` is
            passed to ``Documents.insert_new_doc``.

    Returns:
        Always ``True`` once the walk has finished.
    """
    for path in Path(DOCS_DIR).rglob("./**/*"):
        try:
            # Skip directories and dot-files.
            if not path.is_file() or path.name.startswith("."):
                continue

            tags = extract_folders_after_data_docs(path)
            filename = path.name
            # guess_type returns a (type, encoding) pair; only the type is used.
            mime_type, _ = mimetypes.guess_type(path)

            with open(path, "rb") as f:
                collection_name = calculate_sha256(f)[:63]

            loader = Loader(
                engine=app.state.config.CONTENT_EXTRACTION_ENGINE,
                TIKA_SERVER_URL=app.state.config.TIKA_SERVER_URL,
                PDF_EXTRACT_IMAGES=app.state.config.PDF_EXTRACT_IMAGES,
            )
            docs = loader.load(filename, mime_type, str(path))

            try:
                if save_docs_to_vector_db(docs, collection_name):
                    sanitized_filename = sanitize_filename(filename)

                    # Only create the document record the first time it is seen.
                    if Documents.get_doc_by_name(sanitized_filename) is None:
                        if len(tags):
                            content = json.dumps(
                                {"tags": [{"name": name} for name in tags]}
                            )
                        else:
                            content = "{}"

                        form_fields = {
                            "name": sanitized_filename,
                            "title": filename,
                            "collection_name": collection_name,
                            "filename": filename,
                            "content": content,
                        }
                        Documents.insert_new_doc(
                            user.id, DocumentForm(**form_fields)
                        )
            except Exception as e:
                log.exception(e)
        except Exception as e:
            log.exception(e)

    return True
class QueryDocForm(BaseModel): class QueryDocForm(BaseModel):
collection_name: str collection_name: str
query: str query: str

View File

@ -5,17 +5,21 @@ import uuid
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
from pydantic import BaseModel from pydantic import BaseModel
import mimetypes
from open_webui.apps.webui.models.files import FileForm, FileModel, Files from open_webui.apps.webui.models.files import FileForm, FileModel, Files
from open_webui.apps.retrieval.main import process_file, ProcessFileForm from open_webui.apps.retrieval.main import process_file, ProcessFileForm
from open_webui.config import UPLOAD_DIR from open_webui.config import UPLOAD_DIR, DOCS_DIR
from open_webui.constants import ERROR_MESSAGES
from open_webui.env import SRC_LOG_LEVELS from open_webui.env import SRC_LOG_LEVELS
from open_webui.constants import ERROR_MESSAGES
from fastapi import APIRouter, Depends, File, HTTPException, UploadFile, status from fastapi import APIRouter, Depends, File, HTTPException, UploadFile, status
from fastapi.responses import FileResponse, StreamingResponse from fastapi.responses import FileResponse, StreamingResponse
from open_webui.utils.utils import get_admin_user, get_verified_user from open_webui.utils.utils import get_admin_user, get_verified_user
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -86,6 +90,51 @@ def upload_file(file: UploadFile = File(...), user=Depends(get_verified_user)):
) )
@router.post("/upload/dir")
def upload_dir(user=Depends(get_admin_user)):
    """Register and process every visible file found under ``DOCS_DIR``.

    Each regular file whose name does not start with a dot gets a new UUID,
    is recorded through ``Files.insert_new_file`` (pointing at its existing
    path) and is then handed to ``process_file``. Errors for a single file are
    logged and do not stop the scan.

    Args:
        user: Resolved by the ``get_admin_user`` dependency; its ``id`` is
            recorded as the owner of each inserted file.

    Returns:
        Always ``True`` once the walk has finished.
    """
    for path in Path(DOCS_DIR).rglob("./**/*"):
        # Skip directories and dot-files.
        if not path.is_file() or path.name.startswith("."):
            continue

        try:
            log.debug(f"Processing file from path: {path}")

            filename = path.name
            # guess_type returns a (type, encoding) tuple; the metadata field
            # must hold only the type string (or None), not the whole tuple.
            content_type, _encoding = mimetypes.guess_type(path)

            # Files are keyed by a fresh UUID rather than by their filename.
            file_id = str(uuid.uuid4())

            file = Files.insert_new_file(
                user.id,
                FileForm(
                    **{
                        "id": file_id,
                        "filename": filename,
                        "meta": {
                            "name": filename,
                            "content_type": content_type,
                            # Ask the filesystem for the size instead of
                            # reading the whole file into memory.
                            "size": path.stat().st_size,
                            "path": str(path),
                        },
                    }
                ),
            )

            try:
                process_file(ProcessFileForm(file_id=file_id))
                log.debug(f"File processed: {path}, {file.id}")
            except Exception as e:
                log.exception(e)
                log.error(f"Error processing file: {file.id}")
        except Exception as e:
            log.exception(e)

    return True
############################ ############################
# List Files # List Files
############################ ############################

View File

@ -30,6 +30,32 @@ export const uploadFile = async (token: string, file: File) => {
return res; return res;
}; };
/**
 * Asks the backend to scan its documents directory by POSTing to
 * `/files/upload/dir`.
 *
 * @param token - Bearer token sent in the `authorization` header.
 * @returns The parsed JSON body on success, or `null` when the request failed
 *   without a truthy `detail` on the caught error.
 * @throws The `detail` value of the failure when it is truthy.
 */
export const uploadDir = async (token: string) => {
	const endpoint = `${WEBUI_API_BASE_URL}/files/upload/dir`;
	const requestInit = {
		method: 'POST',
		headers: {
			Accept: 'application/json',
			authorization: `Bearer ${token}`
		}
	};

	let failure = null;
	let payload = null;

	try {
		const response = await fetch(endpoint, requestInit);
		if (!response.ok) {
			// Non-2xx responses carry their error description in the JSON body.
			throw await response.json();
		}
		payload = await response.json();
	} catch (err) {
		// Only `detail` is propagated; failures without one resolve to null.
		failure = (err as { detail?: unknown }).detail;
	}

	if (failure) {
		throw failure;
	}

	return payload;
};
export const getFiles = async (token: string = '') => { export const getFiles = async (token: string = '') => {
let error = null; let error = null;

View File

@ -342,32 +342,6 @@ export const processFile = async (
return res; return res;
}; };
/**
 * Asks the retrieval backend to process its documents directory via
 * `GET /process/dir`.
 *
 * @param token - Bearer token sent in the `authorization` header.
 * @returns The parsed JSON body on success, or `null` when the request failed
 *   without a truthy `detail` on the caught error.
 * @throws The `detail` value of the failure when it is truthy.
 */
export const processDocsDir = async (token: string) => {
	const endpoint = `${RETRIEVAL_API_BASE_URL}/process/dir`;
	const requestInit = {
		method: 'GET',
		headers: {
			Accept: 'application/json',
			authorization: `Bearer ${token}`
		}
	};

	let failure = null;
	let payload = null;

	try {
		const response = await fetch(endpoint, requestInit);
		if (!response.ok) {
			// Non-2xx responses carry their error description in the JSON body.
			throw await response.json();
		}
		payload = await response.json();
	} catch (err) {
		// Only `detail` is propagated; failures without one resolve to null.
		failure = (err as { detail?: unknown }).detail;
	}

	if (failure) {
		throw failure;
	}

	return payload;
};
export const processYoutubeVideo = async (token: string, url: string) => { export const processYoutubeVideo = async (token: string, url: string) => {
let error = null; let error = null;

View File

@ -7,7 +7,6 @@
import { import {
getQuerySettings, getQuerySettings,
processDocsDir,
updateQuerySettings, updateQuerySettings,
resetVectorDB, resetVectorDB,
getEmbeddingConfig, getEmbeddingConfig,
@ -21,7 +20,7 @@
import { knowledge, models } from '$lib/stores'; import { knowledge, models } from '$lib/stores';
import { getKnowledgeItems } from '$lib/apis/knowledge'; import { getKnowledgeItems } from '$lib/apis/knowledge';
import { deleteAllFiles, deleteFileById } from '$lib/apis/files'; import { uploadDir, deleteAllFiles, deleteFileById } from '$lib/apis/files';
import ResetUploadDirConfirmDialog from '$lib/components/common/ConfirmDialog.svelte'; import ResetUploadDirConfirmDialog from '$lib/components/common/ConfirmDialog.svelte';
import ResetVectorDBConfirmDialog from '$lib/components/common/ConfirmDialog.svelte'; import ResetVectorDBConfirmDialog from '$lib/components/common/ConfirmDialog.svelte';
@ -65,11 +64,10 @@
/**
 * Triggers a server-side scan of the documents directory and reports the
 * outcome to the user.
 *
 * `uploadDir` rejects with the backend's error detail, so the call is wrapped
 * to guarantee the loading flag is cleared and the failure is surfaced instead
 * of leaving the UI stuck in its "scanning" state.
 */
const scanHandler = async () => {
	scanDirLoading = true;

	let res = null;
	try {
		res = await uploadDir(localStorage.token);
	} catch (error) {
		toast.error(`${error}`);
	} finally {
		// Always release the loading state, even when the request failed.
		scanDirLoading = false;
	}

	if (res) {
		toast.success($i18n.t('Scan complete!'));
	}
};