mirror of
https://github.com/open-webui/open-webui
synced 2025-03-22 22:07:15 +00:00
refac: process docs dir
This commit is contained in:
parent
9ad5ffb8c1
commit
a6c797d4c2
@ -44,7 +44,6 @@ from open_webui.apps.retrieval.utils import (
|
||||
query_doc_with_hybrid_search,
|
||||
)
|
||||
|
||||
from open_webui.apps.webui.models.documents import DocumentForm, Documents
|
||||
from open_webui.apps.webui.models.files import Files
|
||||
from open_webui.config import (
|
||||
BRAVE_SEARCH_API_KEY,
|
||||
@ -1100,68 +1099,6 @@ def process_web_search(form_data: SearchForm, user=Depends(get_verified_user)):
|
||||
)
|
||||
|
||||
|
||||
@app.get("/process/dir")
def process_docs_dir(user=Depends(get_admin_user)):
    """
    Walk DOCS_DIR and index every non-hidden file into the vector DB.

    For each file, a collection name is taken from the first 63 characters
    of the file's SHA-256 digest, the content is loaded through ``Loader``
    and stored with ``save_docs_to_vector_db``. When storing succeeds and no
    document with the sanitized filename exists yet, a new document record is
    inserted for the calling admin user.

    Failures on an individual file are logged and do not stop the scan.

    Returns:
        True, always, once the walk has finished.
    """
    for entry in Path(DOCS_DIR).rglob("./**/*"):
        try:
            # Skip directories and dot-files.
            if not entry.is_file() or entry.name.startswith("."):
                continue

            folder_tags = extract_folders_after_data_docs(entry)
            original_name = entry.name
            guessed_type = mimetypes.guess_type(entry)

            with open(entry, "rb") as handle:
                collection = calculate_sha256(handle)[:63]

            docs = Loader(
                engine=app.state.config.CONTENT_EXTRACTION_ENGINE,
                TIKA_SERVER_URL=app.state.config.TIKA_SERVER_URL,
                PDF_EXTRACT_IMAGES=app.state.config.PDF_EXTRACT_IMAGES,
            ).load(original_name, guessed_type[0], str(entry))

            try:
                if not save_docs_to_vector_db(docs, collection):
                    continue

                safe_name = sanitize_filename(original_name)
                if Documents.get_doc_by_name(safe_name) is not None:
                    # A record with this name is already registered.
                    continue

                # Folder-derived tags are serialized as [{"name": ...}, ...].
                if len(folder_tags):
                    content = json.dumps(
                        {"tags": [{"name": tag} for tag in folder_tags]}
                    )
                else:
                    content = "{}"

                Documents.insert_new_doc(
                    user.id,
                    DocumentForm(
                        name=safe_name,
                        title=original_name,
                        collection_name=collection,
                        filename=original_name,
                        content=content,
                    ),
                )
            except Exception as e:
                log.exception(e)

        except Exception as e:
            log.exception(e)

    return True
|
||||
|
||||
|
||||
class QueryDocForm(BaseModel):
|
||||
collection_name: str
|
||||
query: str
|
||||
|
@ -5,17 +5,21 @@ import uuid
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from pydantic import BaseModel
|
||||
import mimetypes
|
||||
|
||||
|
||||
from open_webui.apps.webui.models.files import FileForm, FileModel, Files
|
||||
from open_webui.apps.retrieval.main import process_file, ProcessFileForm
|
||||
|
||||
from open_webui.config import UPLOAD_DIR
|
||||
from open_webui.constants import ERROR_MESSAGES
|
||||
from open_webui.config import UPLOAD_DIR, DOCS_DIR
|
||||
from open_webui.env import SRC_LOG_LEVELS
|
||||
from open_webui.constants import ERROR_MESSAGES
|
||||
|
||||
|
||||
from fastapi import APIRouter, Depends, File, HTTPException, UploadFile, status
|
||||
from fastapi.responses import FileResponse, StreamingResponse
|
||||
|
||||
|
||||
from open_webui.utils.utils import get_admin_user, get_verified_user
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
@ -86,6 +90,51 @@ def upload_file(file: UploadFile = File(...), user=Depends(get_verified_user)):
|
||||
)
|
||||
|
||||
|
||||
@router.post("/upload/dir")
def upload_dir(user=Depends(get_admin_user)):
    """
    Register every non-hidden file under DOCS_DIR as a file record and
    hand it to ``process_file``.

    Each file gets a fresh UUID as its id. The stored metadata holds the
    original filename, the guessed MIME type (``None`` when it cannot be
    guessed), the size in bytes and the on-disk path.

    Failures on an individual file are logged and do not stop the scan.

    Returns:
        True, always, once the walk has finished.
    """
    for path in Path(DOCS_DIR).rglob("./**/*"):
        if path.is_file() and not path.name.startswith("."):
            try:
                log.debug(f"Processing file from path: {path}")

                filename = path.name
                # guess_type() returns a (type, encoding) tuple; only the
                # MIME type string belongs in the content_type field.
                content_type, _encoding = mimetypes.guess_type(path)

                # The record is keyed by a fresh uuid, not the filename.
                file_id = str(uuid.uuid4())

                Files.insert_new_file(
                    user.id,
                    FileForm(
                        **{
                            "id": file_id,
                            "filename": filename,
                            "meta": {
                                "name": filename,
                                "content_type": content_type,
                                # stat() gives the size without reading the
                                # whole file into memory.
                                "size": path.stat().st_size,
                                "path": str(path),
                            },
                        }
                    ),
                )

                try:
                    process_file(ProcessFileForm(file_id=file_id))
                    log.debug(f"File processed: {path}, {file_id}")
                except Exception as e:
                    log.exception(e)
                    # Log the generated id so this handler does not depend
                    # on the insert's return value.
                    log.error(f"Error processing file: {file_id}")
            except Exception as e:
                log.exception(e)
    return True
|
||||
|
||||
|
||||
############################
|
||||
# List Files
|
||||
############################
|
||||
|
@ -30,6 +30,32 @@ export const uploadFile = async (token: string, file: File) => {
|
||||
return res;
|
||||
};
|
||||
|
||||
/**
 * Sends `POST {WEBUI_API_BASE_URL}/files/upload/dir` with the given bearer token.
 *
 * @param token - Bearer token placed in the `authorization` header.
 * @returns The parsed JSON body on success, or `null` when the request failed
 *          without a `detail` value on the caught error.
 * @throws The `detail` value of the error payload when the response is not OK.
 */
export const uploadDir = async (token: string) => {
	let failure = null;
	let payload = null;

	try {
		const response = await fetch(`${WEBUI_API_BASE_URL}/files/upload/dir`, {
			method: 'POST',
			headers: {
				Accept: 'application/json',
				authorization: `Bearer ${token}`
			}
		});

		// Non-2xx responses carry a JSON error body; route it through the catch below.
		if (!response.ok) throw await response.json();

		payload = await response.json();
	} catch (err: any) {
		failure = err.detail;
	}

	if (failure) {
		throw failure;
	}

	return payload;
};
|
||||
|
||||
export const getFiles = async (token: string = '') => {
|
||||
let error = null;
|
||||
|
||||
|
@ -342,32 +342,6 @@ export const processFile = async (
|
||||
return res;
|
||||
};
|
||||
|
||||
/**
 * Sends `GET {RETRIEVAL_API_BASE_URL}/process/dir` with the given bearer token.
 *
 * @param token - Bearer token placed in the `authorization` header.
 * @returns The parsed JSON body on success, or `null` when the request failed
 *          without a `detail` value on the caught error.
 * @throws The `detail` value of the error payload when the response is not OK.
 */
export const processDocsDir = async (token: string) => {
	const endpoint = `${RETRIEVAL_API_BASE_URL}/process/dir`;
	const requestInit = {
		method: 'GET',
		headers: {
			Accept: 'application/json',
			authorization: `Bearer ${token}`
		}
	};

	let body = null;
	let problem = null;

	try {
		const reply = await fetch(endpoint, requestInit);

		// Error responses are thrown as their parsed JSON so `detail` can be extracted.
		if (!reply.ok) throw await reply.json();

		body = await reply.json();
	} catch (err: any) {
		problem = err.detail;
	}

	if (problem) {
		throw problem;
	}

	return body;
};
|
||||
|
||||
export const processYoutubeVideo = async (token: string, url: string) => {
|
||||
let error = null;
|
||||
|
||||
|
@ -7,7 +7,6 @@
|
||||
|
||||
import {
|
||||
getQuerySettings,
|
||||
processDocsDir,
|
||||
updateQuerySettings,
|
||||
resetVectorDB,
|
||||
getEmbeddingConfig,
|
||||
@ -21,7 +20,7 @@
|
||||
|
||||
import { knowledge, models } from '$lib/stores';
|
||||
import { getKnowledgeItems } from '$lib/apis/knowledge';
|
||||
import { deleteAllFiles, deleteFileById } from '$lib/apis/files';
|
||||
import { uploadDir, deleteAllFiles, deleteFileById } from '$lib/apis/files';
|
||||
|
||||
import ResetUploadDirConfirmDialog from '$lib/components/common/ConfirmDialog.svelte';
|
||||
import ResetVectorDBConfirmDialog from '$lib/components/common/ConfirmDialog.svelte';
|
||||
@ -65,11 +64,10 @@
|
||||
|
||||
/**
 * Triggers a scan of the server-side docs directory through `uploadDir`,
 * then refreshes the `knowledge` store and shows a success toast.
 *
 * `scanDirLoading` is set for the duration of the request and is always
 * cleared afterwards, including when the request is rejected.
 */
const scanHandler = async () => {
	scanDirLoading = true;

	let res = null;
	try {
		res = await uploadDir(localStorage.token);
	} catch (error) {
		// uploadDir rejects with the backend's error detail; show it to the
		// user instead of leaving an unhandled rejection.
		toast.error(`${error}`);
	} finally {
		scanDirLoading = false;
	}

	if (res) {
		await knowledge.set(await getKnowledgeItems(localStorage.token));
		toast.success($i18n.t('Scan complete!'));
	}
};
|
||||
|
Loading…
Reference in New Issue
Block a user