@app.get("/process/dir")
def process_docs_dir(user=Depends(get_admin_user)):
    """Index every visible file under ``DOCS_DIR`` into the vector DB.

    Each regular file whose name does not start with a dot is loaded with
    the configured ``Loader``, saved to the vector DB under a collection
    named after the first 63 characters of the file's SHA-256 digest, and,
    when no document with the sanitized filename exists yet, registered via
    ``Documents.insert_new_doc``.

    Failures are logged per file and never abort the walk.

    Returns:
        Always ``True``.
    """
    config = app.state.config

    for entry in Path(DOCS_DIR).rglob("./**/*"):
        try:
            if not entry.is_file() or entry.name.startswith("."):
                continue

            tags = extract_folders_after_data_docs(entry)
            filename = entry.name
            guessed_type = mimetypes.guess_type(entry)

            with open(entry, "rb") as stream:
                collection_name = calculate_sha256(stream)[:63]

            loader = Loader(
                engine=config.CONTENT_EXTRACTION_ENGINE,
                TIKA_SERVER_URL=config.TIKA_SERVER_URL,
                PDF_EXTRACT_IMAGES=config.PDF_EXTRACT_IMAGES,
            )
            docs = loader.load(filename, guessed_type[0], str(entry))

            try:
                if not save_docs_to_vector_db(docs, collection_name):
                    continue

                sanitized_filename = sanitize_filename(filename)
                if Documents.get_doc_by_name(sanitized_filename) is not None:
                    # Already registered; nothing more to do for this file.
                    continue

                if len(tags) != 0:
                    content = json.dumps(
                        {"tags": [{"name": tag} for tag in tags]}
                    )
                else:
                    content = "{}"

                Documents.insert_new_doc(
                    user.id,
                    DocumentForm(
                        name=sanitized_filename,
                        title=filename,
                        collection_name=collection_name,
                        filename=filename,
                        content=content,
                    ),
                )
            except Exception as e:
                log.exception(e)

        except Exception as e:
            log.exception(e)

    return True
@router.post("/upload/dir")
def upload_dir(user=Depends(get_admin_user)):
    """Register every visible file under ``DOCS_DIR`` and process it.

    Walks ``DOCS_DIR`` recursively. For each regular file whose name does
    not start with a dot, a file record is inserted via
    ``Files.insert_new_file`` (owned by the calling admin user) and the new
    record's id is handed to ``process_file``.

    Failures are logged per file and never abort the walk.

    Args:
        user: The authenticated admin user, injected by FastAPI.

    Returns:
        Always ``True``.
    """
    # rglob() is already recursive, so a plain "*" pattern visits every entry.
    for path in Path(DOCS_DIR).rglob("*"):
        if not path.is_file() or path.name.startswith("."):
            continue

        try:
            log.debug(f"Processing file from path: {path}")

            filename = path.name
            # guess_type() returns a (type, encoding) pair; only the MIME
            # type (possibly None) belongs in the "content_type" field.
            content_type, _encoding = mimetypes.guess_type(path)

            # The record is keyed by a fresh uuid; the original filename is
            # kept in both "filename" and meta["name"].
            file_id = str(uuid.uuid4())

            file = Files.insert_new_file(
                user.id,
                FileForm(
                    **{
                        "id": file_id,
                        "filename": filename,
                        "meta": {
                            "name": filename,
                            "content_type": content_type,
                            # Ask the filesystem for the size instead of
                            # reading the whole file into memory.
                            "size": path.stat().st_size,
                            "path": str(path),
                        },
                    }
                ),
            )

            try:
                process_file(ProcessFileForm(file_id=file_id))
                log.debug(f"File processed: {path}, {file_id}")
            except Exception as e:
                log.exception(e)
                # Log the local id rather than ``file.id`` so the handler
                # cannot fail itself if the insert returned nothing usable.
                log.error(f"Error processing file: {file_id}")
        except Exception as e:
            log.exception(e)

    return True
/**
 * Ask the backend to scan the documents directory via `uploadDir`.
 *
 * `uploadDir` rethrows the server's error detail on a failed request, so
 * the call is wrapped: the loading flag is always reset and the error is
 * shown to the user instead of becoming an unhandled rejection.
 */
const scanHandler = async () => {
	scanDirLoading = true;

	try {
		const res = await uploadDir(localStorage.token);

		if (res) {
			toast.success($i18n.t('Scan complete!'));
		}
	} catch (error) {
		// NOTE(review): assumes `toast` exposes `error()` alongside `success()` — confirm.
		toast.error(`${error}`);
	} finally {
		// Reset on success and failure alike so the UI never stays stuck loading.
		scanDirLoading = false;
	}
};