mirror of
https://github.com/open-webui/open-webui
synced 2025-03-24 14:40:51 +00:00
refac: process docs dir
This commit is contained in:
parent
9ad5ffb8c1
commit
a6c797d4c2
backend/open_webui/apps
src/lib
@ -44,7 +44,6 @@ from open_webui.apps.retrieval.utils import (
|
|||||||
query_doc_with_hybrid_search,
|
query_doc_with_hybrid_search,
|
||||||
)
|
)
|
||||||
|
|
||||||
from open_webui.apps.webui.models.documents import DocumentForm, Documents
|
|
||||||
from open_webui.apps.webui.models.files import Files
|
from open_webui.apps.webui.models.files import Files
|
||||||
from open_webui.config import (
|
from open_webui.config import (
|
||||||
BRAVE_SEARCH_API_KEY,
|
BRAVE_SEARCH_API_KEY,
|
||||||
@ -1100,68 +1099,6 @@ def process_web_search(form_data: SearchForm, user=Depends(get_verified_user)):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@app.get("/process/dir")
def process_docs_dir(user=Depends(get_admin_user)):
    """Scan ``DOCS_DIR`` and index every visible file into the vector store.

    For each regular file whose name does not start with a dot, the file is
    loaded through ``Loader`` and saved with ``save_docs_to_vector_db`` under
    a collection named after the first 63 characters of the value returned by
    ``calculate_sha256``. When saving succeeds and no document is registered
    under the sanitized file name, a new record is inserted through
    ``Documents.insert_new_doc``.

    Every failure is logged and the scan moves on to the next path.

    Args:
        user: Resolved through the ``get_admin_user`` dependency; only
            ``user.id`` is read, as the owner of newly inserted documents.

    Returns:
        Always ``True``, regardless of how many files failed.
    """
    for entry in Path(DOCS_DIR).rglob("./**/*"):
        try:
            # Directories and dotfiles are not indexed.
            if not entry.is_file() or entry.name.startswith("."):
                continue

            folder_tags = extract_folders_after_data_docs(entry)
            filename = entry.name
            guessed_type = mimetypes.guess_type(entry)

            with open(entry, "rb") as handle:
                collection_name = calculate_sha256(handle)[:63]

            config = app.state.config
            loader = Loader(
                engine=config.CONTENT_EXTRACTION_ENGINE,
                TIKA_SERVER_URL=config.TIKA_SERVER_URL,
                PDF_EXTRACT_IMAGES=config.PDF_EXTRACT_IMAGES,
            )
            # guess_type() yields (type, encoding); the loader gets the type.
            docs = loader.load(filename, guessed_type[0], str(entry))

            try:
                if save_docs_to_vector_db(docs, collection_name):
                    doc_name = sanitize_filename(filename)

                    # Only register the document the first time it is seen.
                    if Documents.get_doc_by_name(doc_name) is None:
                        if len(folder_tags) > 0:
                            content = json.dumps(
                                {"tags": [{"name": tag} for tag in folder_tags]}
                            )
                        else:
                            content = "{}"

                        Documents.insert_new_doc(
                            user.id,
                            DocumentForm(
                                name=doc_name,
                                title=filename,
                                collection_name=collection_name,
                                filename=filename,
                                content=content,
                            ),
                        )
            except Exception as e:
                # A storage failure for one file must not stop the scan.
                log.exception(e)

        except Exception as e:
            log.exception(e)

    return True
|
|
||||||
|
|
||||||
|
|
||||||
class QueryDocForm(BaseModel):
|
class QueryDocForm(BaseModel):
|
||||||
collection_name: str
|
collection_name: str
|
||||||
query: str
|
query: str
|
||||||
|
@ -5,17 +5,21 @@ import uuid
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
import mimetypes
|
||||||
|
|
||||||
|
|
||||||
from open_webui.apps.webui.models.files import FileForm, FileModel, Files
|
from open_webui.apps.webui.models.files import FileForm, FileModel, Files
|
||||||
from open_webui.apps.retrieval.main import process_file, ProcessFileForm
|
from open_webui.apps.retrieval.main import process_file, ProcessFileForm
|
||||||
|
|
||||||
from open_webui.config import UPLOAD_DIR
|
from open_webui.config import UPLOAD_DIR, DOCS_DIR
|
||||||
from open_webui.constants import ERROR_MESSAGES
|
|
||||||
from open_webui.env import SRC_LOG_LEVELS
|
from open_webui.env import SRC_LOG_LEVELS
|
||||||
|
from open_webui.constants import ERROR_MESSAGES
|
||||||
|
|
||||||
|
|
||||||
from fastapi import APIRouter, Depends, File, HTTPException, UploadFile, status
|
from fastapi import APIRouter, Depends, File, HTTPException, UploadFile, status
|
||||||
from fastapi.responses import FileResponse, StreamingResponse
|
from fastapi.responses import FileResponse, StreamingResponse
|
||||||
|
|
||||||
|
|
||||||
from open_webui.utils.utils import get_admin_user, get_verified_user
|
from open_webui.utils.utils import get_admin_user, get_verified_user
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
@ -86,6 +90,51 @@ def upload_file(file: UploadFile = File(...), user=Depends(get_verified_user)):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/upload/dir")
def upload_dir(user=Depends(get_admin_user)):
    """Register and process every visible file found under ``DOCS_DIR``.

    Each regular file whose name does not start with a dot gets a new
    UUID-keyed record through ``Files.insert_new_file`` and is then handed to
    ``process_file``. Files are referenced in place: the stored ``path`` is
    the file's existing location and nothing is copied.

    Failures are logged per file and never abort the scan.

    Args:
        user: Resolved through the ``get_admin_user`` dependency; only
            ``user.id`` is read, as the owner of the inserted file records.

    Returns:
        Always ``True``, regardless of how many files failed.
    """
    for path in Path(DOCS_DIR).rglob("./**/*"):
        # Directories and dotfiles are not uploaded.
        if not path.is_file() or path.name.startswith("."):
            continue

        try:
            log.debug(f"Processing file from path: {path}")

            filename = path.name
            # guess_type() returns a (type, encoding) tuple; only the MIME
            # type string (or None when unknown) belongs in the metadata.
            content_type, _ = mimetypes.guess_type(path)

            # The record is keyed by a fresh UUID rather than the file name.
            file_id = str(uuid.uuid4())

            Files.insert_new_file(
                user.id,
                FileForm(
                    **{
                        "id": file_id,
                        "filename": filename,
                        "meta": {
                            "name": filename,
                            "content_type": content_type,
                            # stat() gives the size without loading the
                            # whole file into memory.
                            "size": path.stat().st_size,
                            "path": str(path),
                        },
                    }
                ),
            )

            try:
                process_file(ProcessFileForm(file_id=file_id))
                log.debug(f"File processed: {path}, {file_id}")
            except Exception as e:
                # Log with the locally generated id so reporting the failure
                # cannot itself fail if the insert returned nothing.
                log.exception(e)
                log.error(f"Error processing file: {file_id}")
        except Exception as e:
            log.exception(e)

    return True
|
||||||
|
|
||||||
|
|
||||||
############################
|
############################
|
||||||
# List Files
|
# List Files
|
||||||
############################
|
############################
|
||||||
|
@ -30,6 +30,32 @@ export const uploadFile = async (token: string, file: File) => {
|
|||||||
return res;
|
return res;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
 * Ask the backend to scan its docs directory and register every file in it.
 *
 * Sends `POST {WEBUI_API_BASE_URL}/files/upload/dir` with the bearer token.
 *
 * @param token - Bearer token sent in the `authorization` header.
 * @returns The parsed JSON body on success, or `null` when the request failed
 *          without a `detail` field on the error.
 * @throws The `detail` value of the error payload when it is truthy.
 */
export const uploadDir = async (token: string) => {
	let payload = null;
	let failure = null;

	try {
		const response = await fetch(`${WEBUI_API_BASE_URL}/files/upload/dir`, {
			method: 'POST',
			headers: {
				Accept: 'application/json',
				authorization: `Bearer ${token}`
			}
		});

		const body = await response.json();
		// A non-2xx response carries the error payload as its JSON body.
		if (!response.ok) throw body;
		payload = body;
	} catch (err: any) {
		// Network and parse errors have no `detail`, so they yield `null` below.
		failure = err.detail;
	}

	if (failure) {
		throw failure;
	}

	return payload;
};
|
||||||
|
|
||||||
export const getFiles = async (token: string = '') => {
|
export const getFiles = async (token: string = '') => {
|
||||||
let error = null;
|
let error = null;
|
||||||
|
|
||||||
|
@ -342,32 +342,6 @@ export const processFile = async (
|
|||||||
return res;
|
return res;
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
 * Ask the retrieval backend to process every file in its docs directory.
 *
 * Sends `GET {RETRIEVAL_API_BASE_URL}/process/dir` with the bearer token.
 *
 * @param token - Bearer token sent in the `authorization` header.
 * @returns The parsed JSON body on success, or `null` when the request failed
 *          without a `detail` field on the error.
 * @throws The `detail` value of the error payload when it is truthy.
 */
export const processDocsDir = async (token: string) => {
	let payload = null;
	let failure = null;

	try {
		const response = await fetch(`${RETRIEVAL_API_BASE_URL}/process/dir`, {
			method: 'GET',
			headers: {
				Accept: 'application/json',
				authorization: `Bearer ${token}`
			}
		});

		const body = await response.json();
		// A non-2xx response carries the error payload as its JSON body.
		if (!response.ok) throw body;
		payload = body;
	} catch (err: any) {
		// Network and parse errors have no `detail`, so they yield `null` below.
		failure = err.detail;
	}

	if (failure) {
		throw failure;
	}

	return payload;
};
|
|
||||||
|
|
||||||
export const processYoutubeVideo = async (token: string, url: string) => {
|
export const processYoutubeVideo = async (token: string, url: string) => {
|
||||||
let error = null;
|
let error = null;
|
||||||
|
|
||||||
|
@ -7,7 +7,6 @@
|
|||||||
|
|
||||||
import {
|
import {
|
||||||
getQuerySettings,
|
getQuerySettings,
|
||||||
processDocsDir,
|
|
||||||
updateQuerySettings,
|
updateQuerySettings,
|
||||||
resetVectorDB,
|
resetVectorDB,
|
||||||
getEmbeddingConfig,
|
getEmbeddingConfig,
|
||||||
@ -21,7 +20,7 @@
|
|||||||
|
|
||||||
import { knowledge, models } from '$lib/stores';
|
import { knowledge, models } from '$lib/stores';
|
||||||
import { getKnowledgeItems } from '$lib/apis/knowledge';
|
import { getKnowledgeItems } from '$lib/apis/knowledge';
|
||||||
import { deleteAllFiles, deleteFileById } from '$lib/apis/files';
|
import { uploadDir, deleteAllFiles, deleteFileById } from '$lib/apis/files';
|
||||||
|
|
||||||
import ResetUploadDirConfirmDialog from '$lib/components/common/ConfirmDialog.svelte';
|
import ResetUploadDirConfirmDialog from '$lib/components/common/ConfirmDialog.svelte';
|
||||||
import ResetVectorDBConfirmDialog from '$lib/components/common/ConfirmDialog.svelte';
|
import ResetVectorDBConfirmDialog from '$lib/components/common/ConfirmDialog.svelte';
|
||||||
@ -65,11 +64,10 @@
|
|||||||
|
|
||||||
const scanHandler = async () => {
|
const scanHandler = async () => {
|
||||||
scanDirLoading = true;
|
scanDirLoading = true;
|
||||||
const res = await processDocsDir(localStorage.token);
|
const res = await uploadDir(localStorage.token);
|
||||||
scanDirLoading = false;
|
scanDirLoading = false;
|
||||||
|
|
||||||
if (res) {
|
if (res) {
|
||||||
await knowledge.set(await getKnowledgeItems(localStorage.token));
|
|
||||||
toast.success($i18n.t('Scan complete!'));
|
toast.success($i18n.t('Scan complete!'));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
Loading…
Reference in New Issue
Block a user