diff --git a/backend/open_webui/apps/retrieval/loader/main.py b/backend/open_webui/apps/retrieval/loader/main.py index b435fa21f..f0e8f804e 100644 --- a/backend/open_webui/apps/retrieval/loader/main.py +++ b/backend/open_webui/apps/retrieval/loader/main.py @@ -2,7 +2,6 @@ import requests import logging import ftfy - from langchain_community.document_loaders import ( BSHTMLLoader, CSVLoader, @@ -24,7 +23,6 @@ from open_webui.env import SRC_LOG_LEVELS log = logging.getLogger(__name__) log.setLevel(SRC_LOG_LEVELS["RAG"]) - known_source_ext = [ "go", "py", diff --git a/backend/open_webui/apps/retrieval/main.py b/backend/open_webui/apps/retrieval/main.py index 9c2ec141f..a3e828978 100644 --- a/backend/open_webui/apps/retrieval/main.py +++ b/backend/open_webui/apps/retrieval/main.py @@ -725,8 +725,16 @@ def process_file( PDF_EXTRACT_IMAGES=app.state.config.PDF_EXTRACT_IMAGES, ) docs = loader.load(file.filename, file.meta.get("content_type"), file_path) - raw_content = " ".join([doc.page_content for doc in docs]) - print(raw_content) + raw_text_content = " ".join([doc.page_content for doc in docs]) + + Files.update_files_metadata_by_id( + form_data.file_id, + { + "content": { + "text": raw_text_content, + } + }, + ) try: result = save_docs_to_vector_db( diff --git a/backend/open_webui/apps/webui/models/files.py b/backend/open_webui/apps/webui/models/files.py index 7fba74479..cf572ac78 100644 --- a/backend/open_webui/apps/webui/models/files.py +++ b/backend/open_webui/apps/webui/models/files.py @@ -97,6 +97,17 @@ class FilesTable: for file in db.query(File).filter_by(user_id=user_id).all() ] + def update_files_metadata_by_id(self, id: str, meta: dict) -> Optional[FileModel]: + with get_db() as db: + try: + file = db.query(File).filter_by(id=id).first() + file.meta = {**file.meta, **meta} + db.commit() + + return FileModel.model_validate(file) + except Exception: + return None + def delete_file_by_id(self, id: str) -> bool: with get_db() as db: try: diff --git a/backend/open_webui/apps/webui/routers/files.py b/backend/open_webui/apps/webui/routers/files.py index 1a326bcd8..f46a7992d 100644 --- a/backend/open_webui/apps/webui/routers/files.py +++ b/backend/open_webui/apps/webui/routers/files.py @@ -171,6 +171,19 @@ async def get_file_content_by_id(id: str, user=Depends(get_verified_user)): ) +@router.get("/{id}/content/text") +async def get_file_text_content_by_id(id: str, user=Depends(get_verified_user)): + file = Files.get_file_by_id(id) + + if file and (file.user_id == user.id or user.role == "admin"): + return {"text": file.meta.get("content", {}).get("text", None)} + else: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=ERROR_MESSAGES.NOT_FOUND, + ) + + @router.get("/{id}/content/{file_name}", response_model=Optional[FileModel]) async def get_file_content_by_id(id: str, user=Depends(get_verified_user)): file = Files.get_file_by_id(id)