This commit is contained in:
Timothy J. Baek 2024-09-28 02:56:56 +02:00
parent b8b994a820
commit 9d2ed3d2be
4 changed files with 34 additions and 4 deletions

View File

@ -2,7 +2,6 @@ import requests
import logging import logging
import ftfy import ftfy
from langchain_community.document_loaders import ( from langchain_community.document_loaders import (
BSHTMLLoader, BSHTMLLoader,
CSVLoader, CSVLoader,
@ -24,7 +23,6 @@ from open_webui.env import SRC_LOG_LEVELS
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"]) log.setLevel(SRC_LOG_LEVELS["RAG"])
known_source_ext = [ known_source_ext = [
"go", "go",
"py", "py",

View File

@ -725,8 +725,16 @@ def process_file(
PDF_EXTRACT_IMAGES=app.state.config.PDF_EXTRACT_IMAGES, PDF_EXTRACT_IMAGES=app.state.config.PDF_EXTRACT_IMAGES,
) )
docs = loader.load(file.filename, file.meta.get("content_type"), file_path) docs = loader.load(file.filename, file.meta.get("content_type"), file_path)
raw_content = " ".join([doc.page_content for doc in docs]) raw_text_content = " ".join([doc.page_content for doc in docs])
print(raw_content)
Files.update_files_metadata_by_id(
form_data.file_id,
{
"content": {
"text": raw_text_content,
}
},
)
try: try:
result = save_docs_to_vector_db( result = save_docs_to_vector_db(

View File

@ -97,6 +97,17 @@ class FilesTable:
for file in db.query(File).filter_by(user_id=user_id).all() for file in db.query(File).filter_by(user_id=user_id).all()
] ]
def update_files_metadata_by_id(self, id: str, meta: dict) -> Optional[FileModel]:
    """Shallow-merge ``meta`` into the stored metadata of one file.

    Top-level keys in ``meta`` replace the keys of the same name already
    stored on the file; keys not mentioned in ``meta`` are kept. Nested
    dictionaries are replaced wholesale, not merged.

    Args:
        id: Primary key of the file row to update.
        meta: Metadata entries to add or overwrite.

    Returns:
        The updated file as a ``FileModel``, or ``None`` when no file has
        the given id or the update fails.
    """
    with get_db() as db:
        try:
            file = db.query(File).filter_by(id=id).first()
            if file is None:
                # Unknown id: report "not found" explicitly instead of
                # relying on an AttributeError being swallowed below.
                return None
            # A row whose meta is NULL is treated as having no metadata,
            # so the new entries are still stored rather than dropped.
            file.meta = {**(file.meta or {}), **meta}
            db.commit()
            return FileModel.model_validate(file)
        except Exception:
            # Best-effort contract kept from the original: any failure
            # is reported to the caller as None.
            return None
def delete_file_by_id(self, id: str) -> bool: def delete_file_by_id(self, id: str) -> bool:
with get_db() as db: with get_db() as db:
try: try:

View File

@ -171,6 +171,19 @@ async def get_file_content_by_id(id: str, user=Depends(get_verified_user)):
) )
@router.get("/{id}/content/text")
async def get_file_text_content_by_id(id: str, user=Depends(get_verified_user)):
    """Return the text stored under ``meta["content"]["text"]`` for a file.

    The file is only served to its owner or to a user whose role is
    ``"admin"``. A missing file and a forbidden file both produce the same
    404 response.

    Returns:
        ``{"text": <str or None>}`` -- ``None`` when the metadata holds no
        ``content``/``text`` entry.

    Raises:
        HTTPException: 404 when the file does not exist or the requester is
            neither the owner nor an admin.
    """
    record = Files.get_file_by_id(id)

    # Guard clause: reject first, so the success path reads straight through.
    if not record or (record.user_id != user.id and user.role != "admin"):
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )

    stored_content = record.meta.get("content", {})
    return {"text": stored_content.get("text", None)}
@router.get("/{id}/content/{file_name}", response_model=Optional[FileModel]) @router.get("/{id}/content/{file_name}", response_model=Optional[FileModel])
async def get_file_content_by_id(id: str, user=Depends(get_verified_user)): async def get_file_content_by_id(id: str, user=Depends(get_verified_user)):
file = Files.get_file_by_id(id) file = Files.get_file_by_id(id)