From b8b994a82040a79c37bd6e73bc11f1dd264e2c61 Mon Sep 17 00:00:00 2001 From: "Timothy J. Baek" Date: Sat, 28 Sep 2024 02:49:18 +0200 Subject: [PATCH] refac --- backend/open_webui/apps/retrieval/loader/main.py | 11 ++++++++++- backend/open_webui/apps/retrieval/main.py | 2 -- backend/requirements.txt | 2 ++ pyproject.toml | 2 ++ 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/backend/open_webui/apps/retrieval/loader/main.py b/backend/open_webui/apps/retrieval/loader/main.py index f4c948b43..b435fa21f 100644 --- a/backend/open_webui/apps/retrieval/loader/main.py +++ b/backend/open_webui/apps/retrieval/loader/main.py @@ -1,5 +1,7 @@ import requests import logging +import ftfy + from langchain_community.document_loaders import ( BSHTMLLoader, @@ -122,7 +124,14 @@ class Loader: self, filename: str, file_content_type: str, file_path: str ) -> list[Document]: loader = self._get_loader(filename, file_content_type, file_path) - return loader.load() + docs = loader.load() + + return [ + Document( + page_content=ftfy.fix_text(doc.page_content), metadata=doc.metadata + ) + for doc in docs + ] def _get_loader(self, filename: str, file_content_type: str, file_path: str): file_ext = filename.split(".")[-1].lower() diff --git a/backend/open_webui/apps/retrieval/main.py b/backend/open_webui/apps/retrieval/main.py index f2f4733c5..9c2ec141f 100644 --- a/backend/open_webui/apps/retrieval/main.py +++ b/backend/open_webui/apps/retrieval/main.py @@ -725,7 +725,6 @@ def process_file( PDF_EXTRACT_IMAGES=app.state.config.PDF_EXTRACT_IMAGES, ) docs = loader.load(file.filename, file.meta.get("content_type"), file_path) - raw_content = " ".join([doc.page_content for doc in docs]) print(raw_content) @@ -872,7 +871,6 @@ def process_youtube_video(form_data: ProcessUrlForm, user=Depends(get_verified_u translation=app.state.YOUTUBE_LOADER_TRANSLATION, ) docs = loader.load() - save_docs_to_vector_db(docs, collection_name, overwrite=True) return { diff --git a/backend/requirements.txt b/backend/requirements.txt index 764e41d3d..a6933d20a 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -46,6 +46,8 @@ sentence-transformers==3.0.1 colbert-ai==0.2.21 einops==0.8.0 + +ftfy==6.2.3 pypdf==4.3.1 docx2txt==0.8 python-pptx==1.0.0 diff --git a/pyproject.toml b/pyproject.toml index d02281d52..1df284f80 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,8 @@ dependencies = [ "colbert-ai==0.2.21", "einops==0.8.0", + + "ftfy==6.2.3", "pypdf==4.3.1", "docx2txt==0.8", "python-pptx==1.0.0",