This commit is contained in:
Timothy J. Baek 2024-09-28 02:49:18 +02:00
parent 00eb022450
commit b8b994a820
4 changed files with 14 additions and 3 deletions

View File

@@ -1,5 +1,7 @@
import requests
import logging
import ftfy
from langchain_community.document_loaders import (
BSHTMLLoader,
@@ -122,7 +124,14 @@ class Loader:
self, filename: str, file_content_type: str, file_path: str
) -> list[Document]:
loader = self._get_loader(filename, file_content_type, file_path)
return loader.load()
docs = loader.load()
return [
Document(
page_content=ftfy.fix_text(doc.page_content), metadata=doc.metadata
)
for doc in docs
]
def _get_loader(self, filename: str, file_content_type: str, file_path: str):
file_ext = filename.split(".")[-1].lower()

View File

@@ -725,7 +725,6 @@ def process_file(
PDF_EXTRACT_IMAGES=app.state.config.PDF_EXTRACT_IMAGES,
)
docs = loader.load(file.filename, file.meta.get("content_type"), file_path)
raw_content = " ".join([doc.page_content for doc in docs])
print(raw_content)
@@ -872,7 +871,6 @@ def process_youtube_video(form_data: ProcessUrlForm, user=Depends(get_verified_u
translation=app.state.YOUTUBE_LOADER_TRANSLATION,
)
docs = loader.load()
save_docs_to_vector_db(docs, collection_name, overwrite=True)
return {

View File

@@ -46,6 +46,8 @@ sentence-transformers==3.0.1
colbert-ai==0.2.21
einops==0.8.0
ftfy==6.2.3
pypdf==4.3.1
docx2txt==0.8
python-pptx==1.0.0

View File

@@ -53,6 +53,8 @@ dependencies = [
"colbert-ai==0.2.21",
"einops==0.8.0",
"ftfy==6.2.3",
"pypdf==4.3.1",
"docx2txt==0.8",
"python-pptx==1.0.0",