mirror of
https://github.com/open-webui/open-webui
synced 2024-11-16 05:24:02 +00:00
refac
This commit is contained in:
parent
00eb022450
commit
b8b994a820
@ -1,5 +1,7 @@
|
|||||||
import requests
|
import requests
|
||||||
import logging
|
import logging
|
||||||
|
import ftfy
|
||||||
|
|
||||||
|
|
||||||
from langchain_community.document_loaders import (
|
from langchain_community.document_loaders import (
|
||||||
BSHTMLLoader,
|
BSHTMLLoader,
|
||||||
@ -122,7 +124,14 @@ class Loader:
|
|||||||
self, filename: str, file_content_type: str, file_path: str
|
self, filename: str, file_content_type: str, file_path: str
|
||||||
) -> list[Document]:
|
) -> list[Document]:
|
||||||
loader = self._get_loader(filename, file_content_type, file_path)
|
loader = self._get_loader(filename, file_content_type, file_path)
|
||||||
return loader.load()
|
docs = loader.load()
|
||||||
|
|
||||||
|
return [
|
||||||
|
Document(
|
||||||
|
page_content=ftfy.fix_text(doc.page_content), metadata=doc.metadata
|
||||||
|
)
|
||||||
|
for doc in docs
|
||||||
|
]
|
||||||
|
|
||||||
def _get_loader(self, filename: str, file_content_type: str, file_path: str):
|
def _get_loader(self, filename: str, file_content_type: str, file_path: str):
|
||||||
file_ext = filename.split(".")[-1].lower()
|
file_ext = filename.split(".")[-1].lower()
|
||||||
|
@ -725,7 +725,6 @@ def process_file(
|
|||||||
PDF_EXTRACT_IMAGES=app.state.config.PDF_EXTRACT_IMAGES,
|
PDF_EXTRACT_IMAGES=app.state.config.PDF_EXTRACT_IMAGES,
|
||||||
)
|
)
|
||||||
docs = loader.load(file.filename, file.meta.get("content_type"), file_path)
|
docs = loader.load(file.filename, file.meta.get("content_type"), file_path)
|
||||||
|
|
||||||
raw_content = " ".join([doc.page_content for doc in docs])
|
raw_content = " ".join([doc.page_content for doc in docs])
|
||||||
print(raw_content)
|
print(raw_content)
|
||||||
|
|
||||||
@ -872,7 +871,6 @@ def process_youtube_video(form_data: ProcessUrlForm, user=Depends(get_verified_u
|
|||||||
translation=app.state.YOUTUBE_LOADER_TRANSLATION,
|
translation=app.state.YOUTUBE_LOADER_TRANSLATION,
|
||||||
)
|
)
|
||||||
docs = loader.load()
|
docs = loader.load()
|
||||||
|
|
||||||
save_docs_to_vector_db(docs, collection_name, overwrite=True)
|
save_docs_to_vector_db(docs, collection_name, overwrite=True)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
@ -46,6 +46,8 @@ sentence-transformers==3.0.1
|
|||||||
colbert-ai==0.2.21
|
colbert-ai==0.2.21
|
||||||
einops==0.8.0
|
einops==0.8.0
|
||||||
|
|
||||||
|
|
||||||
|
ftfy==6.2.3
|
||||||
pypdf==4.3.1
|
pypdf==4.3.1
|
||||||
docx2txt==0.8
|
docx2txt==0.8
|
||||||
python-pptx==1.0.0
|
python-pptx==1.0.0
|
||||||
|
@ -53,6 +53,8 @@ dependencies = [
|
|||||||
"colbert-ai==0.2.21",
|
"colbert-ai==0.2.21",
|
||||||
"einops==0.8.0",
|
"einops==0.8.0",
|
||||||
|
|
||||||
|
|
||||||
|
"ftfy==6.2.3",
|
||||||
"pypdf==4.3.1",
|
"pypdf==4.3.1",
|
||||||
"docx2txt==0.8",
|
"docx2txt==0.8",
|
||||||
"python-pptx==1.0.0",
|
"python-pptx==1.0.0",
|
||||||
|
Loading…
Reference in New Issue
Block a user