mirror of
https://github.com/open-webui/open-webui
synced 2025-05-29 17:52:48 +00:00
fix: fix for text file handling with docling
This commit is contained in:
parent
04799f1f95
commit
cd0a1b4852
@ -181,13 +181,16 @@ class Loader:
|
|||||||
for doc in docs
|
for doc in docs
|
||||||
]
|
]
|
||||||
|
|
||||||
|
def _is_text_file(self, file_ext: str, file_content_type: str) -> bool:
    """Return whether a file should be loaded as plain text.

    A file counts as text when its extension is listed in the module-level
    ``known_source_ext`` collection, or when its content type contains the
    substring ``"text/"``.

    Args:
        file_ext: File extension to look up in ``known_source_ext``.
        file_content_type: Content type reported for the file. A falsy
            value (``None`` or ``""``) is treated as "not text".

    Returns:
        ``True`` if the file is considered text, ``False`` otherwise.
        Always a real ``bool``, never the falsy content type itself.
    """
    if file_ext in known_source_ext:
        return True
    # Coerce explicitly: a bare ``x and ...`` would hand back ``None``/``""``
    # when the content type is missing, contradicting the ``-> bool``
    # annotation.
    return bool(file_content_type) and "text/" in file_content_type
|
||||||
|
|
||||||
def _get_loader(self, filename: str, file_content_type: str, file_path: str):
|
def _get_loader(self, filename: str, file_content_type: str, file_path: str):
|
||||||
file_ext = filename.split(".")[-1].lower()
|
file_ext = filename.split(".")[-1].lower()
|
||||||
|
|
||||||
if self.engine == "tika" and self.kwargs.get("TIKA_SERVER_URL"):
|
if self.engine == "tika" and self.kwargs.get("TIKA_SERVER_URL"):
|
||||||
if file_ext in known_source_ext or (
|
if self._is_text_file(file_ext, file_content_type):
|
||||||
file_content_type and file_content_type.find("text/") >= 0
|
|
||||||
):
|
|
||||||
loader = TextLoader(file_path, autodetect_encoding=True)
|
loader = TextLoader(file_path, autodetect_encoding=True)
|
||||||
else:
|
else:
|
||||||
loader = TikaLoader(
|
loader = TikaLoader(
|
||||||
@ -196,11 +199,14 @@ class Loader:
|
|||||||
mime_type=file_content_type,
|
mime_type=file_content_type,
|
||||||
)
|
)
|
||||||
elif self.engine == "docling" and self.kwargs.get("DOCLING_SERVER_URL"):
|
elif self.engine == "docling" and self.kwargs.get("DOCLING_SERVER_URL"):
|
||||||
loader = DoclingLoader(
|
if self._is_text_file(file_ext, file_content_type):
|
||||||
url=self.kwargs.get("DOCLING_SERVER_URL"),
|
loader = TextLoader(file_path, autodetect_encoding=True)
|
||||||
file_path=file_path,
|
else:
|
||||||
mime_type=file_content_type,
|
loader = DoclingLoader(
|
||||||
)
|
url=self.kwargs.get("DOCLING_SERVER_URL"),
|
||||||
|
file_path=file_path,
|
||||||
|
mime_type=file_content_type,
|
||||||
|
)
|
||||||
elif (
|
elif (
|
||||||
self.engine == "document_intelligence"
|
self.engine == "document_intelligence"
|
||||||
and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT") != ""
|
and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT") != ""
|
||||||
@ -257,9 +263,7 @@ class Loader:
|
|||||||
loader = UnstructuredPowerPointLoader(file_path)
|
loader = UnstructuredPowerPointLoader(file_path)
|
||||||
elif file_ext == "msg":
|
elif file_ext == "msg":
|
||||||
loader = OutlookMessageLoader(file_path)
|
loader = OutlookMessageLoader(file_path)
|
||||||
elif file_ext in known_source_ext or (
|
elif self._is_text_file(file_ext, file_content_type):
|
||||||
file_content_type and file_content_type.find("text/") >= 0
|
|
||||||
):
|
|
||||||
loader = TextLoader(file_path, autodetect_encoding=True)
|
loader = TextLoader(file_path, autodetect_encoding=True)
|
||||||
else:
|
else:
|
||||||
loader = TextLoader(file_path, autodetect_encoding=True)
|
loader = TextLoader(file_path, autodetect_encoding=True)
|
||||||
|
Loading…
Reference in New Issue
Block a user