mirror of
https://github.com/open-webui/open-webui
synced 2025-05-23 06:14:25 +00:00
Merge pull request #12486 from FabioPolito24/text-file-handling-docling
fix: text file handling with docling
This commit is contained in:
commit
ef787e4a79
@ -184,13 +184,16 @@ class Loader:
|
|||||||
for doc in docs
|
for doc in docs
|
||||||
]
|
]
|
||||||
|
|
||||||
|
def _is_text_file(self, file_ext: str, file_content_type: str) -> bool:
    """Return whether a file should be handled as plain text.

    A file counts as text when its extension is listed in
    ``known_source_ext`` or when its content type contains ``"text/"``.

    Args:
        file_ext: File extension to look up in ``known_source_ext``.
        file_content_type: Content type reported for the file. A falsy
            value (``None`` or ``""``) means no content type is available.

    Returns:
        ``True`` if either check matches, ``False`` otherwise.
    """
    if file_ext in known_source_ext:
        return True
    # Coerce to bool: a bare ``x and ...`` would hand back the falsy content
    # type itself (None or "") rather than the annotated bool.
    return bool(file_content_type and "text/" in file_content_type)
|
||||||
|
|
||||||
def _get_loader(self, filename: str, file_content_type: str, file_path: str):
|
def _get_loader(self, filename: str, file_content_type: str, file_path: str):
|
||||||
file_ext = filename.split(".")[-1].lower()
|
file_ext = filename.split(".")[-1].lower()
|
||||||
|
|
||||||
if self.engine == "tika" and self.kwargs.get("TIKA_SERVER_URL"):
|
if self.engine == "tika" and self.kwargs.get("TIKA_SERVER_URL"):
|
||||||
if file_ext in known_source_ext or (
|
if self._is_text_file(file_ext, file_content_type):
|
||||||
file_content_type and file_content_type.find("text/") >= 0
|
|
||||||
):
|
|
||||||
loader = TextLoader(file_path, autodetect_encoding=True)
|
loader = TextLoader(file_path, autodetect_encoding=True)
|
||||||
else:
|
else:
|
||||||
loader = TikaLoader(
|
loader = TikaLoader(
|
||||||
@ -199,11 +202,14 @@ class Loader:
|
|||||||
mime_type=file_content_type,
|
mime_type=file_content_type,
|
||||||
)
|
)
|
||||||
elif self.engine == "docling" and self.kwargs.get("DOCLING_SERVER_URL"):
|
elif self.engine == "docling" and self.kwargs.get("DOCLING_SERVER_URL"):
|
||||||
loader = DoclingLoader(
|
if self._is_text_file(file_ext, file_content_type):
|
||||||
url=self.kwargs.get("DOCLING_SERVER_URL"),
|
loader = TextLoader(file_path, autodetect_encoding=True)
|
||||||
file_path=file_path,
|
else:
|
||||||
mime_type=file_content_type,
|
loader = DoclingLoader(
|
||||||
)
|
url=self.kwargs.get("DOCLING_SERVER_URL"),
|
||||||
|
file_path=file_path,
|
||||||
|
mime_type=file_content_type,
|
||||||
|
)
|
||||||
elif (
|
elif (
|
||||||
self.engine == "document_intelligence"
|
self.engine == "document_intelligence"
|
||||||
and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT") != ""
|
and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT") != ""
|
||||||
@ -269,9 +275,7 @@ class Loader:
|
|||||||
loader = UnstructuredPowerPointLoader(file_path)
|
loader = UnstructuredPowerPointLoader(file_path)
|
||||||
elif file_ext == "msg":
|
elif file_ext == "msg":
|
||||||
loader = OutlookMessageLoader(file_path)
|
loader = OutlookMessageLoader(file_path)
|
||||||
elif file_ext in known_source_ext or (
|
elif self._is_text_file(file_ext, file_content_type):
|
||||||
file_content_type and file_content_type.find("text/") >= 0
|
|
||||||
):
|
|
||||||
loader = TextLoader(file_path, autodetect_encoding=True)
|
loader = TextLoader(file_path, autodetect_encoding=True)
|
||||||
else:
|
else:
|
||||||
loader = TextLoader(file_path, autodetect_encoding=True)
|
loader = TextLoader(file_path, autodetect_encoding=True)
|
||||||
|
Loading…
Reference in New Issue
Block a user