mirror of
https://github.com/open-webui/open-webui
synced 2025-05-01 19:44:38 +00:00
Merge pull request #12486 from FabioPolito24/text-file-handling-docling
fix: text file handling with docling
This commit is contained in:
commit
ef787e4a79
@ -184,13 +184,16 @@ class Loader:
|
||||
for doc in docs
|
||||
]
|
||||
|
||||
def _is_text_file(self, file_ext: str, file_content_type: str) -> bool:
|
||||
return file_ext in known_source_ext or (
|
||||
file_content_type and file_content_type.find("text/") >= 0
|
||||
)
|
||||
|
||||
def _get_loader(self, filename: str, file_content_type: str, file_path: str):
|
||||
file_ext = filename.split(".")[-1].lower()
|
||||
|
||||
if self.engine == "tika" and self.kwargs.get("TIKA_SERVER_URL"):
|
||||
if file_ext in known_source_ext or (
|
||||
file_content_type and file_content_type.find("text/") >= 0
|
||||
):
|
||||
if self._is_text_file(file_ext, file_content_type):
|
||||
loader = TextLoader(file_path, autodetect_encoding=True)
|
||||
else:
|
||||
loader = TikaLoader(
|
||||
@ -199,11 +202,14 @@ class Loader:
|
||||
mime_type=file_content_type,
|
||||
)
|
||||
elif self.engine == "docling" and self.kwargs.get("DOCLING_SERVER_URL"):
|
||||
loader = DoclingLoader(
|
||||
url=self.kwargs.get("DOCLING_SERVER_URL"),
|
||||
file_path=file_path,
|
||||
mime_type=file_content_type,
|
||||
)
|
||||
if self._is_text_file(file_ext, file_content_type):
|
||||
loader = TextLoader(file_path, autodetect_encoding=True)
|
||||
else:
|
||||
loader = DoclingLoader(
|
||||
url=self.kwargs.get("DOCLING_SERVER_URL"),
|
||||
file_path=file_path,
|
||||
mime_type=file_content_type,
|
||||
)
|
||||
elif (
|
||||
self.engine == "document_intelligence"
|
||||
and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT") != ""
|
||||
@ -269,9 +275,7 @@ class Loader:
|
||||
loader = UnstructuredPowerPointLoader(file_path)
|
||||
elif file_ext == "msg":
|
||||
loader = OutlookMessageLoader(file_path)
|
||||
elif file_ext in known_source_ext or (
|
||||
file_content_type and file_content_type.find("text/") >= 0
|
||||
):
|
||||
elif self._is_text_file(file_ext, file_content_type):
|
||||
loader = TextLoader(file_path, autodetect_encoding=True)
|
||||
else:
|
||||
loader = TextLoader(file_path, autodetect_encoding=True)
|
||||
|
Loading…
Reference in New Issue
Block a user