diff --git a/backend/open_webui/retrieval/loaders/main.py b/backend/open_webui/retrieval/loaders/main.py index 295d0414a..71c80f52b 100644 --- a/backend/open_webui/retrieval/loaders/main.py +++ b/backend/open_webui/retrieval/loaders/main.py @@ -181,13 +181,16 @@ class Loader: for doc in docs ] + def _is_text_file(self, file_ext: str, file_content_type: str) -> bool: + return file_ext in known_source_ext or ( + file_content_type and file_content_type.find("text/") >= 0 + ) + def _get_loader(self, filename: str, file_content_type: str, file_path: str): file_ext = filename.split(".")[-1].lower() if self.engine == "tika" and self.kwargs.get("TIKA_SERVER_URL"): - if file_ext in known_source_ext or ( - file_content_type and file_content_type.find("text/") >= 0 - ): + if self._is_text_file(file_ext, file_content_type): loader = TextLoader(file_path, autodetect_encoding=True) else: loader = TikaLoader( @@ -196,11 +199,14 @@ class Loader: mime_type=file_content_type, ) elif self.engine == "docling" and self.kwargs.get("DOCLING_SERVER_URL"): - loader = DoclingLoader( - url=self.kwargs.get("DOCLING_SERVER_URL"), - file_path=file_path, - mime_type=file_content_type, - ) + if self._is_text_file(file_ext, file_content_type): + loader = TextLoader(file_path, autodetect_encoding=True) + else: + loader = DoclingLoader( + url=self.kwargs.get("DOCLING_SERVER_URL"), + file_path=file_path, + mime_type=file_content_type, + ) elif ( self.engine == "document_intelligence" and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT") != "" @@ -257,9 +263,7 @@ class Loader: loader = UnstructuredPowerPointLoader(file_path) elif file_ext == "msg": loader = OutlookMessageLoader(file_path) - elif file_ext in known_source_ext or ( - file_content_type and file_content_type.find("text/") >= 0 - ): + elif self._is_text_file(file_ext, file_content_type): loader = TextLoader(file_path, autodetect_encoding=True) else: loader = TextLoader(file_path, autodetect_encoding=True)