mirror of
https://github.com/open-webui/open-webui
synced 2025-06-26 18:26:48 +00:00
fix: tikaloader extract images
This commit is contained in:
parent
413297b5c7
commit
27da31dc83
@ -85,11 +85,13 @@ known_source_ext = [
|
|||||||
|
|
||||||
|
|
||||||
class TikaLoader:
|
class TikaLoader:
|
||||||
def __init__(self, url, file_path, mime_type=None):
|
def __init__(self, url, file_path, mime_type=None, extract_images=None):
|
||||||
self.url = url
|
self.url = url
|
||||||
self.file_path = file_path
|
self.file_path = file_path
|
||||||
self.mime_type = mime_type
|
self.mime_type = mime_type
|
||||||
|
|
||||||
|
self.extract_images = extract_images
|
||||||
|
|
||||||
def load(self) -> list[Document]:
|
def load(self) -> list[Document]:
|
||||||
with open(self.file_path, "rb") as f:
|
with open(self.file_path, "rb") as f:
|
||||||
data = f.read()
|
data = f.read()
|
||||||
@ -99,7 +101,7 @@ class TikaLoader:
|
|||||||
else:
|
else:
|
||||||
headers = {}
|
headers = {}
|
||||||
|
|
||||||
if self.kwargs.get("PDF_EXTRACT_IMAGES") == True:
|
if self.extract_images == True:
|
||||||
headers["X-Tika-PDFextractInlineImages"] = "true"
|
headers["X-Tika-PDFextractInlineImages"] = "true"
|
||||||
|
|
||||||
endpoint = self.url
|
endpoint = self.url
|
||||||
@ -213,6 +215,7 @@ class Loader:
|
|||||||
url=self.kwargs.get("TIKA_SERVER_URL"),
|
url=self.kwargs.get("TIKA_SERVER_URL"),
|
||||||
file_path=file_path,
|
file_path=file_path,
|
||||||
mime_type=file_content_type,
|
mime_type=file_content_type,
|
||||||
|
extract_images=self.kwargs.get("PDF_EXTRACT_IMAGES"),
|
||||||
)
|
)
|
||||||
elif self.engine == "docling" and self.kwargs.get("DOCLING_SERVER_URL"):
|
elif self.engine == "docling" and self.kwargs.get("DOCLING_SERVER_URL"):
|
||||||
if self._is_text_file(file_ext, file_content_type):
|
if self._is_text_file(file_ext, file_content_type):
|
||||||
|
Loading…
Reference in New Issue
Block a user