fix: tikaloader extract images

This commit is contained in:
Timothy Jaeryang Baek 2025-05-05 23:40:34 +04:00
parent 413297b5c7
commit 27da31dc83

View File

@@ -85,11 +85,13 @@ known_source_ext = [
class TikaLoader: class TikaLoader:
def __init__(self, url, file_path, mime_type=None): def __init__(self, url, file_path, mime_type=None, extract_images=None):
self.url = url self.url = url
self.file_path = file_path self.file_path = file_path
self.mime_type = mime_type self.mime_type = mime_type
self.extract_images = extract_images
def load(self) -> list[Document]: def load(self) -> list[Document]:
with open(self.file_path, "rb") as f: with open(self.file_path, "rb") as f:
data = f.read() data = f.read()
@@ -99,7 +101,7 @@ class TikaLoader:
else: else:
headers = {} headers = {}
if self.kwargs.get("PDF_EXTRACT_IMAGES") == True: if self.extract_images == True:
headers["X-Tika-PDFextractInlineImages"] = "true" headers["X-Tika-PDFextractInlineImages"] = "true"
endpoint = self.url endpoint = self.url
@@ -213,6 +215,7 @@ class Loader:
url=self.kwargs.get("TIKA_SERVER_URL"), url=self.kwargs.get("TIKA_SERVER_URL"),
file_path=file_path, file_path=file_path,
mime_type=file_content_type, mime_type=file_content_type,
extract_images=self.kwargs.get("PDF_EXTRACT_IMAGES"),
) )
elif self.engine == "docling" and self.kwargs.get("DOCLING_SERVER_URL"): elif self.engine == "docling" and self.kwargs.get("DOCLING_SERVER_URL"):
if self._is_text_file(file_ext, file_content_type): if self._is_text_file(file_ext, file_content_type):