fix: fix DoclingLoader input params

This commit is contained in:
Fabio Polito 2025-03-05 17:53:45 +00:00
parent 2419ef06a0
commit a44b35e99e

View File

@@ -126,24 +126,43 @@ class DoclingLoader:
raise ValueError("File path is required for DoclingLoader") raise ValueError("File path is required for DoclingLoader")
with open(self.file_path, "rb") as f: with open(self.file_path, "rb") as f:
files = {"files": (self.file_path, f, self.mime_type or "application/octet-stream")} files = {
"files": (
self.file_path,
f,
self.mime_type or "application/octet-stream",
)
}
params = { params = {
"from_formats": ["docx", "pptx", "html", "xml_pubmed", "image", "pdf", "asciidoc", "md", "xlsx", "xml_uspto", "json_docling"], "from_formats": [
"to_formats": ["md"], "docx",
"image_export_mode": "placeholder", "pptx",
"do_ocr": True, "html",
"force_ocr": False, "image",
"ocr_engine": "easyocr", "pdf",
"ocr_lang": None, "asciidoc",
"pdf_backend": "dlparse_v2", "md",
"table_mode": "fast", "csv",
"abort_on_error": False, "xlsx",
"return_as_file": False, "xml_uspto",
"do_table_structure": True, "xml_jats",
"include_images": True, "json_docling",
"images_scale": 2.0, ],
} "to_formats": ["md"],
"image_export_mode": "placeholder",
"do_ocr": True,
"force_ocr": False,
"ocr_engine": "easyocr",
"ocr_lang": None,
"pdf_backend": "dlparse_v2",
"table_mode": "accurate",
"abort_on_error": False,
"return_as_file": False,
"do_table_structure": True,
"include_images": True,
"images_scale": 2.0,
}
endpoint = f"{self.url}/v1alpha/convert/file" endpoint = f"{self.url}/v1alpha/convert/file"
response = requests.post(endpoint, files=files, data=params) response = requests.post(endpoint, files=files, data=params)
@@ -154,7 +173,7 @@ class DoclingLoader:
text = document_data.get("md_content", "<No text content found>") text = document_data.get("md_content", "<No text content found>")
metadata = {"Content-Type": self.mime_type} if self.mime_type else {} metadata = {"Content-Type": self.mime_type} if self.mime_type else {}
log.debug("Docling extracted text: %s", text) log.debug("Docling extracted text: %s", text)
return [Document(page_content=text, metadata=metadata)] return [Document(page_content=text, metadata=metadata)]