mirror of
https://github.com/open-webui/open-webui
synced 2025-05-21 13:36:35 +00:00
fix: fix DoclingLoader input params
This commit is contained in:
parent
2419ef06a0
commit
a44b35e99e
@ -126,24 +126,43 @@ class DoclingLoader:
|
|||||||
raise ValueError("File path is required for DoclingLoader")
|
raise ValueError("File path is required for DoclingLoader")
|
||||||
|
|
||||||
with open(self.file_path, "rb") as f:
|
with open(self.file_path, "rb") as f:
|
||||||
files = {"files": (self.file_path, f, self.mime_type or "application/octet-stream")}
|
files = {
|
||||||
|
"files": (
|
||||||
|
self.file_path,
|
||||||
|
f,
|
||||||
|
self.mime_type or "application/octet-stream",
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
params = {
|
params = {
|
||||||
"from_formats": ["docx", "pptx", "html", "xml_pubmed", "image", "pdf", "asciidoc", "md", "xlsx", "xml_uspto", "json_docling"],
|
"from_formats": [
|
||||||
"to_formats": ["md"],
|
"docx",
|
||||||
"image_export_mode": "placeholder",
|
"pptx",
|
||||||
"do_ocr": True,
|
"html",
|
||||||
"force_ocr": False,
|
"image",
|
||||||
"ocr_engine": "easyocr",
|
"pdf",
|
||||||
"ocr_lang": None,
|
"asciidoc",
|
||||||
"pdf_backend": "dlparse_v2",
|
"md",
|
||||||
"table_mode": "fast",
|
"csv",
|
||||||
"abort_on_error": False,
|
"xlsx",
|
||||||
"return_as_file": False,
|
"xml_uspto",
|
||||||
"do_table_structure": True,
|
"xml_jats",
|
||||||
"include_images": True,
|
"json_docling",
|
||||||
"images_scale": 2.0,
|
],
|
||||||
}
|
"to_formats": ["md"],
|
||||||
|
"image_export_mode": "placeholder",
|
||||||
|
"do_ocr": True,
|
||||||
|
"force_ocr": False,
|
||||||
|
"ocr_engine": "easyocr",
|
||||||
|
"ocr_lang": None,
|
||||||
|
"pdf_backend": "dlparse_v2",
|
||||||
|
"table_mode": "accurate",
|
||||||
|
"abort_on_error": False,
|
||||||
|
"return_as_file": False,
|
||||||
|
"do_table_structure": True,
|
||||||
|
"include_images": True,
|
||||||
|
"images_scale": 2.0,
|
||||||
|
}
|
||||||
|
|
||||||
endpoint = f"{self.url}/v1alpha/convert/file"
|
endpoint = f"{self.url}/v1alpha/convert/file"
|
||||||
response = requests.post(endpoint, files=files, data=params)
|
response = requests.post(endpoint, files=files, data=params)
|
||||||
@ -154,7 +173,7 @@ class DoclingLoader:
|
|||||||
text = document_data.get("md_content", "<No text content found>")
|
text = document_data.get("md_content", "<No text content found>")
|
||||||
|
|
||||||
metadata = {"Content-Type": self.mime_type} if self.mime_type else {}
|
metadata = {"Content-Type": self.mime_type} if self.mime_type else {}
|
||||||
|
|
||||||
log.debug("Docling extracted text: %s", text)
|
log.debug("Docling extracted text: %s", text)
|
||||||
|
|
||||||
return [Document(page_content=text, metadata=metadata)]
|
return [Document(page_content=text, metadata=metadata)]
|
||||||
|
Loading…
Reference in New Issue
Block a user