fix: handle json output format correctly

This commit is contained in:
Hisma 2025-05-27 01:12:03 -04:00
parent a9405cc101
commit e12a79c0e2

View File

@@ -92,13 +92,7 @@ class DatalabMarkerLoader:
"output_format": self.output_format,
}
request_params = {
"filename": filename,
"mime_type": mime_type,
**form_data,
}
log.info(f"Datalab Marker POST request parameters: {request_params}")
log.info(f"Datalab Marker POST request parameters: {{'filename': '{filename}', 'mime_type': '{mime_type}', **{form_data}}}")
try:
with open(self.file_path, "rb") as f:
@@ -138,7 +132,6 @@ class DatalabMarkerLoader:
success_val = poll_result.get("success")
if status_val == "complete":
# Log key details
summary = {
k: poll_result.get(k)
for k in ("status", "output_format", "success", "error", "page_count", "total_cost")
@@ -158,11 +151,18 @@ class DatalabMarkerLoader:
raise HTTPException(status.HTTP_400_BAD_REQUEST, detail=f"Final processing failed: {error_msg}")
content_key = self.output_format.lower()
full_text = poll_result.get(content_key, "").strip()
raw_content = poll_result.get(content_key)
if content_key == "json":
full_text = json.dumps(raw_content, indent=2)
elif content_key in {"markdown", "html"}:
full_text = str(raw_content).strip()
else:
raise HTTPException(status.HTTP_400_BAD_REQUEST, detail=f"Unsupported output format: {self.output_format}")
if not full_text:
raise HTTPException(status.HTTP_400_BAD_REQUEST, detail="Datalab Marker returned empty content")
# Write output to uploads/marker_output
marker_output_dir = os.path.join("/app/backend/data/uploads", "marker_output")
os.makedirs(marker_output_dir, exist_ok=True)
@@ -197,4 +197,4 @@ class DatalabMarkerLoader:
elif v is None:
metadata[k] = ""
return [Document(page_content=full_text, metadata=metadata)]
return [Document(page_content=full_text, metadata=metadata)]