From 8644e81a1c352c127d3e1decef6fdc389c6f81dc Mon Sep 17 00:00:00 2001 From: Vaclav Cerny Date: Wed, 4 Jun 2025 12:34:39 +0200 Subject: [PATCH 1/4] feat(loader): add picture description configuration for DoclingLoader --- backend/open_webui/retrieval/loaders/main.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/backend/open_webui/retrieval/loaders/main.py b/backend/open_webui/retrieval/loaders/main.py index 0c7daf905..c4730ea70 100644 --- a/backend/open_webui/retrieval/loaders/main.py +++ b/backend/open_webui/retrieval/loaders/main.py @@ -294,6 +294,12 @@ class Loader: "do_picture_classification": self.kwargs.get( "DOCLING_DO_PICTURE_DESCRIPTION" ), + "picture_description_local": ( + '{\n' + ' "repo_id": "HuggingFaceTB/SmolVLM-256M-Instruct",\n' + ' "prompt": "Describe the image in detail, including objects, actions, and connections. Use a descriptive and informative style."\n' + '}' + ) }, ) elif ( From c4278f4784e31a0f0819478292864e5111faf452 Mon Sep 17 00:00:00 2001 From: Vaclav Cerny Date: Wed, 4 Jun 2025 14:13:00 +0200 Subject: [PATCH 2/4] fix description vs classification mismatch --- backend/open_webui/retrieval/loaders/main.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/backend/open_webui/retrieval/loaders/main.py b/backend/open_webui/retrieval/loaders/main.py index c4730ea70..817d9ddbc 100644 --- a/backend/open_webui/retrieval/loaders/main.py +++ b/backend/open_webui/retrieval/loaders/main.py @@ -148,13 +148,13 @@ class DoclingLoader: params = { "image_export_mode": "placeholder", - "table_mode": "accurate", + "table_mode": "accurate" } if self.params: - if self.params.get("do_picture_classification"): - params["do_picture_classification"] = self.params.get( - "do_picture_classification" + if self.params.get("do_picture_description"): + params["do_picture_description"] = self.params.get( + "do_picture_description" ) if self.params.get("ocr_engine") and self.params.get("ocr_lang"): @@ -291,7 +291,7 @@ class Loader: params={ "ocr_engine": self.kwargs.get("DOCLING_OCR_ENGINE"), "ocr_lang": self.kwargs.get("DOCLING_OCR_LANG"), - "do_picture_classification": self.kwargs.get( + "do_picture_description": self.kwargs.get( "DOCLING_DO_PICTURE_DESCRIPTION" ), "picture_description_local": ( From c71236ba07fbed0abeb132b5d1ba65cadf050079 Mon Sep 17 00:00:00 2001 From: Vaclav Cerny Date: Wed, 4 Jun 2025 14:25:31 +0200 Subject: [PATCH 3/4] feat(loader): enhance picture description prompt for improved detail and clarity --- backend/open_webui/retrieval/loaders/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/open_webui/retrieval/loaders/main.py b/backend/open_webui/retrieval/loaders/main.py index 817d9ddbc..0dc530c96 100644 --- a/backend/open_webui/retrieval/loaders/main.py +++ b/backend/open_webui/retrieval/loaders/main.py @@ -297,7 +297,7 @@ class Loader: "picture_description_local": ( '{\n' ' "repo_id": "HuggingFaceTB/SmolVLM-256M-Instruct",\n' - ' "prompt": "Describe the image in detail, including objects, actions, and connections. Use a descriptive and informative style."\n' + ' "prompt": "Analyze the image and provide a comprehensive, detailed description. Identify all visible objects, their attributes, actions taking place, spatial relationships, and any contextual or inferred connections. Use clear, structured, and informative language suitable for downstream retrieval or knowledge extraction tasks."\n' '}' ) }, From 9772c18b20d826396b3e600452726f4aafc64b7c Mon Sep 17 00:00:00 2001 From: Vaclav Cerny Date: Wed, 4 Jun 2025 17:21:44 +0200 Subject: [PATCH 4/4] fix(loader): remove deprecated picture description configuration --- backend/open_webui/retrieval/loaders/main.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/backend/open_webui/retrieval/loaders/main.py b/backend/open_webui/retrieval/loaders/main.py index 0dc530c96..73e061f75 100644 --- a/backend/open_webui/retrieval/loaders/main.py +++ b/backend/open_webui/retrieval/loaders/main.py @@ -293,14 +293,8 @@ class Loader: "ocr_lang": self.kwargs.get("DOCLING_OCR_LANG"), "do_picture_description": self.kwargs.get( "DOCLING_DO_PICTURE_DESCRIPTION" - ), - "picture_description_local": ( - '{\n' - ' "repo_id": "HuggingFaceTB/SmolVLM-256M-Instruct",\n' - ' "prompt": "Analyze the image and provide a comprehensive, detailed description. Identify all visible objects, their attributes, actions taking place, spatial relationships, and any contextual or inferred connections. Use clear, structured, and informative language suitable for downstream retrieval or knowledge extraction tasks."\n' - '}' ) - }, + } ) elif ( self.engine == "document_intelligence"