diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index ab5bd0355..e902f4324 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -1968,6 +1968,55 @@ DOCLING_DO_PICTURE_DESCRIPTION = PersistentConfig( os.getenv("DOCLING_DO_PICTURE_DESCRIPTION", "False").lower() == "true", ) +# Picture description settings: DoclingLoader reads the mode ("local" or "api") and serializes the matching values below into its request params. +DOCLING_PICTURE_DESCRIPTION_MODE = PersistentConfig( + "DOCLING_PICTURE_DESCRIPTION_MODE", + "rag.docling_picture_description_mode", + os.getenv("DOCLING_PICTURE_DESCRIPTION_MODE", ""), +) + +DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID = PersistentConfig( + "DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID", + "rag.docling_picture_description_local_repo_id", + os.getenv("DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID", "HuggingFaceTB/SmolVLM-256M-Instruct"), +) + +DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS = PersistentConfig( + "DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS", + "rag.docling_picture_description_local_max_tokens", + int(os.getenv("DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS", "200")), +) + +DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT = PersistentConfig( + "DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT", + "rag.docling_picture_description_local_prompt", + os.getenv( + "DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT", + "Describe this image in a few sentences.", + ), +) + +DOCLING_PICTURE_DESCRIPTION_API_URL = PersistentConfig( + "DOCLING_PICTURE_DESCRIPTION_API_URL", + "rag.docling_picture_description_api_url", + os.getenv("DOCLING_PICTURE_DESCRIPTION_API_URL", ""), +) + +DOCLING_PICTURE_DESCRIPTION_API_MODEL = PersistentConfig( + "DOCLING_PICTURE_DESCRIPTION_API_MODEL", + "rag.docling_picture_description_api_model", + os.getenv("DOCLING_PICTURE_DESCRIPTION_API_MODEL", ""), +) + +DOCLING_PICTURE_DESCRIPTION_API_PROMPT = PersistentConfig( + "DOCLING_PICTURE_DESCRIPTION_API_PROMPT", + "rag.docling_picture_description_api_prompt", + os.getenv( + "DOCLING_PICTURE_DESCRIPTION_API_PROMPT", + "Describe this image in a few sentences.", + ), +) + 
DOCUMENT_INTELLIGENCE_ENDPOINT = PersistentConfig( "DOCUMENT_INTELLIGENCE_ENDPOINT", "rag.document_intelligence_endpoint", diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index 02d5b0d01..74acdefde 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -231,6 +231,13 @@ from open_webui.config import ( DOCLING_OCR_ENGINE, DOCLING_OCR_LANG, DOCLING_DO_PICTURE_DESCRIPTION, + DOCLING_PICTURE_DESCRIPTION_MODE, + DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID, + DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS, + DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT, + DOCLING_PICTURE_DESCRIPTION_API_URL, + DOCLING_PICTURE_DESCRIPTION_API_MODEL, + DOCLING_PICTURE_DESCRIPTION_API_PROMPT, DOCUMENT_INTELLIGENCE_ENDPOINT, DOCUMENT_INTELLIGENCE_KEY, MISTRAL_OCR_API_KEY, @@ -701,6 +708,13 @@ app.state.config.DOCLING_SERVER_URL = DOCLING_SERVER_URL app.state.config.DOCLING_OCR_ENGINE = DOCLING_OCR_ENGINE app.state.config.DOCLING_OCR_LANG = DOCLING_OCR_LANG app.state.config.DOCLING_DO_PICTURE_DESCRIPTION = DOCLING_DO_PICTURE_DESCRIPTION +app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE = DOCLING_PICTURE_DESCRIPTION_MODE +app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID = DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID +app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS = DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS +app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT = DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT +app.state.config.DOCLING_PICTURE_DESCRIPTION_API_URL = DOCLING_PICTURE_DESCRIPTION_API_URL +app.state.config.DOCLING_PICTURE_DESCRIPTION_API_MODEL = DOCLING_PICTURE_DESCRIPTION_API_MODEL +app.state.config.DOCLING_PICTURE_DESCRIPTION_API_PROMPT = DOCLING_PICTURE_DESCRIPTION_API_PROMPT app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY app.state.config.MISTRAL_OCR_API_KEY = MISTRAL_OCR_API_KEY diff --git 
a/backend/open_webui/retrieval/loaders/main.py b/backend/open_webui/retrieval/loaders/main.py index 103a9dc93..919d43da0 100644 --- a/backend/open_webui/retrieval/loaders/main.py +++ b/backend/open_webui/retrieval/loaders/main.py @@ -2,6 +2,7 @@ import requests import logging import ftfy import sys +import json from langchain_community.document_loaders import ( AzureAIDocumentIntelligenceLoader, @@ -154,6 +155,41 @@ class DoclingLoader: "do_picture_description" ) + # Loader always passes this key, so an unset mode arrives as None and a .get() default would never apply. + picture_description_mode = (self.params.get("picture_description_mode") or "").lower() + + if picture_description_mode == "local": + params["picture_description_local"] = json.dumps({ + "repo_id": self.params.get( + "picture_description_local_repo_id", "HuggingFaceTB/SmolVLM-256M-Instruct" + ), + "generation_config": { + "max_new_tokens": self.params.get( + "picture_description_local_max_tokens", 200 + ) + }, + "prompt": self.params.get( + "picture_description_local_prompt", "Describe this image in a few sentences." + ) + }) + + elif picture_description_mode == "api": + + params["picture_description_api"] = json.dumps({ + "url": self.params.get( + "picture_description_api_url", "" + ), + "params": { + "model": self.params.get( + "picture_description_api_model", "" + ) + }, + "timeout": 30, + "prompt": self.params.get( + "picture_description_api_prompt", "Describe this image in a few sentences." 
+ ) + }) + if self.params.get("ocr_engine") and self.params.get("ocr_lang"): params["ocr_engine"] = self.params.get("ocr_engine") params["ocr_lang"] = [ @@ -281,17 +317,25 @@ class Loader: if self._is_text_file(file_ext, file_content_type): loader = TextLoader(file_path, autodetect_encoding=True) else: + # Build params for DoclingLoader + params = { + "ocr_engine": self.kwargs.get("DOCLING_OCR_ENGINE"), + "ocr_lang": self.kwargs.get("DOCLING_OCR_LANG"), + "do_picture_description": self.kwargs.get("DOCLING_DO_PICTURE_DESCRIPTION"), + "picture_description_mode": self.kwargs.get("DOCLING_PICTURE_DESCRIPTION_MODE"), + "picture_description_local_repo_id": self.kwargs.get("DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID"), + "picture_description_local_max_tokens": self.kwargs.get("DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS"), + "picture_description_local_prompt": self.kwargs.get("DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT"), + "picture_description_api_url": self.kwargs.get("DOCLING_PICTURE_DESCRIPTION_API_URL"), + "picture_description_api_model": self.kwargs.get("DOCLING_PICTURE_DESCRIPTION_API_MODEL"), + "picture_description_api_prompt": self.kwargs.get("DOCLING_PICTURE_DESCRIPTION_API_PROMPT") + } + loader = DoclingLoader( url=self.kwargs.get("DOCLING_SERVER_URL"), file_path=file_path, mime_type=file_content_type, - params={ - "ocr_engine": self.kwargs.get("DOCLING_OCR_ENGINE"), - "ocr_lang": self.kwargs.get("DOCLING_OCR_LANG"), - "do_picture_description": self.kwargs.get( - "DOCLING_DO_PICTURE_DESCRIPTION" - ), - }, + params=params ) elif ( self.engine == "document_intelligence" diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index 22b264bfa..af71bdead 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -414,6 +414,13 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)): "DOCLING_OCR_ENGINE": request.app.state.config.DOCLING_OCR_ENGINE, "DOCLING_OCR_LANG": 
request.app.state.config.DOCLING_OCR_LANG, "DOCLING_DO_PICTURE_DESCRIPTION": request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION, + "DOCLING_PICTURE_DESCRIPTION_MODE": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE, + "DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID, + "DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS, + "DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT, + "DOCLING_PICTURE_DESCRIPTION_API_URL": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_URL, + "DOCLING_PICTURE_DESCRIPTION_API_MODEL": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_MODEL, + "DOCLING_PICTURE_DESCRIPTION_API_PROMPT": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_PROMPT, "DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, "DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, "MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY, @@ -575,6 +582,13 @@ class ConfigForm(BaseModel): DOCLING_OCR_ENGINE: Optional[str] = None DOCLING_OCR_LANG: Optional[str] = None DOCLING_DO_PICTURE_DESCRIPTION: Optional[bool] = None + DOCLING_PICTURE_DESCRIPTION_MODE: Optional[str] = None + DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID: Optional[str] = None + DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS: Optional[int] = None + DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT: Optional[str] = None + DOCLING_PICTURE_DESCRIPTION_API_URL: Optional[str] = None + DOCLING_PICTURE_DESCRIPTION_API_MODEL: Optional[str] = None + DOCLING_PICTURE_DESCRIPTION_API_PROMPT: Optional[str] = None DOCUMENT_INTELLIGENCE_ENDPOINT: Optional[str] = None DOCUMENT_INTELLIGENCE_KEY: Optional[str] = None MISTRAL_OCR_API_KEY: Optional[str] = None @@ -748,6 +762,42 @@ async def update_rag_config( else 
request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION ) + request.app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE = ( + form_data.DOCLING_PICTURE_DESCRIPTION_MODE + if form_data.DOCLING_PICTURE_DESCRIPTION_MODE is not None + else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE + ) + request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID = ( + form_data.DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID + if form_data.DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID is not None + else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID + ) + request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS = ( + form_data.DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS + if form_data.DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS is not None + else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS + ) + request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT = ( + form_data.DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT + if form_data.DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT is not None + else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT + ) + request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_URL = ( + form_data.DOCLING_PICTURE_DESCRIPTION_API_URL + if form_data.DOCLING_PICTURE_DESCRIPTION_API_URL is not None + else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_URL + ) + request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_MODEL = ( + form_data.DOCLING_PICTURE_DESCRIPTION_API_MODEL + if form_data.DOCLING_PICTURE_DESCRIPTION_API_MODEL is not None + else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_MODEL + ) + request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_PROMPT = ( + form_data.DOCLING_PICTURE_DESCRIPTION_API_PROMPT + if form_data.DOCLING_PICTURE_DESCRIPTION_API_PROMPT is not None + else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_PROMPT + ) + request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = ( form_data.DOCUMENT_INTELLIGENCE_ENDPOINT 
if form_data.DOCUMENT_INTELLIGENCE_ENDPOINT is not None @@ -985,6 +1035,13 @@ async def update_rag_config( "DOCLING_OCR_ENGINE": request.app.state.config.DOCLING_OCR_ENGINE, "DOCLING_OCR_LANG": request.app.state.config.DOCLING_OCR_LANG, "DOCLING_DO_PICTURE_DESCRIPTION": request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION, + "DOCLING_PICTURE_DESCRIPTION_MODE": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE, + "DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID, + "DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS, + "DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT, + "DOCLING_PICTURE_DESCRIPTION_API_URL": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_URL, + "DOCLING_PICTURE_DESCRIPTION_API_MODEL": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_MODEL, + "DOCLING_PICTURE_DESCRIPTION_API_PROMPT": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_PROMPT, "DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, "DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, "MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY, @@ -1334,6 +1391,13 @@ def process_file( DOCLING_OCR_ENGINE=request.app.state.config.DOCLING_OCR_ENGINE, DOCLING_OCR_LANG=request.app.state.config.DOCLING_OCR_LANG, DOCLING_DO_PICTURE_DESCRIPTION=request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION, + DOCLING_PICTURE_DESCRIPTION_MODE=request.app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE, + DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID=request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID, + DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS=request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS, + 
DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT=request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT, + DOCLING_PICTURE_DESCRIPTION_API_URL=request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_URL, + DOCLING_PICTURE_DESCRIPTION_API_MODEL=request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_MODEL, + DOCLING_PICTURE_DESCRIPTION_API_PROMPT=request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_PROMPT, PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES, DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, diff --git a/src/lib/components/admin/Settings/Documents.svelte b/src/lib/components/admin/Settings/Documents.svelte index 5572be4fe..081ee7aeb 100644 --- a/src/lib/components/admin/Settings/Documents.svelte +++ b/src/lib/components/admin/Settings/Documents.svelte @@ -510,6 +510,140 @@ + {#if RAGConfig.DOCLING_DO_PICTURE_DESCRIPTION} +