feat: docling do picture description support

This commit is contained in:
Timothy Jaeryang Baek 2025-05-14 21:26:49 +04:00
parent 6e8481e157
commit de70d0cb64
5 changed files with 70 additions and 29 deletions

View File

@ -1849,6 +1849,12 @@ DOCLING_OCR_LANG = PersistentConfig(
os.getenv("DOCLING_OCR_LANG", "eng,fra,deu,spa"), os.getenv("DOCLING_OCR_LANG", "eng,fra,deu,spa"),
) )
DOCLING_DO_PICTURE_DESCRIPTION = PersistentConfig(
"DOCLING_DO_PICTURE_DESCRIPTION",
"rag.docling_do_picture_description",
os.getenv("DOCLING_DO_PICTURE_DESCRIPTION", "False").lower() == "true",
)
DOCUMENT_INTELLIGENCE_ENDPOINT = PersistentConfig( DOCUMENT_INTELLIGENCE_ENDPOINT = PersistentConfig(
"DOCUMENT_INTELLIGENCE_ENDPOINT", "DOCUMENT_INTELLIGENCE_ENDPOINT",
"rag.document_intelligence_endpoint", "rag.document_intelligence_endpoint",

View File

@ -210,6 +210,7 @@ from open_webui.config import (
DOCLING_SERVER_URL, DOCLING_SERVER_URL,
DOCLING_OCR_ENGINE, DOCLING_OCR_ENGINE,
DOCLING_OCR_LANG, DOCLING_OCR_LANG,
DOCLING_DO_PICTURE_DESCRIPTION,
DOCUMENT_INTELLIGENCE_ENDPOINT, DOCUMENT_INTELLIGENCE_ENDPOINT,
DOCUMENT_INTELLIGENCE_KEY, DOCUMENT_INTELLIGENCE_KEY,
MISTRAL_OCR_API_KEY, MISTRAL_OCR_API_KEY,
@ -645,6 +646,7 @@ app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL
app.state.config.DOCLING_SERVER_URL = DOCLING_SERVER_URL app.state.config.DOCLING_SERVER_URL = DOCLING_SERVER_URL
app.state.config.DOCLING_OCR_ENGINE = DOCLING_OCR_ENGINE app.state.config.DOCLING_OCR_ENGINE = DOCLING_OCR_ENGINE
app.state.config.DOCLING_OCR_LANG = DOCLING_OCR_LANG app.state.config.DOCLING_OCR_LANG = DOCLING_OCR_LANG
app.state.config.DOCLING_DO_PICTURE_DESCRIPTION = DOCLING_DO_PICTURE_DESCRIPTION
app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT
app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY
app.state.config.MISTRAL_OCR_API_KEY = MISTRAL_OCR_API_KEY app.state.config.MISTRAL_OCR_API_KEY = MISTRAL_OCR_API_KEY

View File

@ -126,14 +126,12 @@ class TikaLoader:
class DoclingLoader: class DoclingLoader:
def __init__( def __init__(self, url, file_path=None, mime_type=None, params=None):
self, url, file_path=None, mime_type=None, ocr_engine=None, ocr_lang=None
):
self.url = url.rstrip("/") self.url = url.rstrip("/")
self.file_path = file_path self.file_path = file_path
self.mime_type = mime_type self.mime_type = mime_type
self.ocr_engine = ocr_engine
self.ocr_lang = ocr_lang self.params = params or {}
def load(self) -> list[Document]: def load(self) -> list[Document]:
with open(self.file_path, "rb") as f: with open(self.file_path, "rb") as f:
@ -150,10 +148,18 @@ class DoclingLoader:
"table_mode": "accurate", "table_mode": "accurate",
} }
if self.ocr_engine and self.ocr_lang: if self.params:
params["ocr_engine"] = self.ocr_engine if self.params.get("do_picture_classification"):
params["do_picture_classification"] = self.params.get(
"do_picture_classification"
)
if self.params.get("ocr_engine") and self.params.get("ocr_lang"):
params["ocr_engine"] = self.params.get("ocr_engine")
params["ocr_lang"] = [ params["ocr_lang"] = [
lang.strip() for lang in self.ocr_lang.split(",") if lang.strip() lang.strip()
for lang in self.params.get("ocr_lang").split(",")
if lang.strip()
] ]
endpoint = f"{self.url}/v1alpha/convert/file" endpoint = f"{self.url}/v1alpha/convert/file"
@ -225,8 +231,13 @@ class Loader:
url=self.kwargs.get("DOCLING_SERVER_URL"), url=self.kwargs.get("DOCLING_SERVER_URL"),
file_path=file_path, file_path=file_path,
mime_type=file_content_type, mime_type=file_content_type,
ocr_engine=self.kwargs.get("DOCLING_OCR_ENGINE"), params={
ocr_lang=self.kwargs.get("DOCLING_OCR_LANG"), "ocr_engine": self.kwargs.get("DOCLING_OCR_ENGINE"),
"ocr_lang": self.kwargs.get("DOCLING_OCR_LANG"),
"do_picture_classification": self.kwargs.get(
"DOCLING_DO_PICTURE_DESCRIPTION"
),
},
) )
elif ( elif (
self.engine == "document_intelligence" self.engine == "document_intelligence"

View File

@ -356,6 +356,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
"DOCLING_SERVER_URL": request.app.state.config.DOCLING_SERVER_URL, "DOCLING_SERVER_URL": request.app.state.config.DOCLING_SERVER_URL,
"DOCLING_OCR_ENGINE": request.app.state.config.DOCLING_OCR_ENGINE, "DOCLING_OCR_ENGINE": request.app.state.config.DOCLING_OCR_ENGINE,
"DOCLING_OCR_LANG": request.app.state.config.DOCLING_OCR_LANG, "DOCLING_OCR_LANG": request.app.state.config.DOCLING_OCR_LANG,
"DOCLING_DO_PICTURE_DESCRIPTION": request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION,
"DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, "DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
"DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, "DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
"MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY, "MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY,
@ -496,6 +497,7 @@ class ConfigForm(BaseModel):
DOCLING_SERVER_URL: Optional[str] = None DOCLING_SERVER_URL: Optional[str] = None
DOCLING_OCR_ENGINE: Optional[str] = None DOCLING_OCR_ENGINE: Optional[str] = None
DOCLING_OCR_LANG: Optional[str] = None DOCLING_OCR_LANG: Optional[str] = None
DOCLING_DO_PICTURE_DESCRIPTION: Optional[bool] = None
DOCUMENT_INTELLIGENCE_ENDPOINT: Optional[str] = None DOCUMENT_INTELLIGENCE_ENDPOINT: Optional[str] = None
DOCUMENT_INTELLIGENCE_KEY: Optional[str] = None DOCUMENT_INTELLIGENCE_KEY: Optional[str] = None
MISTRAL_OCR_API_KEY: Optional[str] = None MISTRAL_OCR_API_KEY: Optional[str] = None
@ -601,6 +603,13 @@ async def update_rag_config(
if form_data.DOCLING_OCR_LANG is not None if form_data.DOCLING_OCR_LANG is not None
else request.app.state.config.DOCLING_OCR_LANG else request.app.state.config.DOCLING_OCR_LANG
) )
request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION = (
form_data.DOCLING_DO_PICTURE_DESCRIPTION
if form_data.DOCLING_DO_PICTURE_DESCRIPTION is not None
else request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION
)
request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = ( request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = (
form_data.DOCUMENT_INTELLIGENCE_ENDPOINT form_data.DOCUMENT_INTELLIGENCE_ENDPOINT
if form_data.DOCUMENT_INTELLIGENCE_ENDPOINT is not None if form_data.DOCUMENT_INTELLIGENCE_ENDPOINT is not None
@ -813,6 +822,7 @@ async def update_rag_config(
"DOCLING_SERVER_URL": request.app.state.config.DOCLING_SERVER_URL, "DOCLING_SERVER_URL": request.app.state.config.DOCLING_SERVER_URL,
"DOCLING_OCR_ENGINE": request.app.state.config.DOCLING_OCR_ENGINE, "DOCLING_OCR_ENGINE": request.app.state.config.DOCLING_OCR_ENGINE,
"DOCLING_OCR_LANG": request.app.state.config.DOCLING_OCR_LANG, "DOCLING_OCR_LANG": request.app.state.config.DOCLING_OCR_LANG,
"DOCLING_DO_PICTURE_DESCRIPTION": request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION,
"DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, "DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
"DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, "DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
"MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY, "MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY,
@ -1133,6 +1143,7 @@ def process_file(
DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL, DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL,
DOCLING_OCR_ENGINE=request.app.state.config.DOCLING_OCR_ENGINE, DOCLING_OCR_ENGINE=request.app.state.config.DOCLING_OCR_ENGINE,
DOCLING_OCR_LANG=request.app.state.config.DOCLING_OCR_LANG, DOCLING_OCR_LANG=request.app.state.config.DOCLING_OCR_LANG,
DOCLING_DO_PICTURE_DESCRIPTION=request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION,
PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES, PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES,
DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,

View File

@ -246,7 +246,7 @@
<hr class=" border-gray-100 dark:border-gray-850 my-2" /> <hr class=" border-gray-100 dark:border-gray-850 my-2" />
<div class="mb-2.5 flex flex-col w-full justify-between"> <div class="mb-2.5 flex flex-col w-full justify-between">
<div class="flex w-full justify-between"> <div class="flex w-full justify-between mb-1">
<div class="self-center text-xs font-medium"> <div class="self-center text-xs font-medium">
{$i18n.t('Content Extraction Engine')} {$i18n.t('Content Extraction Engine')}
</div> </div>
@ -279,7 +279,7 @@
<div class="flex w-full mt-1"> <div class="flex w-full mt-1">
<div class="flex-1 mr-2"> <div class="flex-1 mr-2">
<input <input
class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden" class="flex-1 w-full text-sm bg-transparent outline-hidden"
placeholder={$i18n.t('Enter Tika Server URL')} placeholder={$i18n.t('Enter Tika Server URL')}
bind:value={RAGConfig.TIKA_SERVER_URL} bind:value={RAGConfig.TIKA_SERVER_URL}
/> />
@ -288,27 +288,38 @@
{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'docling'} {:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'docling'}
<div class="flex w-full mt-1"> <div class="flex w-full mt-1">
<input <input
class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden" class="flex-1 w-full text-sm bg-transparent outline-hidden"
placeholder={$i18n.t('Enter Docling Server URL')} placeholder={$i18n.t('Enter Docling Server URL')}
bind:value={RAGConfig.DOCLING_SERVER_URL} bind:value={RAGConfig.DOCLING_SERVER_URL}
/> />
</div> </div>
<div class="flex w-full mt-2"> <div class="flex w-full mt-2">
<input <input
class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden" class="flex-1 w-full text-sm bg-transparent outline-hidden"
placeholder={$i18n.t('Enter Docling OCR Engine')} placeholder={$i18n.t('Enter Docling OCR Engine')}
bind:value={RAGConfig.DOCLING_OCR_ENGINE} bind:value={RAGConfig.DOCLING_OCR_ENGINE}
/> />
<input <input
class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden" class="flex-1 w-full text-sm bg-transparent outline-hidden"
placeholder={$i18n.t('Enter Docling OCR Language(s)')} placeholder={$i18n.t('Enter Docling OCR Language(s)')}
bind:value={RAGConfig.DOCLING_OCR_LANG} bind:value={RAGConfig.DOCLING_OCR_LANG}
/> />
</div> </div>
<div class="flex w-full mt-2">
<div class="flex-1 flex justify-between">
<div class=" self-center text-xs font-medium">
{$i18n.t('Describe Pictures in Documents')}
</div>
<div class="flex items-center relative">
<Switch bind:state={RAGConfig.DOCLING_DO_PICTURE_DESCRIPTION} />
</div>
</div>
</div>
{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'document_intelligence'} {:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'document_intelligence'}
<div class="my-0.5 flex gap-2 pr-2"> <div class="my-0.5 flex gap-2 pr-2">
<input <input
class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden" class="flex-1 w-full text-sm bg-transparent outline-hidden"
placeholder={$i18n.t('Enter Document Intelligence Endpoint')} placeholder={$i18n.t('Enter Document Intelligence Endpoint')}
bind:value={RAGConfig.DOCUMENT_INTELLIGENCE_ENDPOINT} bind:value={RAGConfig.DOCUMENT_INTELLIGENCE_ENDPOINT}
/> />
@ -437,7 +448,7 @@
{#if embeddingEngine === 'openai'} {#if embeddingEngine === 'openai'}
<div class="my-0.5 flex gap-2 pr-2"> <div class="my-0.5 flex gap-2 pr-2">
<input <input
class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden" class="flex-1 w-full text-sm bg-transparent outline-hidden"
placeholder={$i18n.t('API Base URL')} placeholder={$i18n.t('API Base URL')}
bind:value={OpenAIUrl} bind:value={OpenAIUrl}
required required
@ -448,7 +459,7 @@
{:else if embeddingEngine === 'ollama'} {:else if embeddingEngine === 'ollama'}
<div class="my-0.5 flex gap-2 pr-2"> <div class="my-0.5 flex gap-2 pr-2">
<input <input
class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden" class="flex-1 w-full text-sm bg-transparent outline-hidden"
placeholder={$i18n.t('API Base URL')} placeholder={$i18n.t('API Base URL')}
bind:value={OllamaUrl} bind:value={OllamaUrl}
required required
@ -471,7 +482,7 @@
<div class="flex w-full"> <div class="flex w-full">
<div class="flex-1 mr-2"> <div class="flex-1 mr-2">
<input <input
class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden" class="flex-1 w-full text-sm bg-transparent outline-hidden"
bind:value={embeddingModel} bind:value={embeddingModel}
placeholder={$i18n.t('Set embedding model')} placeholder={$i18n.t('Set embedding model')}
required required
@ -482,7 +493,7 @@
<div class="flex w-full"> <div class="flex w-full">
<div class="flex-1 mr-2"> <div class="flex-1 mr-2">
<input <input
class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden" class="flex-1 w-full text-sm bg-transparent outline-hidden"
placeholder={$i18n.t('Set embedding model (e.g. {{model}})', { placeholder={$i18n.t('Set embedding model (e.g. {{model}})', {
model: embeddingModel.slice(-40) model: embeddingModel.slice(-40)
})} })}
@ -639,7 +650,7 @@
{#if RAGConfig.RAG_RERANKING_ENGINE === 'external'} {#if RAGConfig.RAG_RERANKING_ENGINE === 'external'}
<div class="my-0.5 flex gap-2 pr-2"> <div class="my-0.5 flex gap-2 pr-2">
<input <input
class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden" class="flex-1 w-full text-sm bg-transparent outline-hidden"
placeholder={$i18n.t('API Base URL')} placeholder={$i18n.t('API Base URL')}
bind:value={RAGConfig.RAG_EXTERNAL_RERANKER_URL} bind:value={RAGConfig.RAG_EXTERNAL_RERANKER_URL}
required required
@ -661,7 +672,7 @@
<div class="flex w-full"> <div class="flex w-full">
<div class="flex-1 mr-2"> <div class="flex-1 mr-2">
<input <input
class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden" class="flex-1 w-full text-sm bg-transparent outline-hidden"
placeholder={$i18n.t('Set reranking model (e.g. {{model}})', { placeholder={$i18n.t('Set reranking model (e.g. {{model}})', {
model: 'BAAI/bge-reranker-v2-m3' model: 'BAAI/bge-reranker-v2-m3'
})} })}
@ -677,7 +688,7 @@
<div class=" self-center text-xs font-medium">{$i18n.t('Top K')}</div> <div class=" self-center text-xs font-medium">{$i18n.t('Top K')}</div>
<div class="flex items-center relative"> <div class="flex items-center relative">
<input <input
class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden" class="flex-1 w-full text-sm bg-transparent outline-hidden"
type="number" type="number"
placeholder={$i18n.t('Enter Top K')} placeholder={$i18n.t('Enter Top K')}
bind:value={RAGConfig.TOP_K} bind:value={RAGConfig.TOP_K}
@ -692,7 +703,7 @@
<div class="self-center text-xs font-medium">{$i18n.t('Top K Reranker')}</div> <div class="self-center text-xs font-medium">{$i18n.t('Top K Reranker')}</div>
<div class="flex items-center relative"> <div class="flex items-center relative">
<input <input
class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden" class="flex-1 w-full text-sm bg-transparent outline-hidden"
type="number" type="number"
placeholder={$i18n.t('Enter Top K Reranker')} placeholder={$i18n.t('Enter Top K Reranker')}
bind:value={RAGConfig.TOP_K_RERANKER} bind:value={RAGConfig.TOP_K_RERANKER}
@ -711,7 +722,7 @@
</div> </div>
<div class="flex items-center relative"> <div class="flex items-center relative">
<input <input
class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden" class="flex-1 w-full text-sm bg-transparent outline-hidden"
type="number" type="number"
step="0.01" step="0.01"
placeholder={$i18n.t('Enter Score')} placeholder={$i18n.t('Enter Score')}
@ -770,7 +781,7 @@
placement="top-start" placement="top-start"
> >
<input <input
class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden" class="flex-1 w-full text-sm bg-transparent outline-hidden"
type="number" type="number"
placeholder={$i18n.t('Leave empty for unlimited')} placeholder={$i18n.t('Leave empty for unlimited')}
bind:value={RAGConfig.FILE_MAX_SIZE} bind:value={RAGConfig.FILE_MAX_SIZE}
@ -791,7 +802,7 @@
placement="top-start" placement="top-start"
> >
<input <input
class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden" class="flex-1 w-full text-sm bg-transparent outline-hidden"
type="number" type="number"
placeholder={$i18n.t('Leave empty for unlimited')} placeholder={$i18n.t('Leave empty for unlimited')}
bind:value={RAGConfig.FILE_MAX_COUNT} bind:value={RAGConfig.FILE_MAX_COUNT}