mirror of
				https://github.com/open-webui/open-webui
				synced 2025-06-26 18:26:48 +00:00 
			
		
		
		
	feat: docling do picture description support
This commit is contained in:
		
							parent
							
								
									6e8481e157
								
							
						
					
					
						commit
						de70d0cb64
					
				@ -1849,6 +1849,12 @@ DOCLING_OCR_LANG = PersistentConfig(
 | 
			
		||||
    os.getenv("DOCLING_OCR_LANG", "eng,fra,deu,spa"),
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
DOCLING_DO_PICTURE_DESCRIPTION = PersistentConfig(
 | 
			
		||||
    "DOCLING_DO_PICTURE_DESCRIPTION",
 | 
			
		||||
    "rag.docling_do_picture_description",
 | 
			
		||||
    os.getenv("DOCLING_DO_PICTURE_DESCRIPTION", "False").lower() == "true",
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
DOCUMENT_INTELLIGENCE_ENDPOINT = PersistentConfig(
 | 
			
		||||
    "DOCUMENT_INTELLIGENCE_ENDPOINT",
 | 
			
		||||
    "rag.document_intelligence_endpoint",
 | 
			
		||||
 | 
			
		||||
@ -210,6 +210,7 @@ from open_webui.config import (
 | 
			
		||||
    DOCLING_SERVER_URL,
 | 
			
		||||
    DOCLING_OCR_ENGINE,
 | 
			
		||||
    DOCLING_OCR_LANG,
 | 
			
		||||
    DOCLING_DO_PICTURE_DESCRIPTION,
 | 
			
		||||
    DOCUMENT_INTELLIGENCE_ENDPOINT,
 | 
			
		||||
    DOCUMENT_INTELLIGENCE_KEY,
 | 
			
		||||
    MISTRAL_OCR_API_KEY,
 | 
			
		||||
@ -645,6 +646,7 @@ app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL
 | 
			
		||||
app.state.config.DOCLING_SERVER_URL = DOCLING_SERVER_URL
 | 
			
		||||
app.state.config.DOCLING_OCR_ENGINE = DOCLING_OCR_ENGINE
 | 
			
		||||
app.state.config.DOCLING_OCR_LANG = DOCLING_OCR_LANG
 | 
			
		||||
app.state.config.DOCLING_DO_PICTURE_DESCRIPTION = DOCLING_DO_PICTURE_DESCRIPTION
 | 
			
		||||
app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT
 | 
			
		||||
app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY
 | 
			
		||||
app.state.config.MISTRAL_OCR_API_KEY = MISTRAL_OCR_API_KEY
 | 
			
		||||
 | 
			
		||||
@ -126,14 +126,12 @@ class TikaLoader:
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class DoclingLoader:
 | 
			
		||||
    def __init__(
 | 
			
		||||
        self, url, file_path=None, mime_type=None, ocr_engine=None, ocr_lang=None
 | 
			
		||||
    ):
 | 
			
		||||
    def __init__(self, url, file_path=None, mime_type=None, params=None):
 | 
			
		||||
        self.url = url.rstrip("/")
 | 
			
		||||
        self.file_path = file_path
 | 
			
		||||
        self.mime_type = mime_type
 | 
			
		||||
        self.ocr_engine = ocr_engine
 | 
			
		||||
        self.ocr_lang = ocr_lang
 | 
			
		||||
 | 
			
		||||
        self.params = params or {}
 | 
			
		||||
 | 
			
		||||
    def load(self) -> list[Document]:
 | 
			
		||||
        with open(self.file_path, "rb") as f:
 | 
			
		||||
@ -150,11 +148,19 @@ class DoclingLoader:
 | 
			
		||||
                "table_mode": "accurate",
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            if self.ocr_engine and self.ocr_lang:
 | 
			
		||||
                params["ocr_engine"] = self.ocr_engine
 | 
			
		||||
                params["ocr_lang"] = [
 | 
			
		||||
                    lang.strip() for lang in self.ocr_lang.split(",") if lang.strip()
 | 
			
		||||
                ]
 | 
			
		||||
            if self.params:
 | 
			
		||||
                if self.params.get("do_picture_classification"):
 | 
			
		||||
                    params["do_picture_classification"] = self.params.get(
 | 
			
		||||
                        "do_picture_classification"
 | 
			
		||||
                    )
 | 
			
		||||
 | 
			
		||||
                if self.params.get("ocr_engine") and self.params.get("ocr_lang"):
 | 
			
		||||
                    params["ocr_engine"] = self.params.get("ocr_engine")
 | 
			
		||||
                    params["ocr_lang"] = [
 | 
			
		||||
                        lang.strip()
 | 
			
		||||
                        for lang in self.params.get("ocr_lang").split(",")
 | 
			
		||||
                        if lang.strip()
 | 
			
		||||
                    ]
 | 
			
		||||
 | 
			
		||||
            endpoint = f"{self.url}/v1alpha/convert/file"
 | 
			
		||||
            r = requests.post(endpoint, files=files, data=params)
 | 
			
		||||
@ -225,8 +231,13 @@ class Loader:
 | 
			
		||||
                    url=self.kwargs.get("DOCLING_SERVER_URL"),
 | 
			
		||||
                    file_path=file_path,
 | 
			
		||||
                    mime_type=file_content_type,
 | 
			
		||||
                    ocr_engine=self.kwargs.get("DOCLING_OCR_ENGINE"),
 | 
			
		||||
                    ocr_lang=self.kwargs.get("DOCLING_OCR_LANG"),
 | 
			
		||||
                    params={
 | 
			
		||||
                        "ocr_engine": self.kwargs.get("DOCLING_OCR_ENGINE"),
 | 
			
		||||
                        "ocr_lang": self.kwargs.get("DOCLING_OCR_LANG"),
 | 
			
		||||
                        "do_picture_classification": self.kwargs.get(
 | 
			
		||||
                            "DOCLING_DO_PICTURE_DESCRIPTION"
 | 
			
		||||
                        ),
 | 
			
		||||
                    },
 | 
			
		||||
                )
 | 
			
		||||
        elif (
 | 
			
		||||
            self.engine == "document_intelligence"
 | 
			
		||||
 | 
			
		||||
@ -356,6 +356,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
 | 
			
		||||
        "DOCLING_SERVER_URL": request.app.state.config.DOCLING_SERVER_URL,
 | 
			
		||||
        "DOCLING_OCR_ENGINE": request.app.state.config.DOCLING_OCR_ENGINE,
 | 
			
		||||
        "DOCLING_OCR_LANG": request.app.state.config.DOCLING_OCR_LANG,
 | 
			
		||||
        "DOCLING_DO_PICTURE_DESCRIPTION": request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION,
 | 
			
		||||
        "DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
 | 
			
		||||
        "DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
 | 
			
		||||
        "MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY,
 | 
			
		||||
@ -496,6 +497,7 @@ class ConfigForm(BaseModel):
 | 
			
		||||
    DOCLING_SERVER_URL: Optional[str] = None
 | 
			
		||||
    DOCLING_OCR_ENGINE: Optional[str] = None
 | 
			
		||||
    DOCLING_OCR_LANG: Optional[str] = None
 | 
			
		||||
    DOCLING_DO_PICTURE_DESCRIPTION: Optional[bool] = None
 | 
			
		||||
    DOCUMENT_INTELLIGENCE_ENDPOINT: Optional[str] = None
 | 
			
		||||
    DOCUMENT_INTELLIGENCE_KEY: Optional[str] = None
 | 
			
		||||
    MISTRAL_OCR_API_KEY: Optional[str] = None
 | 
			
		||||
@ -601,6 +603,13 @@ async def update_rag_config(
 | 
			
		||||
        if form_data.DOCLING_OCR_LANG is not None
 | 
			
		||||
        else request.app.state.config.DOCLING_OCR_LANG
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION = (
 | 
			
		||||
        form_data.DOCLING_DO_PICTURE_DESCRIPTION
 | 
			
		||||
        if form_data.DOCLING_DO_PICTURE_DESCRIPTION is not None
 | 
			
		||||
        else request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = (
 | 
			
		||||
        form_data.DOCUMENT_INTELLIGENCE_ENDPOINT
 | 
			
		||||
        if form_data.DOCUMENT_INTELLIGENCE_ENDPOINT is not None
 | 
			
		||||
@ -813,6 +822,7 @@ async def update_rag_config(
 | 
			
		||||
        "DOCLING_SERVER_URL": request.app.state.config.DOCLING_SERVER_URL,
 | 
			
		||||
        "DOCLING_OCR_ENGINE": request.app.state.config.DOCLING_OCR_ENGINE,
 | 
			
		||||
        "DOCLING_OCR_LANG": request.app.state.config.DOCLING_OCR_LANG,
 | 
			
		||||
        "DOCLING_DO_PICTURE_DESCRIPTION": request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION,
 | 
			
		||||
        "DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
 | 
			
		||||
        "DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
 | 
			
		||||
        "MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY,
 | 
			
		||||
@ -1133,6 +1143,7 @@ def process_file(
 | 
			
		||||
                    DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL,
 | 
			
		||||
                    DOCLING_OCR_ENGINE=request.app.state.config.DOCLING_OCR_ENGINE,
 | 
			
		||||
                    DOCLING_OCR_LANG=request.app.state.config.DOCLING_OCR_LANG,
 | 
			
		||||
                    DOCLING_DO_PICTURE_DESCRIPTION=request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION,
 | 
			
		||||
                    PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES,
 | 
			
		||||
                    DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
 | 
			
		||||
                    DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
 | 
			
		||||
 | 
			
		||||
@ -246,7 +246,7 @@
 | 
			
		||||
					<hr class=" border-gray-100 dark:border-gray-850 my-2" />
 | 
			
		||||
 | 
			
		||||
					<div class="mb-2.5 flex flex-col w-full justify-between">
 | 
			
		||||
						<div class="flex w-full justify-between">
 | 
			
		||||
						<div class="flex w-full justify-between mb-1">
 | 
			
		||||
							<div class="self-center text-xs font-medium">
 | 
			
		||||
								{$i18n.t('Content Extraction Engine')}
 | 
			
		||||
							</div>
 | 
			
		||||
@ -279,7 +279,7 @@
 | 
			
		||||
							<div class="flex w-full mt-1">
 | 
			
		||||
								<div class="flex-1 mr-2">
 | 
			
		||||
									<input
 | 
			
		||||
										class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden"
 | 
			
		||||
										class="flex-1 w-full text-sm bg-transparent outline-hidden"
 | 
			
		||||
										placeholder={$i18n.t('Enter Tika Server URL')}
 | 
			
		||||
										bind:value={RAGConfig.TIKA_SERVER_URL}
 | 
			
		||||
									/>
 | 
			
		||||
@ -288,27 +288,38 @@
 | 
			
		||||
						{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'docling'}
 | 
			
		||||
							<div class="flex w-full mt-1">
 | 
			
		||||
								<input
 | 
			
		||||
									class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden"
 | 
			
		||||
									class="flex-1 w-full text-sm bg-transparent outline-hidden"
 | 
			
		||||
									placeholder={$i18n.t('Enter Docling Server URL')}
 | 
			
		||||
									bind:value={RAGConfig.DOCLING_SERVER_URL}
 | 
			
		||||
								/>
 | 
			
		||||
							</div>
 | 
			
		||||
							<div class="flex w-full mt-2">
 | 
			
		||||
								<input
 | 
			
		||||
									class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden"
 | 
			
		||||
									class="flex-1 w-full text-sm bg-transparent outline-hidden"
 | 
			
		||||
									placeholder={$i18n.t('Enter Docling OCR Engine')}
 | 
			
		||||
									bind:value={RAGConfig.DOCLING_OCR_ENGINE}
 | 
			
		||||
								/>
 | 
			
		||||
								<input
 | 
			
		||||
									class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden"
 | 
			
		||||
									class="flex-1 w-full text-sm bg-transparent outline-hidden"
 | 
			
		||||
									placeholder={$i18n.t('Enter Docling OCR Language(s)')}
 | 
			
		||||
									bind:value={RAGConfig.DOCLING_OCR_LANG}
 | 
			
		||||
								/>
 | 
			
		||||
							</div>
 | 
			
		||||
 | 
			
		||||
							<div class="flex w-full mt-2">
 | 
			
		||||
								<div class="flex-1 flex justify-between">
 | 
			
		||||
									<div class=" self-center text-xs font-medium">
 | 
			
		||||
										{$i18n.t('Describe Pictures in Documents')}
 | 
			
		||||
									</div>
 | 
			
		||||
									<div class="flex items-center relative">
 | 
			
		||||
										<Switch bind:state={RAGConfig.DOCLING_DO_PICTURE_DESCRIPTION} />
 | 
			
		||||
									</div>
 | 
			
		||||
								</div>
 | 
			
		||||
							</div>
 | 
			
		||||
						{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'document_intelligence'}
 | 
			
		||||
							<div class="my-0.5 flex gap-2 pr-2">
 | 
			
		||||
								<input
 | 
			
		||||
									class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden"
 | 
			
		||||
									class="flex-1 w-full text-sm bg-transparent outline-hidden"
 | 
			
		||||
									placeholder={$i18n.t('Enter Document Intelligence Endpoint')}
 | 
			
		||||
									bind:value={RAGConfig.DOCUMENT_INTELLIGENCE_ENDPOINT}
 | 
			
		||||
								/>
 | 
			
		||||
@ -437,7 +448,7 @@
 | 
			
		||||
							{#if embeddingEngine === 'openai'}
 | 
			
		||||
								<div class="my-0.5 flex gap-2 pr-2">
 | 
			
		||||
									<input
 | 
			
		||||
										class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden"
 | 
			
		||||
										class="flex-1 w-full text-sm bg-transparent outline-hidden"
 | 
			
		||||
										placeholder={$i18n.t('API Base URL')}
 | 
			
		||||
										bind:value={OpenAIUrl}
 | 
			
		||||
										required
 | 
			
		||||
@ -448,7 +459,7 @@
 | 
			
		||||
							{:else if embeddingEngine === 'ollama'}
 | 
			
		||||
								<div class="my-0.5 flex gap-2 pr-2">
 | 
			
		||||
									<input
 | 
			
		||||
										class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden"
 | 
			
		||||
										class="flex-1 w-full text-sm bg-transparent outline-hidden"
 | 
			
		||||
										placeholder={$i18n.t('API Base URL')}
 | 
			
		||||
										bind:value={OllamaUrl}
 | 
			
		||||
										required
 | 
			
		||||
@ -471,7 +482,7 @@
 | 
			
		||||
									<div class="flex w-full">
 | 
			
		||||
										<div class="flex-1 mr-2">
 | 
			
		||||
											<input
 | 
			
		||||
												class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden"
 | 
			
		||||
												class="flex-1 w-full text-sm bg-transparent outline-hidden"
 | 
			
		||||
												bind:value={embeddingModel}
 | 
			
		||||
												placeholder={$i18n.t('Set embedding model')}
 | 
			
		||||
												required
 | 
			
		||||
@ -482,7 +493,7 @@
 | 
			
		||||
									<div class="flex w-full">
 | 
			
		||||
										<div class="flex-1 mr-2">
 | 
			
		||||
											<input
 | 
			
		||||
												class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden"
 | 
			
		||||
												class="flex-1 w-full text-sm bg-transparent outline-hidden"
 | 
			
		||||
												placeholder={$i18n.t('Set embedding model (e.g. {{model}})', {
 | 
			
		||||
													model: embeddingModel.slice(-40)
 | 
			
		||||
												})}
 | 
			
		||||
@ -639,7 +650,7 @@
 | 
			
		||||
									{#if RAGConfig.RAG_RERANKING_ENGINE === 'external'}
 | 
			
		||||
										<div class="my-0.5 flex gap-2 pr-2">
 | 
			
		||||
											<input
 | 
			
		||||
												class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden"
 | 
			
		||||
												class="flex-1 w-full text-sm bg-transparent outline-hidden"
 | 
			
		||||
												placeholder={$i18n.t('API Base URL')}
 | 
			
		||||
												bind:value={RAGConfig.RAG_EXTERNAL_RERANKER_URL}
 | 
			
		||||
												required
 | 
			
		||||
@ -661,7 +672,7 @@
 | 
			
		||||
										<div class="flex w-full">
 | 
			
		||||
											<div class="flex-1 mr-2">
 | 
			
		||||
												<input
 | 
			
		||||
													class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden"
 | 
			
		||||
													class="flex-1 w-full text-sm bg-transparent outline-hidden"
 | 
			
		||||
													placeholder={$i18n.t('Set reranking model (e.g. {{model}})', {
 | 
			
		||||
														model: 'BAAI/bge-reranker-v2-m3'
 | 
			
		||||
													})}
 | 
			
		||||
@ -677,7 +688,7 @@
 | 
			
		||||
								<div class=" self-center text-xs font-medium">{$i18n.t('Top K')}</div>
 | 
			
		||||
								<div class="flex items-center relative">
 | 
			
		||||
									<input
 | 
			
		||||
										class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden"
 | 
			
		||||
										class="flex-1 w-full text-sm bg-transparent outline-hidden"
 | 
			
		||||
										type="number"
 | 
			
		||||
										placeholder={$i18n.t('Enter Top K')}
 | 
			
		||||
										bind:value={RAGConfig.TOP_K}
 | 
			
		||||
@ -692,7 +703,7 @@
 | 
			
		||||
									<div class="self-center text-xs font-medium">{$i18n.t('Top K Reranker')}</div>
 | 
			
		||||
									<div class="flex items-center relative">
 | 
			
		||||
										<input
 | 
			
		||||
											class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden"
 | 
			
		||||
											class="flex-1 w-full text-sm bg-transparent outline-hidden"
 | 
			
		||||
											type="number"
 | 
			
		||||
											placeholder={$i18n.t('Enter Top K Reranker')}
 | 
			
		||||
											bind:value={RAGConfig.TOP_K_RERANKER}
 | 
			
		||||
@ -711,7 +722,7 @@
 | 
			
		||||
										</div>
 | 
			
		||||
										<div class="flex items-center relative">
 | 
			
		||||
											<input
 | 
			
		||||
												class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden"
 | 
			
		||||
												class="flex-1 w-full text-sm bg-transparent outline-hidden"
 | 
			
		||||
												type="number"
 | 
			
		||||
												step="0.01"
 | 
			
		||||
												placeholder={$i18n.t('Enter Score')}
 | 
			
		||||
@ -770,7 +781,7 @@
 | 
			
		||||
								placement="top-start"
 | 
			
		||||
							>
 | 
			
		||||
								<input
 | 
			
		||||
									class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden"
 | 
			
		||||
									class="flex-1 w-full text-sm bg-transparent outline-hidden"
 | 
			
		||||
									type="number"
 | 
			
		||||
									placeholder={$i18n.t('Leave empty for unlimited')}
 | 
			
		||||
									bind:value={RAGConfig.FILE_MAX_SIZE}
 | 
			
		||||
@ -791,7 +802,7 @@
 | 
			
		||||
								placement="top-start"
 | 
			
		||||
							>
 | 
			
		||||
								<input
 | 
			
		||||
									class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden"
 | 
			
		||||
									class="flex-1 w-full text-sm bg-transparent outline-hidden"
 | 
			
		||||
									type="number"
 | 
			
		||||
									placeholder={$i18n.t('Leave empty for unlimited')}
 | 
			
		||||
									bind:value={RAGConfig.FILE_MAX_COUNT}
 | 
			
		||||
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user