mirror of
https://github.com/open-webui/open-webui
synced 2025-06-12 01:12:49 +00:00
Merge pull request #13447 from athoik/dev
feat(ocr): add support for Docling OCR engine and language configuration
This commit is contained in:
commit
bfacab5a0c
@ -1839,6 +1839,18 @@ DOCLING_SERVER_URL = PersistentConfig(
|
|||||||
os.getenv("DOCLING_SERVER_URL", "http://docling:5001"),
|
os.getenv("DOCLING_SERVER_URL", "http://docling:5001"),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
DOCLING_OCR_ENGINE = PersistentConfig(
|
||||||
|
"DOCLING_OCR_ENGINE",
|
||||||
|
"rag.docling_ocr_engine",
|
||||||
|
os.getenv("DOCLING_OCR_ENGINE", "tesseract"),
|
||||||
|
)
|
||||||
|
|
||||||
|
DOCLING_OCR_LANG = PersistentConfig(
|
||||||
|
"DOCLING_OCR_LANG",
|
||||||
|
"rag.docling_ocr_lang",
|
||||||
|
os.getenv("DOCLING_OCR_LANG", "eng,fra,deu,spa"),
|
||||||
|
)
|
||||||
|
|
||||||
DOCUMENT_INTELLIGENCE_ENDPOINT = PersistentConfig(
|
DOCUMENT_INTELLIGENCE_ENDPOINT = PersistentConfig(
|
||||||
"DOCUMENT_INTELLIGENCE_ENDPOINT",
|
"DOCUMENT_INTELLIGENCE_ENDPOINT",
|
||||||
"rag.document_intelligence_endpoint",
|
"rag.document_intelligence_endpoint",
|
||||||
|
@ -202,6 +202,8 @@ from open_webui.config import (
|
|||||||
CONTENT_EXTRACTION_ENGINE,
|
CONTENT_EXTRACTION_ENGINE,
|
||||||
TIKA_SERVER_URL,
|
TIKA_SERVER_URL,
|
||||||
DOCLING_SERVER_URL,
|
DOCLING_SERVER_URL,
|
||||||
|
DOCLING_OCR_ENGINE,
|
||||||
|
DOCLING_OCR_LANG,
|
||||||
DOCUMENT_INTELLIGENCE_ENDPOINT,
|
DOCUMENT_INTELLIGENCE_ENDPOINT,
|
||||||
DOCUMENT_INTELLIGENCE_KEY,
|
DOCUMENT_INTELLIGENCE_KEY,
|
||||||
MISTRAL_OCR_API_KEY,
|
MISTRAL_OCR_API_KEY,
|
||||||
@ -635,6 +637,8 @@ app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION = ENABLE_WEB_LOADER_SSL_VERI
|
|||||||
app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE
|
app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE
|
||||||
app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL
|
app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL
|
||||||
app.state.config.DOCLING_SERVER_URL = DOCLING_SERVER_URL
|
app.state.config.DOCLING_SERVER_URL = DOCLING_SERVER_URL
|
||||||
|
app.state.config.DOCLING_OCR_ENGINE = DOCLING_OCR_ENGINE
|
||||||
|
app.state.config.DOCLING_OCR_LANG = DOCLING_OCR_LANG
|
||||||
app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT
|
app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT
|
||||||
app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY
|
app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY
|
||||||
app.state.config.MISTRAL_OCR_API_KEY = MISTRAL_OCR_API_KEY
|
app.state.config.MISTRAL_OCR_API_KEY = MISTRAL_OCR_API_KEY
|
||||||
|
@ -100,7 +100,7 @@ class TikaLoader:
|
|||||||
headers = {}
|
headers = {}
|
||||||
|
|
||||||
if self.kwargs.get("PDF_EXTRACT_IMAGES") == True:
|
if self.kwargs.get("PDF_EXTRACT_IMAGES") == True:
|
||||||
headers['X-Tika-PDFextractInlineImages'] = 'true'
|
headers["X-Tika-PDFextractInlineImages"] = "true"
|
||||||
|
|
||||||
endpoint = self.url
|
endpoint = self.url
|
||||||
if not endpoint.endswith("/"):
|
if not endpoint.endswith("/"):
|
||||||
@ -124,10 +124,14 @@ class TikaLoader:
|
|||||||
|
|
||||||
|
|
||||||
class DoclingLoader:
|
class DoclingLoader:
|
||||||
def __init__(self, url, file_path=None, mime_type=None):
|
def __init__(
|
||||||
|
self, url, file_path=None, mime_type=None, ocr_engine=None, ocr_lang=None
|
||||||
|
):
|
||||||
self.url = url.rstrip("/")
|
self.url = url.rstrip("/")
|
||||||
self.file_path = file_path
|
self.file_path = file_path
|
||||||
self.mime_type = mime_type
|
self.mime_type = mime_type
|
||||||
|
self.ocr_engine = ocr_engine
|
||||||
|
self.ocr_lang = ocr_lang
|
||||||
|
|
||||||
def load(self) -> list[Document]:
|
def load(self) -> list[Document]:
|
||||||
with open(self.file_path, "rb") as f:
|
with open(self.file_path, "rb") as f:
|
||||||
@ -144,6 +148,12 @@ class DoclingLoader:
|
|||||||
"table_mode": "accurate",
|
"table_mode": "accurate",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if self.ocr_engine and self.ocr_lang:
|
||||||
|
params["ocr_engine"] = self.ocr_engine
|
||||||
|
params["ocr_lang"] = [
|
||||||
|
lang.strip() for lang in self.ocr_lang.split(",") if lang.strip()
|
||||||
|
]
|
||||||
|
|
||||||
endpoint = f"{self.url}/v1alpha/convert/file"
|
endpoint = f"{self.url}/v1alpha/convert/file"
|
||||||
r = requests.post(endpoint, files=files, data=params)
|
r = requests.post(endpoint, files=files, data=params)
|
||||||
|
|
||||||
@ -212,6 +222,8 @@ class Loader:
|
|||||||
url=self.kwargs.get("DOCLING_SERVER_URL"),
|
url=self.kwargs.get("DOCLING_SERVER_URL"),
|
||||||
file_path=file_path,
|
file_path=file_path,
|
||||||
mime_type=file_content_type,
|
mime_type=file_content_type,
|
||||||
|
ocr_engine=self.kwargs.get("DOCLING_OCR_ENGINE"),
|
||||||
|
ocr_lang=self.kwargs.get("DOCLING_OCR_LANG"),
|
||||||
)
|
)
|
||||||
elif (
|
elif (
|
||||||
self.engine == "document_intelligence"
|
self.engine == "document_intelligence"
|
||||||
|
@ -378,6 +378,8 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
|
|||||||
"PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES,
|
"PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES,
|
||||||
"TIKA_SERVER_URL": request.app.state.config.TIKA_SERVER_URL,
|
"TIKA_SERVER_URL": request.app.state.config.TIKA_SERVER_URL,
|
||||||
"DOCLING_SERVER_URL": request.app.state.config.DOCLING_SERVER_URL,
|
"DOCLING_SERVER_URL": request.app.state.config.DOCLING_SERVER_URL,
|
||||||
|
"DOCLING_OCR_ENGINE": request.app.state.config.DOCLING_OCR_ENGINE,
|
||||||
|
"DOCLING_OCR_LANG": request.app.state.config.DOCLING_OCR_LANG,
|
||||||
"DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
|
"DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
|
||||||
"DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
|
"DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
|
||||||
"MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY,
|
"MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY,
|
||||||
@ -511,6 +513,8 @@ class ConfigForm(BaseModel):
|
|||||||
PDF_EXTRACT_IMAGES: Optional[bool] = None
|
PDF_EXTRACT_IMAGES: Optional[bool] = None
|
||||||
TIKA_SERVER_URL: Optional[str] = None
|
TIKA_SERVER_URL: Optional[str] = None
|
||||||
DOCLING_SERVER_URL: Optional[str] = None
|
DOCLING_SERVER_URL: Optional[str] = None
|
||||||
|
DOCLING_OCR_ENGINE: Optional[str] = None
|
||||||
|
DOCLING_OCR_LANG: Optional[str] = None
|
||||||
DOCUMENT_INTELLIGENCE_ENDPOINT: Optional[str] = None
|
DOCUMENT_INTELLIGENCE_ENDPOINT: Optional[str] = None
|
||||||
DOCUMENT_INTELLIGENCE_KEY: Optional[str] = None
|
DOCUMENT_INTELLIGENCE_KEY: Optional[str] = None
|
||||||
MISTRAL_OCR_API_KEY: Optional[str] = None
|
MISTRAL_OCR_API_KEY: Optional[str] = None
|
||||||
@ -600,6 +604,16 @@ async def update_rag_config(
|
|||||||
if form_data.DOCLING_SERVER_URL is not None
|
if form_data.DOCLING_SERVER_URL is not None
|
||||||
else request.app.state.config.DOCLING_SERVER_URL
|
else request.app.state.config.DOCLING_SERVER_URL
|
||||||
)
|
)
|
||||||
|
request.app.state.config.DOCLING_OCR_ENGINE = (
|
||||||
|
form_data.DOCLING_OCR_ENGINE
|
||||||
|
if form_data.DOCLING_OCR_ENGINE is not None
|
||||||
|
else request.app.state.config.DOCLING_OCR_ENGINE
|
||||||
|
)
|
||||||
|
request.app.state.config.DOCLING_OCR_LANG = (
|
||||||
|
form_data.DOCLING_OCR_LANG
|
||||||
|
if form_data.DOCLING_OCR_LANG is not None
|
||||||
|
else request.app.state.config.DOCLING_OCR_LANG
|
||||||
|
)
|
||||||
request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = (
|
request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = (
|
||||||
form_data.DOCUMENT_INTELLIGENCE_ENDPOINT
|
form_data.DOCUMENT_INTELLIGENCE_ENDPOINT
|
||||||
if form_data.DOCUMENT_INTELLIGENCE_ENDPOINT is not None
|
if form_data.DOCUMENT_INTELLIGENCE_ENDPOINT is not None
|
||||||
@ -767,6 +781,8 @@ async def update_rag_config(
|
|||||||
"PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES,
|
"PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES,
|
||||||
"TIKA_SERVER_URL": request.app.state.config.TIKA_SERVER_URL,
|
"TIKA_SERVER_URL": request.app.state.config.TIKA_SERVER_URL,
|
||||||
"DOCLING_SERVER_URL": request.app.state.config.DOCLING_SERVER_URL,
|
"DOCLING_SERVER_URL": request.app.state.config.DOCLING_SERVER_URL,
|
||||||
|
"DOCLING_OCR_ENGINE": request.app.state.config.DOCLING_OCR_ENGINE,
|
||||||
|
"DOCLING_OCR_LANG": request.app.state.config.DOCLING_OCR_LANG,
|
||||||
"DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
|
"DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
|
||||||
"DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
|
"DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
|
||||||
"MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY,
|
"MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY,
|
||||||
@ -1080,6 +1096,8 @@ def process_file(
|
|||||||
engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE,
|
engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE,
|
||||||
TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL,
|
TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL,
|
||||||
DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL,
|
DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL,
|
||||||
|
DOCLING_OCR_ENGINE=request.app.state.config.DOCLING_OCR_ENGINE,
|
||||||
|
DOCLING_OCR_LANG=request.app.state.config.DOCLING_OCR_LANG,
|
||||||
PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES,
|
PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES,
|
||||||
DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
|
DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
|
||||||
DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
|
DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
|
||||||
|
@ -161,6 +161,12 @@
|
|||||||
toast.error($i18n.t('Docling Server URL required.'));
|
toast.error($i18n.t('Docling Server URL required.'));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if (RAGConfig.CONTENT_EXTRACTION_ENGINE === 'docling' &&
|
||||||
|
((RAGConfig.DOCLING_OCR_ENGINE === '' && RAGConfig.DOCLING_OCR_LANG !== '') ||
|
||||||
|
(RAGConfig.DOCLING_OCR_ENGINE !== '' && RAGConfig.DOCLING_OCR_LANG === ''))) {
|
||||||
|
toast.error($i18n.t('Both Docling OCR Engine and Language(s) must be provided or both left empty.'));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (
|
if (
|
||||||
RAGConfig.CONTENT_EXTRACTION_ENGINE === 'document_intelligence' &&
|
RAGConfig.CONTENT_EXTRACTION_ENGINE === 'document_intelligence' &&
|
||||||
@ -326,6 +332,18 @@
|
|||||||
bind:value={RAGConfig.DOCLING_SERVER_URL}
|
bind:value={RAGConfig.DOCLING_SERVER_URL}
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
|
<div class="flex w-full mt-2">
|
||||||
|
<input
|
||||||
|
class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden"
|
||||||
|
placeholder={$i18n.t('Enter Docling OCR Engine')}
|
||||||
|
bind:value={RAGConfig.DOCLING_OCR_ENGINE}
|
||||||
|
/>
|
||||||
|
<input
|
||||||
|
class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden"
|
||||||
|
placeholder={$i18n.t('Enter Docling OCR Language(s)')}
|
||||||
|
bind:value={RAGConfig.DOCLING_OCR_LANG}
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'document_intelligence'}
|
{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'document_intelligence'}
|
||||||
<div class="my-0.5 flex gap-2 pr-2">
|
<div class="my-0.5 flex gap-2 pr-2">
|
||||||
<input
|
<input
|
||||||
|
Loading…
Reference in New Issue
Block a user