mirror of
https://github.com/open-webui/open-webui
synced 2025-06-25 17:57:20 +00:00
Merge pull request #11464 from FabioPolito24/docling_context_extraction_engine
feat: Docling context extraction engine
This commit is contained in:
commit
f8ac44cfbd
@ -1654,6 +1654,12 @@ TIKA_SERVER_URL = PersistentConfig(
|
|||||||
os.getenv("TIKA_SERVER_URL", "http://tika:9998"), # Default for sidecar deployment
|
os.getenv("TIKA_SERVER_URL", "http://tika:9998"), # Default for sidecar deployment
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Base URL of the Docling conversion server. The DOCLING_SERVER_URL
# environment variable, when set, replaces the built-in default below.
DOCLING_SERVER_URL = PersistentConfig(
    "DOCLING_SERVER_URL",
    "rag.docling_server_url",
    os.environ.get("DOCLING_SERVER_URL", "http://docling:5001"),
)
|
||||||
|
|
||||||
DOCUMENT_INTELLIGENCE_ENDPOINT = PersistentConfig(
|
DOCUMENT_INTELLIGENCE_ENDPOINT = PersistentConfig(
|
||||||
"DOCUMENT_INTELLIGENCE_ENDPOINT",
|
"DOCUMENT_INTELLIGENCE_ENDPOINT",
|
||||||
"rag.document_intelligence_endpoint",
|
"rag.document_intelligence_endpoint",
|
||||||
|
@ -186,6 +186,7 @@ from open_webui.config import (
|
|||||||
CHUNK_SIZE,
|
CHUNK_SIZE,
|
||||||
CONTENT_EXTRACTION_ENGINE,
|
CONTENT_EXTRACTION_ENGINE,
|
||||||
TIKA_SERVER_URL,
|
TIKA_SERVER_URL,
|
||||||
|
DOCLING_SERVER_URL,
|
||||||
DOCUMENT_INTELLIGENCE_ENDPOINT,
|
DOCUMENT_INTELLIGENCE_ENDPOINT,
|
||||||
DOCUMENT_INTELLIGENCE_KEY,
|
DOCUMENT_INTELLIGENCE_KEY,
|
||||||
RAG_TOP_K,
|
RAG_TOP_K,
|
||||||
@ -551,6 +552,7 @@ app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = (
|
|||||||
|
|
||||||
app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE
|
app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE
|
||||||
app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL
|
app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL
|
||||||
|
app.state.config.DOCLING_SERVER_URL = DOCLING_SERVER_URL
|
||||||
app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT
|
app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT
|
||||||
app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY
|
app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY
|
||||||
|
|
||||||
|
@ -117,6 +117,52 @@ class TikaLoader:
|
|||||||
raise Exception(f"Error calling Tika: {r.reason}")
|
raise Exception(f"Error calling Tika: {r.reason}")
|
||||||
|
|
||||||
|
|
||||||
|
class DoclingLoader:
    """Extract document text by uploading a file to a Docling server.

    The file is posted as multipart form data to the server's
    ``/v1alpha/convert/file`` endpoint. The ``md_content`` field of the
    ``document`` object in the JSON response becomes the page content of a
    single ``Document``.
    """

    def __init__(self, url, file_path=None, mime_type=None):
        """Store the connection settings.

        Args:
            url: Base URL of the Docling server; trailing slashes are removed.
            file_path: Path of the file to convert.
            mime_type: MIME type sent with the upload. When falsy,
                ``application/octet-stream`` is sent instead.
        """
        self.url = url.rstrip("/")
        self.file_path = file_path
        self.mime_type = mime_type

    def load(self) -> "list[Document]":
        """Convert the configured file and return it as a one-element list.

        Returns:
            A list holding one ``Document`` whose ``page_content`` is the
            converted text and whose metadata carries ``Content-Type`` when a
            MIME type was supplied.

        Raises:
            ValueError: If no ``file_path`` was configured.
            Exception: If the Docling server answers with a non-success
                status; the message includes the server-provided detail or
                the raw response body when available.
        """
        # Fail with a clear message instead of the opaque TypeError that
        # open(None) would raise.
        if not self.file_path:
            raise ValueError("DoclingLoader requires a file_path to load")

        endpoint = f"{self.url}/v1alpha/convert/file"
        params = {
            "image_export_mode": "placeholder",
            "table_mode": "accurate",
        }

        # Keep the file open only for the duration of the upload.
        with open(self.file_path, "rb") as f:
            files = {
                "files": (
                    self.file_path,
                    f,
                    self.mime_type or "application/octet-stream",
                )
            }
            r = requests.post(endpoint, files=files, data=params)

        if not r.ok:
            error_msg = f"Error calling Docling API: {r.reason}"
            if r.text:
                try:
                    error_data = r.json()
                except ValueError:
                    # Body is not JSON; surface it verbatim.
                    error_msg += f" - {r.text}"
                else:
                    if isinstance(error_data, dict) and "detail" in error_data:
                        error_msg += f" - {error_data['detail']}"
            raise Exception(error_msg)

        result = r.json()
        # Tolerate an explicit null for "document" or "md_content".
        document_data = result.get("document") or {}
        text = document_data.get("md_content")
        if text is None:
            text = "<No text content found>"

        metadata = {"Content-Type": self.mime_type} if self.mime_type else {}

        log.debug("Docling extracted text: %s", text)

        return [Document(page_content=text, metadata=metadata)]
|
||||||
|
|
||||||
|
|
||||||
class Loader:
|
class Loader:
|
||||||
def __init__(self, engine: str = "", **kwargs):
|
def __init__(self, engine: str = "", **kwargs):
|
||||||
self.engine = engine
|
self.engine = engine
|
||||||
@ -149,6 +195,12 @@ class Loader:
|
|||||||
file_path=file_path,
|
file_path=file_path,
|
||||||
mime_type=file_content_type,
|
mime_type=file_content_type,
|
||||||
)
|
)
|
||||||
|
elif self.engine == "docling" and self.kwargs.get("DOCLING_SERVER_URL"):
|
||||||
|
loader = DoclingLoader(
|
||||||
|
url=self.kwargs.get("DOCLING_SERVER_URL"),
|
||||||
|
file_path=file_path,
|
||||||
|
mime_type=file_content_type,
|
||||||
|
)
|
||||||
elif (
|
elif (
|
||||||
self.engine == "document_intelligence"
|
self.engine == "document_intelligence"
|
||||||
and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT") != ""
|
and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT") != ""
|
||||||
|
@ -358,6 +358,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
|
|||||||
"content_extraction": {
|
"content_extraction": {
|
||||||
"engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
|
"engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
|
||||||
"tika_server_url": request.app.state.config.TIKA_SERVER_URL,
|
"tika_server_url": request.app.state.config.TIKA_SERVER_URL,
|
||||||
|
"docling_server_url": request.app.state.config.DOCLING_SERVER_URL,
|
||||||
"document_intelligence_config": {
|
"document_intelligence_config": {
|
||||||
"endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
|
"endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
|
||||||
"key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
|
"key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
|
||||||
@ -428,6 +429,7 @@ class DocumentIntelligenceConfigForm(BaseModel):
|
|||||||
class ContentExtractionConfig(BaseModel):
|
class ContentExtractionConfig(BaseModel):
|
||||||
engine: str = ""
|
engine: str = ""
|
||||||
tika_server_url: Optional[str] = None
|
tika_server_url: Optional[str] = None
|
||||||
|
docling_server_url: Optional[str] = None
|
||||||
document_intelligence_config: Optional[DocumentIntelligenceConfigForm] = None
|
document_intelligence_config: Optional[DocumentIntelligenceConfigForm] = None
|
||||||
|
|
||||||
|
|
||||||
@ -540,6 +542,9 @@ async def update_rag_config(
|
|||||||
request.app.state.config.TIKA_SERVER_URL = (
|
request.app.state.config.TIKA_SERVER_URL = (
|
||||||
form_data.content_extraction.tika_server_url
|
form_data.content_extraction.tika_server_url
|
||||||
)
|
)
|
||||||
|
request.app.state.config.DOCLING_SERVER_URL = (
|
||||||
|
form_data.content_extraction.docling_server_url
|
||||||
|
)
|
||||||
if form_data.content_extraction.document_intelligence_config is not None:
|
if form_data.content_extraction.document_intelligence_config is not None:
|
||||||
request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = (
|
request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = (
|
||||||
form_data.content_extraction.document_intelligence_config.endpoint
|
form_data.content_extraction.document_intelligence_config.endpoint
|
||||||
@ -648,6 +653,7 @@ async def update_rag_config(
|
|||||||
"content_extraction": {
|
"content_extraction": {
|
||||||
"engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
|
"engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
|
||||||
"tika_server_url": request.app.state.config.TIKA_SERVER_URL,
|
"tika_server_url": request.app.state.config.TIKA_SERVER_URL,
|
||||||
|
"docling_server_url": request.app.state.config.DOCLING_SERVER_URL,
|
||||||
"document_intelligence_config": {
|
"document_intelligence_config": {
|
||||||
"endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
|
"endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
|
||||||
"key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
|
"key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
|
||||||
@ -990,6 +996,7 @@ def process_file(
|
|||||||
loader = Loader(
|
loader = Loader(
|
||||||
engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE,
|
engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE,
|
||||||
TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL,
|
TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL,
|
||||||
|
DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL,
|
||||||
PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES,
|
PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES,
|
||||||
DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
|
DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
|
||||||
DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
|
DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
|
||||||
|
@ -49,6 +49,8 @@
|
|||||||
let contentExtractionEngine = 'default';
|
let contentExtractionEngine = 'default';
|
||||||
let tikaServerUrl = '';
|
let tikaServerUrl = '';
|
||||||
let showTikaServerUrl = false;
|
let showTikaServerUrl = false;
|
||||||
|
let doclingServerUrl = '';
|
||||||
|
let showDoclingServerUrl = false;
|
||||||
let documentIntelligenceEndpoint = '';
|
let documentIntelligenceEndpoint = '';
|
||||||
let documentIntelligenceKey = '';
|
let documentIntelligenceKey = '';
|
||||||
let showDocumentIntelligenceConfig = false;
|
let showDocumentIntelligenceConfig = false;
|
||||||
@ -175,6 +177,10 @@
|
|||||||
toast.error($i18n.t('Tika Server URL required.'));
|
toast.error($i18n.t('Tika Server URL required.'));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if (contentExtractionEngine === 'docling' && doclingServerUrl === '') {
|
||||||
|
toast.error($i18n.t('Docling Server URL required.'));
|
||||||
|
return;
|
||||||
|
}
|
||||||
if (
|
if (
|
||||||
contentExtractionEngine === 'document_intelligence' &&
|
contentExtractionEngine === 'document_intelligence' &&
|
||||||
(documentIntelligenceEndpoint === '' || documentIntelligenceKey === '')
|
(documentIntelligenceEndpoint === '' || documentIntelligenceKey === '')
|
||||||
@ -209,6 +215,7 @@
|
|||||||
content_extraction: {
|
content_extraction: {
|
||||||
engine: contentExtractionEngine,
|
engine: contentExtractionEngine,
|
||||||
tika_server_url: tikaServerUrl,
|
tika_server_url: tikaServerUrl,
|
||||||
|
docling_server_url: doclingServerUrl,
|
||||||
document_intelligence_config: {
|
document_intelligence_config: {
|
||||||
key: documentIntelligenceKey,
|
key: documentIntelligenceKey,
|
||||||
endpoint: documentIntelligenceEndpoint
|
endpoint: documentIntelligenceEndpoint
|
||||||
@ -269,7 +276,10 @@
|
|||||||
|
|
||||||
contentExtractionEngine = res.content_extraction.engine;
|
contentExtractionEngine = res.content_extraction.engine;
|
||||||
tikaServerUrl = res.content_extraction.tika_server_url;
|
tikaServerUrl = res.content_extraction.tika_server_url;
|
||||||
|
doclingServerUrl = res.content_extraction.docling_server_url;
|
||||||
|
|
||||||
showTikaServerUrl = contentExtractionEngine === 'tika';
|
showTikaServerUrl = contentExtractionEngine === 'tika';
|
||||||
|
showDoclingServerUrl = contentExtractionEngine === 'docling';
|
||||||
documentIntelligenceEndpoint = res.content_extraction.document_intelligence_config.endpoint;
|
documentIntelligenceEndpoint = res.content_extraction.document_intelligence_config.endpoint;
|
||||||
documentIntelligenceKey = res.content_extraction.document_intelligence_config.key;
|
documentIntelligenceKey = res.content_extraction.document_intelligence_config.key;
|
||||||
showDocumentIntelligenceConfig = contentExtractionEngine === 'document_intelligence';
|
showDocumentIntelligenceConfig = contentExtractionEngine === 'document_intelligence';
|
||||||
@ -337,6 +347,7 @@
|
|||||||
>
|
>
|
||||||
<option value="">{$i18n.t('Default')} </option>
|
<option value="">{$i18n.t('Default')} </option>
|
||||||
<option value="tika">{$i18n.t('Tika')}</option>
|
<option value="tika">{$i18n.t('Tika')}</option>
|
||||||
|
<option value="docling">{ $i18n.t('Docling') }</option>
|
||||||
<option value="document_intelligence">{$i18n.t('Document Intelligence')}</option>
|
<option value="document_intelligence">{$i18n.t('Document Intelligence')}</option>
|
||||||
</select>
|
</select>
|
||||||
</div>
|
</div>
|
||||||
@ -351,6 +362,14 @@
|
|||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
{:else if contentExtractionEngine === 'docling'}
|
||||||
|
<div class="flex w-full mt-1">
|
||||||
|
<input
|
||||||
|
class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden"
|
||||||
|
placeholder={$i18n.t('Enter Docling Server URL')}
|
||||||
|
bind:value={doclingServerUrl}
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
{:else if contentExtractionEngine === 'document_intelligence'}
|
{:else if contentExtractionEngine === 'document_intelligence'}
|
||||||
<div class="my-0.5 flex gap-2 pr-2">
|
<div class="my-0.5 flex gap-2 pr-2">
|
||||||
<input
|
<input
|
||||||
|
Loading…
Reference in New Issue
Block a user