Merge pull request #11464 from FabioPolito24/docling_context_extraction_engine

feat: Docling context extraction engine
This commit is contained in:
Timothy Jaeryang Baek
2025-03-09 20:57:48 -03:00
committed by GitHub
5 changed files with 86 additions and 0 deletions

View File

@@ -1654,6 +1654,12 @@ TIKA_SERVER_URL = PersistentConfig(
os.getenv("TIKA_SERVER_URL", "http://tika:9998"), # Default for sidecar deployment
)
DOCLING_SERVER_URL = PersistentConfig(
"DOCLING_SERVER_URL",
"rag.docling_server_url",
os.getenv("DOCLING_SERVER_URL", "http://docling:5001"),
)
DOCUMENT_INTELLIGENCE_ENDPOINT = PersistentConfig(
"DOCUMENT_INTELLIGENCE_ENDPOINT",
"rag.document_intelligence_endpoint",

View File

@@ -186,6 +186,7 @@ from open_webui.config import (
CHUNK_SIZE,
CONTENT_EXTRACTION_ENGINE,
TIKA_SERVER_URL,
DOCLING_SERVER_URL,
DOCUMENT_INTELLIGENCE_ENDPOINT,
DOCUMENT_INTELLIGENCE_KEY,
RAG_TOP_K,
@@ -551,6 +552,7 @@ app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = (
app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE
app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL
app.state.config.DOCLING_SERVER_URL = DOCLING_SERVER_URL
app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT
app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY

View File

@@ -117,6 +117,52 @@ class TikaLoader:
raise Exception(f"Error calling Tika: {r.reason}")
class DoclingLoader:
def __init__(self, url, file_path=None, mime_type=None):
self.url = url.rstrip("/")
self.file_path = file_path
self.mime_type = mime_type
def load(self) -> list[Document]:
with open(self.file_path, "rb") as f:
files = {
"files": (
self.file_path,
f,
self.mime_type or "application/octet-stream",
)
}
params = {
"image_export_mode": "placeholder",
"table_mode": "accurate",
}
endpoint = f"{self.url}/v1alpha/convert/file"
r = requests.post(endpoint, files=files, data=params)
if r.ok:
result = r.json()
document_data = result.get("document", {})
text = document_data.get("md_content", "<No text content found>")
metadata = {"Content-Type": self.mime_type} if self.mime_type else {}
log.debug("Docling extracted text: %s", text)
return [Document(page_content=text, metadata=metadata)]
else:
error_msg = f"Error calling Docling API: {r.reason}"
if r.text:
try:
error_data = r.json()
if "detail" in error_data:
error_msg += f" - {error_data['detail']}"
except Exception:
error_msg += f" - {r.text}"
raise Exception(f"Error calling Docling: {error_msg}")
class Loader:
def __init__(self, engine: str = "", **kwargs):
self.engine = engine
@@ -149,6 +195,12 @@ class Loader:
file_path=file_path,
mime_type=file_content_type,
)
elif self.engine == "docling" and self.kwargs.get("DOCLING_SERVER_URL"):
loader = DoclingLoader(
url=self.kwargs.get("DOCLING_SERVER_URL"),
file_path=file_path,
mime_type=file_content_type,
)
elif (
self.engine == "document_intelligence"
and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT") != ""

View File

@@ -358,6 +358,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
"content_extraction": {
"engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
"tika_server_url": request.app.state.config.TIKA_SERVER_URL,
"docling_server_url": request.app.state.config.DOCLING_SERVER_URL,
"document_intelligence_config": {
"endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
"key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
@@ -428,6 +429,7 @@ class DocumentIntelligenceConfigForm(BaseModel):
class ContentExtractionConfig(BaseModel):
engine: str = ""
tika_server_url: Optional[str] = None
docling_server_url: Optional[str] = None
document_intelligence_config: Optional[DocumentIntelligenceConfigForm] = None
@@ -540,6 +542,9 @@ async def update_rag_config(
request.app.state.config.TIKA_SERVER_URL = (
form_data.content_extraction.tika_server_url
)
request.app.state.config.DOCLING_SERVER_URL = (
form_data.content_extraction.docling_server_url
)
if form_data.content_extraction.document_intelligence_config is not None:
request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = (
form_data.content_extraction.document_intelligence_config.endpoint
@@ -648,6 +653,7 @@ async def update_rag_config(
"content_extraction": {
"engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
"tika_server_url": request.app.state.config.TIKA_SERVER_URL,
"docling_server_url": request.app.state.config.DOCLING_SERVER_URL,
"document_intelligence_config": {
"endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
"key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
@@ -990,6 +996,7 @@ def process_file(
loader = Loader(
engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE,
TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL,
DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL,
PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES,
DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,