mirror of
https://github.com/open-webui/open-webui
synced 2025-06-26 18:26:48 +00:00
feat: docling support for document preprocessing
This commit is contained in:
@@ -1378,6 +1378,12 @@ TIKA_SERVER_URL = PersistentConfig(
|
||||
os.getenv("TIKA_SERVER_URL", "http://tika:9998"), # Default for sidecar deployment
|
||||
)
|
||||
|
||||
DOCLING_SERVER_URL = PersistentConfig(
|
||||
"DOCLING_SERVER_URL",
|
||||
"rag.docling_server_url",
|
||||
os.getenv("DOCLING_SERVER_URL", "http://docling:5001"),
|
||||
)
|
||||
|
||||
RAG_TOP_K = PersistentConfig(
|
||||
"RAG_TOP_K", "rag.top_k", int(os.environ.get("RAG_TOP_K", "3"))
|
||||
)
|
||||
|
||||
@@ -154,6 +154,7 @@ from open_webui.config import (
|
||||
CHUNK_SIZE,
|
||||
CONTENT_EXTRACTION_ENGINE,
|
||||
TIKA_SERVER_URL,
|
||||
DOCLING_SERVER_URL,
|
||||
RAG_TOP_K,
|
||||
RAG_TEXT_SPLITTER,
|
||||
TIKTOKEN_ENCODING_NAME,
|
||||
@@ -477,6 +478,7 @@ app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = (
|
||||
|
||||
app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE
|
||||
app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL
|
||||
app.state.config.DOCLING_SERVER_URL = DOCLING_SERVER_URL
|
||||
|
||||
app.state.config.TEXT_SPLITTER = RAG_TEXT_SPLITTER
|
||||
app.state.config.TIKTOKEN_ENCODING_NAME = TIKTOKEN_ENCODING_NAME
|
||||
|
||||
@@ -115,6 +115,61 @@ class TikaLoader:
|
||||
raise Exception(f"Error calling Tika: {r.reason}")
|
||||
|
||||
|
||||
class DoclingLoader:
|
||||
def __init__(self, url, file_path=None, mime_type=None):
|
||||
self.url = url.rstrip("/") # Ensure no trailing slash
|
||||
self.file_path = file_path
|
||||
self.mime_type = mime_type
|
||||
|
||||
def load(self) -> list[Document]:
|
||||
if self.file_path is None:
|
||||
raise ValueError("File path is required for DoclingLoader")
|
||||
|
||||
with open(self.file_path, "rb") as f:
|
||||
files = {"files": (self.file_path, f, self.mime_type or "application/octet-stream")}
|
||||
|
||||
params = {
|
||||
"from_formats": ["docx", "pptx", "html", "xml_pubmed", "image", "pdf", "asciidoc", "md", "xlsx", "xml_uspto", "json_docling"],
|
||||
"to_formats": ["md"],
|
||||
"image_export_mode": "placeholder",
|
||||
"do_ocr": True,
|
||||
"force_ocr": False,
|
||||
"ocr_engine": "easyocr",
|
||||
"ocr_lang": None,
|
||||
"pdf_backend": "dlparse_v2",
|
||||
"table_mode": "fast",
|
||||
"abort_on_error": False,
|
||||
"return_as_file": False,
|
||||
"do_table_structure": True,
|
||||
"include_images": True,
|
||||
"images_scale": 2.0,
|
||||
}
|
||||
|
||||
endpoint = f"{self.url}/v1alpha/convert/file"
|
||||
response = requests.post(endpoint, files=files, data=params)
|
||||
|
||||
if response.ok:
|
||||
result = response.json()
|
||||
document_data = result.get("document", {})
|
||||
text = document_data.get("md_content", "<No text content found>")
|
||||
|
||||
metadata = {"Content-Type": self.mime_type} if self.mime_type else {}
|
||||
|
||||
log.debug("Docling extracted text: %s", text)
|
||||
|
||||
return [Document(page_content=text, metadata=metadata)]
|
||||
else:
|
||||
error_msg = f"Error calling Docling API: {response.status_code}"
|
||||
if response.text:
|
||||
try:
|
||||
error_data = response.json()
|
||||
if "detail" in error_data:
|
||||
error_msg += f" - {error_data['detail']}"
|
||||
except:
|
||||
error_msg += f" - {response.text}"
|
||||
raise Exception(f"Error calling Docling: {error_msg}")
|
||||
|
||||
|
||||
class Loader:
|
||||
def __init__(self, engine: str = "", **kwargs):
|
||||
self.engine = engine
|
||||
@@ -147,6 +202,12 @@ class Loader:
|
||||
file_path=file_path,
|
||||
mime_type=file_content_type,
|
||||
)
|
||||
elif self.engine == "docling":
|
||||
loader = DoclingLoader(
|
||||
url=self.kwargs.get("DOCLING_SERVER_URL"),
|
||||
file_path=file_path,
|
||||
mime_type=file_content_type,
|
||||
)
|
||||
else:
|
||||
if file_ext == "pdf":
|
||||
loader = PyPDFLoader(
|
||||
|
||||
@@ -351,6 +351,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
|
||||
"content_extraction": {
|
||||
"engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
|
||||
"tika_server_url": request.app.state.config.TIKA_SERVER_URL,
|
||||
"docling_server_url": request.app.state.config.DOCLING_SERVER_URL,
|
||||
},
|
||||
"chunk": {
|
||||
"text_splitter": request.app.state.config.TEXT_SPLITTER,
|
||||
@@ -403,6 +404,7 @@ class FileConfig(BaseModel):
|
||||
class ContentExtractionConfig(BaseModel):
|
||||
engine: str = ""
|
||||
tika_server_url: Optional[str] = None
|
||||
docling_server_url: Optional[str] = None
|
||||
|
||||
|
||||
class ChunkParamUpdateForm(BaseModel):
|
||||
@@ -483,6 +485,9 @@ async def update_rag_config(
|
||||
request.app.state.config.TIKA_SERVER_URL = (
|
||||
form_data.content_extraction.tika_server_url
|
||||
)
|
||||
request.app.state.config.DOCLING_SERVER_URL = (
|
||||
form_data.content_extraction.docling_server_url
|
||||
)
|
||||
|
||||
if form_data.chunk is not None:
|
||||
request.app.state.config.TEXT_SPLITTER = form_data.chunk.text_splitter
|
||||
@@ -559,6 +564,7 @@ async def update_rag_config(
|
||||
"content_extraction": {
|
||||
"engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
|
||||
"tika_server_url": request.app.state.config.TIKA_SERVER_URL,
|
||||
"docling_server_url": request.app.state.config.DOCLING_SERVER_URL,
|
||||
},
|
||||
"chunk": {
|
||||
"text_splitter": request.app.state.config.TEXT_SPLITTER,
|
||||
@@ -879,6 +885,7 @@ def process_file(
|
||||
loader = Loader(
|
||||
engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE,
|
||||
TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL,
|
||||
DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL,
|
||||
PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES,
|
||||
)
|
||||
docs = loader.load(
|
||||
|
||||
Reference in New Issue
Block a user