refac: docling picture describe params

This commit is contained in:
Timothy Jaeryang Baek 2025-06-08 20:02:14 +04:00
parent b9c64d0936
commit 0cd400f5ee
5 changed files with 144 additions and 279 deletions

View File

@ -1264,9 +1264,7 @@ def validate_cors_origin(origin):
# To test CORS_ALLOW_ORIGIN locally, you can set something like # To test CORS_ALLOW_ORIGIN locally, you can set something like
# CORS_ALLOW_ORIGIN=http://localhost:5173;http://localhost:8080 # CORS_ALLOW_ORIGIN=http://localhost:5173;http://localhost:8080
# in your .env file depending on your frontend port, 5173 in this case. # in your .env file depending on your frontend port, 5173 in this case.
CORS_ALLOW_ORIGIN = os.environ.get( CORS_ALLOW_ORIGIN = os.environ.get("CORS_ALLOW_ORIGIN", "*").split(";")
"CORS_ALLOW_ORIGIN", "*"
).split(";")
if CORS_ALLOW_ORIGIN == ["*"]: if CORS_ALLOW_ORIGIN == ["*"]:
log.warning( log.warning(
@ -1278,6 +1276,7 @@ else:
for origin in CORS_ALLOW_ORIGIN: for origin in CORS_ALLOW_ORIGIN:
validate_cors_origin(origin) validate_cors_origin(origin)
class BannerModel(BaseModel): class BannerModel(BaseModel):
id: str id: str
type: str type: str
@ -1974,47 +1973,33 @@ DOCLING_PICTURE_DESCRIPTION_MODE = PersistentConfig(
os.getenv("DOCLING_PICTURE_DESCRIPTION_MODE", ""), os.getenv("DOCLING_PICTURE_DESCRIPTION_MODE", ""),
) )
DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID = PersistentConfig(
"DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID", docling_picture_description_local = os.getenv("DOCLING_PICTURE_DESCRIPTION_LOCAL", "")
"rag.docling_picture_description_local_repo_id", try:
os.getenv("DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID", "HuggingFaceTB/SmolVLM-256M-Instruct"), docling_picture_description_local = json.loads(docling_picture_description_local)
except json.JSONDecodeError:
docling_picture_description_local = {}
DOCLING_PICTURE_DESCRIPTION_LOCAL = PersistentConfig(
"DOCLING_PICTURE_DESCRIPTION_LOCAL",
"rag.docling_picture_description_local",
docling_picture_description_local,
) )
DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS = PersistentConfig( doclign_picture_description_api = os.getenv("DOCLING_PICTURE_DESCRIPTION_API", "")
"DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS", try:
"rag.docling_picture_description_local_max_tokens", doclign_picture_description_api = json.loads(doclign_picture_description_api)
int(os.getenv("DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS", 200)), except json.JSONDecodeError:
doclign_picture_description_api = {}
DOCLING_PICTURE_DESCRIPTION_API = PersistentConfig(
"DOCLING_PICTURE_DESCRIPTION_API",
"rag.docling_picture_description_api",
doclign_picture_description_api,
) )
DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT = PersistentConfig(
"DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT",
"rag.docling_picture_description_local_prompt",
os.getenv(
"DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT",
"Describe this image in a few sentences.",
)
)
DOCLING_PICTURE_DESCRIPTION_API_URL = PersistentConfig(
"DOCLING_PICTURE_DESCRIPTION_API_URL",
"rag.docling_picture_description_api_url",
os.getenv("DOCLING_PICTURE_DESCRIPTION_API_URL", ""),
)
DOCLING_PICTURE_DESCRIPTION_API_MODEL = PersistentConfig(
"DOCLING_PICTURE_DESCRIPTION_API_MODEL",
"rag.docling_picture_description_api_model",
os.getenv("DOCLING_PICTURE_DESCRIPTION_API_MODEL", ""),
)
DOCLING_PICTURE_DESCRIPTION_API_PROMPT = PersistentConfig(
"DOCLING_PICTURE_DESCRIPTION_API_PROMPT",
"rag.docling_picture_description_api_prompt",
os.getenv(
"DOCLING_PICTURE_DESCRIPTION_API_PROMPT",
"Describe this image in a few sentences.",
)
)
DOCUMENT_INTELLIGENCE_ENDPOINT = PersistentConfig( DOCUMENT_INTELLIGENCE_ENDPOINT = PersistentConfig(
"DOCUMENT_INTELLIGENCE_ENDPOINT", "DOCUMENT_INTELLIGENCE_ENDPOINT",

View File

@ -232,12 +232,8 @@ from open_webui.config import (
DOCLING_OCR_LANG, DOCLING_OCR_LANG,
DOCLING_DO_PICTURE_DESCRIPTION, DOCLING_DO_PICTURE_DESCRIPTION,
DOCLING_PICTURE_DESCRIPTION_MODE, DOCLING_PICTURE_DESCRIPTION_MODE,
DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID, DOCLING_PICTURE_DESCRIPTION_LOCAL,
DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS, DOCLING_PICTURE_DESCRIPTION_API,
DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT,
DOCLING_PICTURE_DESCRIPTION_API_URL,
DOCLING_PICTURE_DESCRIPTION_API_MODEL,
DOCLING_PICTURE_DESCRIPTION_API_PROMPT,
DOCUMENT_INTELLIGENCE_ENDPOINT, DOCUMENT_INTELLIGENCE_ENDPOINT,
DOCUMENT_INTELLIGENCE_KEY, DOCUMENT_INTELLIGENCE_KEY,
MISTRAL_OCR_API_KEY, MISTRAL_OCR_API_KEY,
@ -709,12 +705,8 @@ app.state.config.DOCLING_OCR_ENGINE = DOCLING_OCR_ENGINE
app.state.config.DOCLING_OCR_LANG = DOCLING_OCR_LANG app.state.config.DOCLING_OCR_LANG = DOCLING_OCR_LANG
app.state.config.DOCLING_DO_PICTURE_DESCRIPTION = DOCLING_DO_PICTURE_DESCRIPTION app.state.config.DOCLING_DO_PICTURE_DESCRIPTION = DOCLING_DO_PICTURE_DESCRIPTION
app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE = DOCLING_PICTURE_DESCRIPTION_MODE app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE = DOCLING_PICTURE_DESCRIPTION_MODE
app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID = DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL = DOCLING_PICTURE_DESCRIPTION_LOCAL
app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS = DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS app.state.config.DOCLING_PICTURE_DESCRIPTION_API = DOCLING_PICTURE_DESCRIPTION_API
app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT = DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT
app.state.config.DOCLING_PICTURE_DESCRIPTION_API_URL = DOCLING_PICTURE_DESCRIPTION_API_URL
app.state.config.DOCLING_PICTURE_DESCRIPTION_API_MODEL = DOCLING_PICTURE_DESCRIPTION_API_MODEL
app.state.config.DOCLING_PICTURE_DESCRIPTION_API_PROMPT = DOCLING_PICTURE_DESCRIPTION_API_PROMPT
app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT
app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY
app.state.config.MISTRAL_OCR_API_KEY = MISTRAL_OCR_API_KEY app.state.config.MISTRAL_OCR_API_KEY = MISTRAL_OCR_API_KEY

View File

@ -155,40 +155,23 @@ class DoclingLoader:
"do_picture_description" "do_picture_description"
) )
picture_description_mode = self.params.get("picture_description_mode", "").lower() picture_description_mode = self.params.get(
"picture_description_mode", ""
).lower()
if picture_description_mode == "local": if picture_description_mode == "local" and self.params.get(
"picture_description_local", {}
):
params["picture_description_local"] = self.params.get(
"picture_description_local", {}
)
params["picture_description_local"] = json.dumps({ elif picture_description_mode == "api" and self.params.get(
"repo_id": self.params.get( "picture_description_api", {}
"picture_description_local_repo_id", "HuggingFaceTB/SmolVLM-256M-Instruct" ):
), params["picture_description_api"] = self.params.get(
"generation_config": { "picture_description_api", {}
"max_new_tokens": self.params.get( )
"picture_description_local_max_tokens", 200
)
},
"prompt": self.params.get(
"picture_description_local_prompt", "Describe this image in a few sentences."
)
})
elif picture_description_mode == "api":
params["picture_description_api"] = json.dumps({
"url": self.params.get(
"picture_description_api_url", ""
),
"params": {
"model": self.params.get(
"picture_description_api_model", ""
)
},
"timeout": 30,
"prompt": self.params.get(
"picture_description_api_prompt", "Describe this image in a few sentences."
)
})
if self.params.get("ocr_engine") and self.params.get("ocr_lang"): if self.params.get("ocr_engine") and self.params.get("ocr_lang"):
params["ocr_engine"] = self.params.get("ocr_engine") params["ocr_engine"] = self.params.get("ocr_engine")
@ -318,24 +301,19 @@ class Loader:
loader = TextLoader(file_path, autodetect_encoding=True) loader = TextLoader(file_path, autodetect_encoding=True)
else: else:
# Build params for DoclingLoader # Build params for DoclingLoader
params = { params = self.kwargs.get("DOCLING_PARAMS", {})
"ocr_engine": self.kwargs.get("DOCLING_OCR_ENGINE"), if not isinstance(params, dict):
"ocr_lang": self.kwargs.get("DOCLING_OCR_LANG"), try:
"do_picture_description": self.kwargs.get("DOCLING_DO_PICTURE_DESCRIPTION"), params = json.loads(params)
"picture_description_mode": self.kwargs.get("DOCLING_PICTURE_DESCRIPTION_MODE"), except json.JSONDecodeError:
"picture_description_local_repo_id": self.kwargs.get("DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID"), log.error("Invalid DOCLING_PARAMS format, expected JSON object")
"picture_description_local_max_tokens": self.kwargs.get("DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS"), params = {}
"picture_description_local_prompt": self.kwargs.get("DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT"),
"picture_description_api_url": self.kwargs.get("DOCLING_PICTURE_DESCRIPTION_API_URL"),
"picture_description_api_model": self.kwargs.get("DOCLING_PICTURE_DESCRIPTION_API_MODEL"),
"picture_description_api_prompt": self.kwargs.get("DOCLING_PICTURE_DESCRIPTION_API_PROMPT")
}
loader = DoclingLoader( loader = DoclingLoader(
url=self.kwargs.get("DOCLING_SERVER_URL"), url=self.kwargs.get("DOCLING_SERVER_URL"),
file_path=file_path, file_path=file_path,
mime_type=file_content_type, mime_type=file_content_type,
params=params params=params,
) )
elif ( elif (
self.engine == "document_intelligence" self.engine == "document_intelligence"

View File

@ -415,12 +415,8 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
"DOCLING_OCR_LANG": request.app.state.config.DOCLING_OCR_LANG, "DOCLING_OCR_LANG": request.app.state.config.DOCLING_OCR_LANG,
"DOCLING_DO_PICTURE_DESCRIPTION": request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION, "DOCLING_DO_PICTURE_DESCRIPTION": request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION,
"DOCLING_PICTURE_DESCRIPTION_MODE": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE, "DOCLING_PICTURE_DESCRIPTION_MODE": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE,
"DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID, "DOCLING_PICTURE_DESCRIPTION_LOCAL": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL,
"DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS, "DOCLING_PICTURE_DESCRIPTION_API": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API,
"DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT,
"DOCLING_PICTURE_DESCRIPTION_API_URL": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_URL,
"DOCLING_PICTURE_DESCRIPTION_API_MODEL": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_MODEL,
"DOCLING_PICTURE_DESCRIPTION_API_PROMPT": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_PROMPT,
"DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, "DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
"DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, "DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
"MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY, "MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY,
@ -583,12 +579,8 @@ class ConfigForm(BaseModel):
DOCLING_OCR_LANG: Optional[str] = None DOCLING_OCR_LANG: Optional[str] = None
DOCLING_DO_PICTURE_DESCRIPTION: Optional[bool] = None DOCLING_DO_PICTURE_DESCRIPTION: Optional[bool] = None
DOCLING_PICTURE_DESCRIPTION_MODE: Optional[str] = None DOCLING_PICTURE_DESCRIPTION_MODE: Optional[str] = None
DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID: Optional[str] = None DOCLING_PICTURE_DESCRIPTION_LOCAL: Optional[dict] = None
DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS: Optional[int] = None DOCLING_PICTURE_DESCRIPTION_API: Optional[dict] = None
DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT: Optional[str] = None
DOCLING_PICTURE_DESCRIPTION_API_URL: Optional[str] = None
DOCLING_PICTURE_DESCRIPTION_API_MODEL: Optional[str] = None
DOCLING_PICTURE_DESCRIPTION_API_PROMPT: Optional[str] = None
DOCUMENT_INTELLIGENCE_ENDPOINT: Optional[str] = None DOCUMENT_INTELLIGENCE_ENDPOINT: Optional[str] = None
DOCUMENT_INTELLIGENCE_KEY: Optional[str] = None DOCUMENT_INTELLIGENCE_KEY: Optional[str] = None
MISTRAL_OCR_API_KEY: Optional[str] = None MISTRAL_OCR_API_KEY: Optional[str] = None
@ -767,35 +759,15 @@ async def update_rag_config(
if form_data.DOCLING_PICTURE_DESCRIPTION_MODE is not None if form_data.DOCLING_PICTURE_DESCRIPTION_MODE is not None
else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE
) )
request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID = ( request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL = (
form_data.DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID form_data.DOCLING_PICTURE_DESCRIPTION_LOCAL
if form_data.DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID is not None if form_data.DOCLING_PICTURE_DESCRIPTION_LOCAL is not None
else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL
) )
request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS = ( request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API = (
form_data.DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS form_data.DOCLING_PICTURE_DESCRIPTION_API
if form_data.DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS is not None if form_data.DOCLING_PICTURE_DESCRIPTION_API is not None
else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API
)
request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT = (
form_data.DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT
if form_data.DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT is not None
else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT
)
request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_URL = (
form_data.DOCLING_PICTURE_DESCRIPTION_API_URL
if form_data.DOCLING_PICTURE_DESCRIPTION_API_URL is not None
else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_URL
)
request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_MODEL = (
form_data.DOCLING_PICTURE_DESCRIPTION_API_MODEL
if form_data.DOCLING_PICTURE_DESCRIPTION_API_MODEL is not None
else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_MODEL
)
request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_PROMPT = (
form_data.DOCLING_PICTURE_DESCRIPTION_API_PROMPT
if form_data.DOCLING_PICTURE_DESCRIPTION_API_PROMPT is not None
else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_PROMPT
) )
request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = ( request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = (
@ -1036,12 +1008,8 @@ async def update_rag_config(
"DOCLING_OCR_LANG": request.app.state.config.DOCLING_OCR_LANG, "DOCLING_OCR_LANG": request.app.state.config.DOCLING_OCR_LANG,
"DOCLING_DO_PICTURE_DESCRIPTION": request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION, "DOCLING_DO_PICTURE_DESCRIPTION": request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION,
"DOCLING_PICTURE_DESCRIPTION_MODE": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE, "DOCLING_PICTURE_DESCRIPTION_MODE": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE,
"DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID, "DOCLING_PICTURE_DESCRIPTION_LOCAL": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL,
"DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS, "DOCLING_PICTURE_DESCRIPTION_API": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API,
"DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT,
"DOCLING_PICTURE_DESCRIPTION_API_URL": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_URL,
"DOCLING_PICTURE_DESCRIPTION_API_MODEL": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_MODEL,
"DOCLING_PICTURE_DESCRIPTION_API_PROMPT": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_PROMPT,
"DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, "DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
"DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, "DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
"MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY, "MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY,
@ -1388,16 +1356,14 @@ def process_file(
EXTERNAL_DOCUMENT_LOADER_API_KEY=request.app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY, EXTERNAL_DOCUMENT_LOADER_API_KEY=request.app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY,
TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL, TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL,
DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL, DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL,
DOCLING_OCR_ENGINE=request.app.state.config.DOCLING_OCR_ENGINE, DOCLING_PARAMS={
DOCLING_OCR_LANG=request.app.state.config.DOCLING_OCR_LANG, "ocr_engine": request.app.state.config.DOCLING_OCR_ENGINE,
DOCLING_DO_PICTURE_DESCRIPTION=request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION, "ocr_lang": request.app.state.config.DOCLING_OCR_LANG,
DOCLING_PICTURE_DESCRIPTION_MODE=request.app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE, "do_picture_description": request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION,
DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID=request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID, "picture_description_mode": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE,
DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS=request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS, "picture_description_local": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL,
DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT=request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT, "picture_description_api": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API,
DOCLING_PICTURE_DESCRIPTION_API_URL=request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_URL, },
DOCLING_PICTURE_DESCRIPTION_API_MODEL=request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_MODEL,
DOCLING_PICTURE_DESCRIPTION_API_PROMPT=request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_PROMPT,
PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES, PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES,
DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,

View File

@ -194,17 +194,20 @@
await embeddingModelUpdateHandler(); await embeddingModelUpdateHandler();
} }
RAGConfig.ALLOWED_FILE_EXTENSIONS = (RAGConfig?.ALLOWED_FILE_EXTENSIONS ?? '') const res = await updateRAGConfig(localStorage.token, {
.split(',') ...RAGConfig,
.map((ext) => ext.trim()) ALLOWED_FILE_EXTENSIONS: RAGConfig.ALLOWED_FILE_EXTENSIONS.split(',')
.filter((ext) => ext !== ''); .map((ext) => ext.trim())
.filter((ext) => ext !== ''),
RAGConfig.DATALAB_MARKER_LANGS = RAGConfig.DATALAB_MARKER_LANGS.split(',') DATALAB_MARKER_LANGS: RAGConfig.DATALAB_MARKER_LANGS.split(',')
.map((code) => code.trim()) .map((code) => code.trim())
.filter((code) => code !== '') .filter((code) => code !== '')
.join(', '); .join(', '),
DOCLING_PICTURE_DESCRIPTION_LOCAL: JSON.parse(
const res = await updateRAGConfig(localStorage.token, RAGConfig); RAGConfig.DOCLING_PICTURE_DESCRIPTION_LOCAL || '{}'
),
DOCLING_PICTURE_DESCRIPTION_API: JSON.parse(RAGConfig.DOCLING_PICTURE_DESCRIPTION_API || '{}')
});
dispatch('save'); dispatch('save');
}; };
@ -232,6 +235,18 @@
const config = await getRAGConfig(localStorage.token); const config = await getRAGConfig(localStorage.token);
config.ALLOWED_FILE_EXTENSIONS = (config?.ALLOWED_FILE_EXTENSIONS ?? []).join(', '); config.ALLOWED_FILE_EXTENSIONS = (config?.ALLOWED_FILE_EXTENSIONS ?? []).join(', ');
config.DOCLING_PICTURE_DESCRIPTION_LOCAL = JSON.stringify(
config.DOCLING_PICTURE_DESCRIPTION_LOCAL ?? {},
null,
2
);
config.DOCLING_PICTURE_DESCRIPTION_API = JSON.stringify(
config.DOCLING_PICTURE_DESCRIPTION_API ?? {},
null,
2
);
RAGConfig = config; RAGConfig = config;
}); });
</script> </script>
@ -511,135 +526,66 @@
</div> </div>
</div> </div>
{#if RAGConfig.DOCLING_DO_PICTURE_DESCRIPTION} {#if RAGConfig.DOCLING_DO_PICTURE_DESCRIPTION}
<div class="flex w-full mt-2"> <div class="flex justify-between w-full mt-2">
<div class="flex-1 flex items-center gap-4"> <div class="self-center text-xs font-medium">
<label class="flex items-center gap-1 text-xs font-medium"> <Tooltip content={''} placement="top-start">
<Tooltip {$i18n.t('Picture Description Mode')}
content={$i18n.t('Use a model locally executed by Docling for picture description.')} </Tooltip>
placement="top-start" </div>
> <div class="">
<input <select
type="radio" class="dark:bg-gray-900 w-fit pr-8 rounded-sm px-2 text-xs bg-transparent outline-hidden text-right"
name="picture-description-mode" bind:value={RAGConfig.DOCLING_PICTURE_DESCRIPTION_MODE}
value="local" >
bind:group={RAGConfig.DOCLING_PICTURE_DESCRIPTION_MODE} <option value="">{$i18n.t('Default')}</option>
checked={RAGConfig.DOCLING_PICTURE_DESCRIPTION_MODE === 'local'} <option value="local">{$i18n.t('Local')}</option>
/> <option value="api">{$i18n.t('API')}</option>
<span style="padding-left: 0.5em">{$i18n.t('Local Description')}</span> </select>
</Tooltip>
</label>
<label class="flex items-center gap-1 text-xs font-medium">
<Tooltip
content={$i18n.t('Use a remote API for picture description.')}
placement="top-start"
>
<input
type="radio"
name="picture-description-mode"
value="api"
bind:group={RAGConfig.DOCLING_PICTURE_DESCRIPTION_MODE}
checked={RAGConfig.DOCLING_PICTURE_DESCRIPTION_MODE === 'api'}
/>
<span style="padding-left: 0.5em">{$i18n.t('Remote Description')}</span>
</Tooltip>
</label>
</div> </div>
</div> </div>
{#if RAGConfig.DOCLING_PICTURE_DESCRIPTION_MODE === 'local'} {#if RAGConfig.DOCLING_PICTURE_DESCRIPTION_MODE === 'local'}
<div class="flex flex-col gap-2 mt-2 ml-4"> <div class="flex flex-col gap-2 mt-2">
<div class="flex items-center gap-2"> <div class=" flex flex-col w-full justify-between">
<div class="min-w-fit text-xs font-medium"> <div class=" mb-1 text-xs font-medium">
{$i18n.t('Picture Description Local Config')}
</div>
<div class="flex w-full items-center relative">
<Tooltip <Tooltip
content={$i18n.t('The HuggingFace repo ID for the local vision-language model.')} content={$i18n.t(
'Options for running a local vision-language model in the picture description. The parameters refer to a model hosted on Hugging Face. This parameter is mutually exclusive with picture_description_api.'
)}
placement="top-start" placement="top-start"
className="w-full"
> >
{$i18n.t('Repo ID')} <Textarea
bind:value={RAGConfig.DOCLING_PICTURE_DESCRIPTION_LOCAL}
placeholder={$i18n.t('Enter Options in JSON format')}
/>
</Tooltip> </Tooltip>
</div> </div>
<input
class=" w-full rounded-lg py-1.5 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
placeholder={$i18n.t('HuggingFaceTB/SmolVLM-256M-Instruct')}
bind:value={RAGConfig.DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID}
/>
</div>
<div class="flex items-center gap-2">
<div class="min-w-fit text-xs font-medium">
<Tooltip
content={$i18n.t('Maximum number of tokens for the generated description.')}
placement="top-start"
>
{$i18n.t('Max Tokens')}
</Tooltip>
</div>
<input
class=" w-full rounded-lg py-1.5 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
placeholder={$i18n.t('200')}
bind:value={RAGConfig.DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS}
/>
</div>
<div class="flex items-center gap-2">
<div class="min-w-fit text-xs font-medium">
<Tooltip
content={$i18n.t('Prompt to use for describing the image.')}
placement="top-start"
>
{$i18n.t('Prompt')}
</Tooltip>
</div>
<input
class=" w-full rounded-lg py-1.5 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
placeholder={$i18n.t('Describe this image in a few sentences.')}
bind:value={RAGConfig.DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT}
/>
</div> </div>
</div> </div>
{:else if RAGConfig.DOCLING_PICTURE_DESCRIPTION_MODE === 'api'} {:else if RAGConfig.DOCLING_PICTURE_DESCRIPTION_MODE === 'api'}
<div class="flex flex-col gap-2 mt-2 ml-4"> <div class="flex flex-col gap-2 mt-2">
<div class="flex items-center gap-2"> <div class=" flex flex-col w-full justify-between">
<div class="min-w-fit text-xs font-medium"> <div class=" mb-1 text-xs font-medium">
{$i18n.t('Picture Description API Config')}
</div>
<div class="flex w-full items-center relative">
<Tooltip <Tooltip
content={$i18n.t('The remote API endpoint for picture description.')} content={$i18n.t(
'API details for using a vision-language model in the picture description. This parameter is mutually exclusive with picture_description_local.'
)}
placement="top-start" placement="top-start"
className="w-full"
> >
{$i18n.t('URL')} <Textarea
bind:value={RAGConfig.DOCLING_PICTURE_DESCRIPTION_API}
placeholder={$i18n.t('Enter Options in JSON format')}
/>
</Tooltip> </Tooltip>
</div> </div>
<input
class=" w-full rounded-lg py-1.5 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
placeholder={$i18n.t('Enter Remote API URL')}
bind:value={RAGConfig.DOCLING_PICTURE_DESCRIPTION_API_URL}
/>
</div>
<div class="flex items-center gap-2">
<div class="min-w-fit text-xs font-medium">
<Tooltip
content={$i18n.t('The model name to use for remote picture description.')}
placement="top-start"
>
{$i18n.t('Model')}
</Tooltip>
</div>
<input
class=" w-full rounded-lg py-1.5 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
placeholder={$i18n.t('Enter Model Name')}
bind:value={RAGConfig.DOCLING_PICTURE_DESCRIPTION_API_MODEL}
/>
</div>
<div class="flex items-center gap-2">
<div class="min-w-fit text-xs font-medium">
<Tooltip
content={$i18n.t('Prompt to use for describing the image via remote API.')}
placement="top-start"
>
{$i18n.t('Prompt')}
</Tooltip>
</div>
<input
class=" w-full rounded-lg py-1.5 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
placeholder={$i18n.t('Describe this image in a few sentences.')}
bind:value={RAGConfig.DOCLING_PICTURE_DESCRIPTION_API_PROMPT}
/>
</div> </div>
</div> </div>
{/if} {/if}
@ -964,9 +910,7 @@
<div class=" mb-2.5 flex w-full justify-between"> <div class=" mb-2.5 flex w-full justify-between">
<div class=" self-center text-xs font-medium">{$i18n.t('Hybrid Search')}</div> <div class=" self-center text-xs font-medium">{$i18n.t('Hybrid Search')}</div>
<div class="flex items-center relative"> <div class="flex items-center relative">
<Switch <Switch bind:state={RAGConfig.ENABLE_RAG_HYBRID_SEARCH} />
bind:state={RAGConfig.ENABLE_RAG_HYBRID_SEARCH}
/>
</div> </div>
</div> </div>