mirror of
				https://github.com/open-webui/open-webui
				synced 2025-06-26 18:26:48 +00:00 
			
		
		
		
	Added HTML and TypeScript UI components to support configuration of the text extraction engine.
Updated RAG /config and /config/update endpoints to support UI updates. Fixed .dockerignore to prevent Python venv from being copied into Docker image.
This commit is contained in:
		
							parent
							
								
									9cf622d981
								
							
						
					
					
						commit
						7aa35a3757
					
				| @ -11,6 +11,7 @@ vite.config.js.timestamp-* | ||||
| vite.config.ts.timestamp-* | ||||
| __pycache__ | ||||
| .idea | ||||
| venv | ||||
| _old | ||||
| uploads | ||||
| .ipynb_checkpoints | ||||
|  | ||||
| @ -93,7 +93,7 @@ from config import ( | ||||
|     SRC_LOG_LEVELS, | ||||
|     UPLOAD_DIR, | ||||
|     DOCS_DIR, | ||||
|     DOCUMENT_USE_TIKA, | ||||
|     TEXT_EXTRACTION_ENGINE, | ||||
|     TIKA_SERVER_URL, | ||||
|     RAG_TOP_K, | ||||
|     RAG_RELEVANCE_THRESHOLD, | ||||
| @ -150,6 +150,9 @@ app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = ( | ||||
|     ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION | ||||
| ) | ||||
| 
 | ||||
| app.state.config.TEXT_EXTRACTION_ENGINE = TEXT_EXTRACTION_ENGINE | ||||
| app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL | ||||
| 
 | ||||
| app.state.config.CHUNK_SIZE = CHUNK_SIZE | ||||
| app.state.config.CHUNK_OVERLAP = CHUNK_OVERLAP | ||||
| 
 | ||||
| @ -390,6 +393,10 @@ async def get_rag_config(user=Depends(get_admin_user)): | ||||
|     return { | ||||
|         "status": True, | ||||
|         "pdf_extract_images": app.state.config.PDF_EXTRACT_IMAGES, | ||||
|         "text_extraction": { | ||||
|             "engine": app.state.config.TEXT_EXTRACTION_ENGINE, | ||||
|             "tika_server_url": app.state.config.TIKA_SERVER_URL, | ||||
|         }, | ||||
|         "chunk": { | ||||
|             "chunk_size": app.state.config.CHUNK_SIZE, | ||||
|             "chunk_overlap": app.state.config.CHUNK_OVERLAP, | ||||
| @ -419,6 +426,11 @@ async def get_rag_config(user=Depends(get_admin_user)): | ||||
|     } | ||||
| 
 | ||||
| 
 | ||||
| class TextExtractionConfig(BaseModel): | ||||
|     engine: str = "" | ||||
|     tika_server_url: Optional[str] = None | ||||
| 
 | ||||
| 
 | ||||
| class ChunkParamUpdateForm(BaseModel): | ||||
|     chunk_size: int | ||||
|     chunk_overlap: int | ||||
| @ -452,6 +464,7 @@ class WebConfig(BaseModel): | ||||
| 
 | ||||
| class ConfigUpdateForm(BaseModel): | ||||
|     pdf_extract_images: Optional[bool] = None | ||||
|     text_extraction: Optional[TextExtractionConfig] = None | ||||
|     chunk: Optional[ChunkParamUpdateForm] = None | ||||
|     youtube: Optional[YoutubeLoaderConfig] = None | ||||
|     web: Optional[WebConfig] = None | ||||
| @ -465,6 +478,11 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_ | ||||
|         else app.state.config.PDF_EXTRACT_IMAGES | ||||
|     ) | ||||
| 
 | ||||
|     if form_data.text_extraction is not None: | ||||
|         log.info(f"Updating text settings: {form_data.text_extraction}") | ||||
|         app.state.config.TEXT_EXTRACTION_ENGINE = form_data.text_extraction.engine | ||||
|         app.state.config.TIKA_SERVER_URL = form_data.text_extraction.tika_server_url | ||||
| 
 | ||||
|     if form_data.chunk is not None: | ||||
|         app.state.config.CHUNK_SIZE = form_data.chunk.chunk_size | ||||
|         app.state.config.CHUNK_OVERLAP = form_data.chunk.chunk_overlap | ||||
| @ -501,6 +519,10 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_ | ||||
|     return { | ||||
|         "status": True, | ||||
|         "pdf_extract_images": app.state.config.PDF_EXTRACT_IMAGES, | ||||
|         "text_extraction": { | ||||
|             "engine": app.state.config.TEXT_EXTRACTION_ENGINE, | ||||
|             "tika_server_url": app.state.config.TIKA_SERVER_URL, | ||||
|         }, | ||||
|         "chunk": { | ||||
|             "chunk_size": app.state.config.CHUNK_SIZE, | ||||
|             "chunk_overlap": app.state.config.CHUNK_OVERLAP, | ||||
| @ -1001,7 +1023,7 @@ class TikaLoader: | ||||
|         else: | ||||
|             headers = {} | ||||
| 
 | ||||
|         endpoint = str(TIKA_SERVER_URL) | ||||
|         endpoint = app.state.config.TIKA_SERVER_URL | ||||
|         if not endpoint.endswith("/"): | ||||
|             endpoint += "/" | ||||
|         endpoint += "tika/text" | ||||
| @ -1072,9 +1094,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str): | ||||
|         "msg", | ||||
|     ] | ||||
| 
 | ||||
|     log.warning("Use tika: %s, server URL: %s", DOCUMENT_USE_TIKA, TIKA_SERVER_URL) | ||||
| 
 | ||||
|     if DOCUMENT_USE_TIKA and TIKA_SERVER_URL: | ||||
|     if app.state.config.TEXT_EXTRACTION_ENGINE == "tika" and app.state.config.TIKA_SERVER_URL: | ||||
|         if file_ext in known_source_ext or ( | ||||
|                 file_content_type and file_content_type.find("text/") >= 0 | ||||
|         ): | ||||
|  | ||||
| @ -882,10 +882,10 @@ if WEBUI_AUTH and WEBUI_SECRET_KEY == "": | ||||
| # RAG document text extraction | ||||
| #################################### | ||||
| 
 | ||||
| DOCUMENT_USE_TIKA = PersistentConfig( | ||||
|     "DOCUMENT_USE_TIKA", | ||||
|     "rag.document_use_tika", | ||||
|     os.environ.get("DOCUMENT_USE_TIKA", "false").lower() == "true" | ||||
| TEXT_EXTRACTION_ENGINE = PersistentConfig( | ||||
|     "TEXT_EXTRACTION_ENGINE", | ||||
|     "rag.text_extraction_engine", | ||||
|     os.environ.get("TEXT_EXTRACTION_ENGINE", "").lower() | ||||
| ) | ||||
| 
 | ||||
| TIKA_SERVER_URL = PersistentConfig( | ||||
|  | ||||
| @ -32,6 +32,11 @@ type ChunkConfigForm = { | ||||
| 	chunk_overlap: number; | ||||
| }; | ||||
| 
 | ||||
| type TextExtractConfigForm = { | ||||
| 	engine: string; | ||||
| 	tika_server_url: string | null; | ||||
| }; | ||||
| 
 | ||||
| type YoutubeConfigForm = { | ||||
| 	language: string[]; | ||||
| 	translation?: string | null; | ||||
| @ -40,6 +45,7 @@ type YoutubeConfigForm = { | ||||
| type RAGConfigForm = { | ||||
| 	pdf_extract_images?: boolean; | ||||
| 	chunk?: ChunkConfigForm; | ||||
| 	text_extraction?: TextExtractConfigForm; | ||||
| 	web_loader_ssl_verification?: boolean; | ||||
| 	youtube?: YoutubeConfigForm; | ||||
| }; | ||||
|  | ||||
| @ -37,6 +37,10 @@ | ||||
| 	let embeddingModel = ''; | ||||
| 	let rerankingModel = ''; | ||||
| 
 | ||||
| 	let textExtractionEngine = 'default'; | ||||
| 	let tikaServerUrl = ''; | ||||
| 	let showTikaServerUrl = false; | ||||
| 
 | ||||
| 	let chunkSize = 0; | ||||
| 	let chunkOverlap = 0; | ||||
| 	let pdfExtractImages = true; | ||||
| @ -163,11 +167,20 @@ | ||||
| 			rerankingModelUpdateHandler(); | ||||
| 		} | ||||
| 
 | ||||
| 		if (textExtractionEngine === 'tika' && tikaServerUrl === '') { | ||||
| 			toast.error($i18n.t('Tika Server URL required.')); | ||||
| 			return; | ||||
| 		} | ||||
| 
 | ||||
| 		const res = await updateRAGConfig(localStorage.token, { | ||||
| 			pdf_extract_images: pdfExtractImages, | ||||
| 			chunk: { | ||||
| 				chunk_overlap: chunkOverlap, | ||||
| 				chunk_size: chunkSize | ||||
| 			}, | ||||
| 			text_extraction: { | ||||
| 				engine: textExtractionEngine, | ||||
| 				tika_server_url: tikaServerUrl | ||||
| 			} | ||||
| 		}); | ||||
| 
 | ||||
| @ -213,6 +226,10 @@ | ||||
| 
 | ||||
| 			chunkSize = res.chunk.chunk_size; | ||||
| 			chunkOverlap = res.chunk.chunk_overlap; | ||||
| 
 | ||||
| 			textExtractionEngine = res.text_extraction.engine; | ||||
| 			tikaServerUrl = res.text_extraction.tika_server_url; | ||||
| 			showTikaServerUrl = textExtractionEngine === 'tika'; | ||||
| 		} | ||||
| 	}); | ||||
| </script> | ||||
| @ -388,6 +405,39 @@ | ||||
| 			</div> | ||||
| 		</div> | ||||
| 
 | ||||
| 		<hr class="dark:border-gray-850" /> | ||||
| 
 | ||||
| 		<div class=""> | ||||
| 			<div class="text-sm font-medium">{$i18n.t('Text Extraction')}</div> | ||||
| 
 | ||||
| 			<div class="flex w-full justify-between mt-2"> | ||||
| 				<div class="self-center text-xs font-medium">{$i18n.t('Engine')}</div> | ||||
| 				<div class="flex items-center relative"> | ||||
| 					<select | ||||
| 							class="dark:bg-gray-900 w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right" | ||||
| 							bind:value={textExtractionEngine} | ||||
| 							on:change={(e) => { | ||||
| 								showTikaServerUrl = (e.target.value === 'tika'); | ||||
| 							}} | ||||
| 					> | ||||
| 						<option value="default">{$i18n.t('Default')}</option> | ||||
| 						<option value="tika">{$i18n.t('Tika')}</option> | ||||
| 					</select> | ||||
| 				</div> | ||||
| 			</div> | ||||
| 
 | ||||
| 			{#if showTikaServerUrl} | ||||
| 				<div class="flex w-full mt-2"> | ||||
| 					<div class="flex-1 mr-2"> | ||||
| 						<input | ||||
| 								class="w-full rounded-lg py-2 px-4 text-sm dark:text-gray-300 dark:bg-gray-850 outline-none" | ||||
| 								placeholder={$i18n.t('Enter Tika Server URL')} | ||||
| 								bind:value={tikaServerUrl} | ||||
| 						/> | ||||
| 					</div> | ||||
| 				</div> | ||||
| 			{/if} | ||||
| 		</div> | ||||
| 		<hr class=" dark:border-gray-850 my-1" /> | ||||
| 
 | ||||
| 		<div class="space-y-2" /> | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user