enh: token text splitter support

This commit is contained in:
Timothy J. Baek 2024-10-13 04:24:13 -07:00
parent 8a0da6d376
commit 586e005f0f
3 changed files with 32 additions and 7 deletions

View File

@ -392,18 +392,19 @@ async def get_rag_config(user=Depends(get_admin_user)):
return {
"status": True,
"pdf_extract_images": app.state.config.PDF_EXTRACT_IMAGES,
"file": {
"max_size": app.state.config.FILE_MAX_SIZE,
"max_count": app.state.config.FILE_MAX_COUNT,
},
"content_extraction": {
"engine": app.state.config.CONTENT_EXTRACTION_ENGINE,
"tika_server_url": app.state.config.TIKA_SERVER_URL,
},
"chunk": {
"text_splitter": app.state.config.TEXT_SPLITTER,
"chunk_size": app.state.config.CHUNK_SIZE,
"chunk_overlap": app.state.config.CHUNK_OVERLAP,
},
"file": {
"max_size": app.state.config.FILE_MAX_SIZE,
"max_count": app.state.config.FILE_MAX_COUNT,
},
"youtube": {
"language": app.state.config.YOUTUBE_LOADER_LANGUAGE,
"translation": app.state.YOUTUBE_LOADER_TRANSLATION,
@ -442,6 +443,7 @@ class ContentExtractionConfig(BaseModel):
# Request model carrying the chunking parameters of a RAG configuration update.
# Presumably this is the type of `form_data.chunk` in update_rag_config, which
# copies text_splitter / chunk_size / chunk_overlap onto app.state.config
# (TEXT_SPLITTER / CHUNK_SIZE / CHUNK_OVERLAP) -- confirm against ConfigUpdateForm.
class ChunkParamUpdateForm(BaseModel):
# Selects the text splitter; optional, defaults to None when the client omits it.
# The settings UI in this commit sends "" for "Default (Character)" and "token"
# for "Token (Tiktoken)".
text_splitter: Optional[str] = None
# Required chunk size. NOTE(review): the unit is not visible here and may
# depend on the selected splitter -- confirm.
chunk_size: int
# Required overlap between consecutive chunks; same unit caveat as chunk_size.
chunk_overlap: int
@ -501,6 +503,7 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
app.state.config.TIKA_SERVER_URL = form_data.content_extraction.tika_server_url
if form_data.chunk is not None:
app.state.config.TEXT_SPLITTER = form_data.chunk.text_splitter
app.state.config.CHUNK_SIZE = form_data.chunk.chunk_size
app.state.config.CHUNK_OVERLAP = form_data.chunk.chunk_overlap
@ -547,6 +550,7 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
"tika_server_url": app.state.config.TIKA_SERVER_URL,
},
"chunk": {
"text_splitter": app.state.config.TEXT_SPLITTER,
"chunk_size": app.state.config.CHUNK_SIZE,
"chunk_overlap": app.state.config.CHUNK_OVERLAP,
},
@ -607,11 +611,10 @@ class QuerySettingsForm(BaseModel):
async def update_query_settings(
form_data: QuerySettingsForm, user=Depends(get_admin_user)
):
app.state.config.RAG_TEMPLATE = (
form_data.template if form_data.template != "" else DEFAULT_RAG_TEMPLATE
)
app.state.config.RAG_TEMPLATE = form_data.template
app.state.config.TOP_K = form_data.k if form_data.k else 4
app.state.config.RELEVANCE_THRESHOLD = form_data.r if form_data.r else 0.0
app.state.config.ENABLE_RAG_HYBRID_SEARCH = (
form_data.hybrid if form_data.hybrid else False
)

View File

@ -19,6 +19,7 @@ from open_webui.apps.retrieval.vector.connector import VECTOR_DB_CLIENT
from open_webui.utils.misc import get_last_user_message
from open_webui.env import SRC_LOG_LEVELS
from open_webui.config import DEFAULT_RAG_TEMPLATE
log = logging.getLogger(__name__)
@ -239,6 +240,9 @@ def query_collection_with_hybrid_search(
def rag_template(template: str, context: str, query: str):
if template == "":
template = DEFAULT_RAG_TEMPLATE
if "[context]" not in template and "{{CONTEXT}}" not in template:
log.debug(
"WARNING: The RAG template does not contain the '[context]' or '{{CONTEXT}}' placeholder."

View File

@ -27,6 +27,7 @@
import SensitiveInput from '$lib/components/common/SensitiveInput.svelte';
import Tooltip from '$lib/components/common/Tooltip.svelte';
import Switch from '$lib/components/common/Switch.svelte';
import { text } from '@sveltejs/kit';
const i18n = getContext('i18n');
@ -49,6 +50,7 @@
let tikaServerUrl = '';
let showTikaServerUrl = false;
let textSplitter = '';
let chunkSize = 0;
let chunkOverlap = 0;
let pdfExtractImages = true;
@ -178,6 +180,7 @@
max_count: fileMaxCount === '' ? null : fileMaxCount
},
chunk: {
text_splitter: textSplitter,
chunk_overlap: chunkOverlap,
chunk_size: chunkSize
},
@ -223,11 +226,13 @@
await setRerankingConfig();
querySettings = await getQuerySettings(localStorage.token);
const res = await getRAGConfig(localStorage.token);
if (res) {
pdfExtractImages = res.pdf_extract_images;
textSplitter = res.chunk.text_splitter;
chunkSize = res.chunk.chunk_size;
chunkOverlap = res.chunk.chunk_overlap;
@ -639,6 +644,19 @@
<div class=" ">
<div class="mb-1 text-sm font-medium">{$i18n.t('Chunk Params')}</div>
<div class="flex w-full justify-between mb-1.5">
<div class="self-center text-xs font-medium">{$i18n.t('Text Splitter')}</div>
<div class="flex items-center relative">
<select
class="dark:bg-gray-900 w-fit pr-8 rounded px-2 text-xs bg-transparent outline-none text-right"
bind:value={textSplitter}
>
<option value="">{$i18n.t('Default (Character)')} </option>
<option value="token">{$i18n.t('Token (Tiktoken)')}</option>
</select>
</div>
</div>
<div class=" flex gap-1.5">
<div class=" w-full justify-between">
<div class="self-center text-xs font-medium min-w-fit mb-1">{$i18n.t('Chunk Size')}</div>