enh: token text splitter support

This commit is contained in:
Timothy J. Baek 2024-10-13 04:24:13 -07:00
parent 8a0da6d376
commit 586e005f0f
3 changed files with 32 additions and 7 deletions

View File

@@ -392,18 +392,19 @@ async def get_rag_config(user=Depends(get_admin_user)):
return { return {
"status": True, "status": True,
"pdf_extract_images": app.state.config.PDF_EXTRACT_IMAGES, "pdf_extract_images": app.state.config.PDF_EXTRACT_IMAGES,
"file": {
"max_size": app.state.config.FILE_MAX_SIZE,
"max_count": app.state.config.FILE_MAX_COUNT,
},
"content_extraction": { "content_extraction": {
"engine": app.state.config.CONTENT_EXTRACTION_ENGINE, "engine": app.state.config.CONTENT_EXTRACTION_ENGINE,
"tika_server_url": app.state.config.TIKA_SERVER_URL, "tika_server_url": app.state.config.TIKA_SERVER_URL,
}, },
"chunk": { "chunk": {
"text_splitter": app.state.config.TEXT_SPLITTER,
"chunk_size": app.state.config.CHUNK_SIZE, "chunk_size": app.state.config.CHUNK_SIZE,
"chunk_overlap": app.state.config.CHUNK_OVERLAP, "chunk_overlap": app.state.config.CHUNK_OVERLAP,
}, },
"file": {
"max_size": app.state.config.FILE_MAX_SIZE,
"max_count": app.state.config.FILE_MAX_COUNT,
},
"youtube": { "youtube": {
"language": app.state.config.YOUTUBE_LOADER_LANGUAGE, "language": app.state.config.YOUTUBE_LOADER_LANGUAGE,
"translation": app.state.YOUTUBE_LOADER_TRANSLATION, "translation": app.state.YOUTUBE_LOADER_TRANSLATION,
@@ -442,6 +443,7 @@ class ContentExtractionConfig(BaseModel):
class ChunkParamUpdateForm(BaseModel): class ChunkParamUpdateForm(BaseModel):
text_splitter: Optional[str] = None
chunk_size: int chunk_size: int
chunk_overlap: int chunk_overlap: int
@@ -501,6 +503,7 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
app.state.config.TIKA_SERVER_URL = form_data.content_extraction.tika_server_url app.state.config.TIKA_SERVER_URL = form_data.content_extraction.tika_server_url
if form_data.chunk is not None: if form_data.chunk is not None:
app.state.config.TEXT_SPLITTER = form_data.chunk.text_splitter
app.state.config.CHUNK_SIZE = form_data.chunk.chunk_size app.state.config.CHUNK_SIZE = form_data.chunk.chunk_size
app.state.config.CHUNK_OVERLAP = form_data.chunk.chunk_overlap app.state.config.CHUNK_OVERLAP = form_data.chunk.chunk_overlap
@@ -547,6 +550,7 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
"tika_server_url": app.state.config.TIKA_SERVER_URL, "tika_server_url": app.state.config.TIKA_SERVER_URL,
}, },
"chunk": { "chunk": {
"text_splitter": app.state.config.TEXT_SPLITTER,
"chunk_size": app.state.config.CHUNK_SIZE, "chunk_size": app.state.config.CHUNK_SIZE,
"chunk_overlap": app.state.config.CHUNK_OVERLAP, "chunk_overlap": app.state.config.CHUNK_OVERLAP,
}, },
@@ -607,11 +611,10 @@ class QuerySettingsForm(BaseModel):
async def update_query_settings( async def update_query_settings(
form_data: QuerySettingsForm, user=Depends(get_admin_user) form_data: QuerySettingsForm, user=Depends(get_admin_user)
): ):
app.state.config.RAG_TEMPLATE = ( app.state.config.RAG_TEMPLATE = form_data.template
form_data.template if form_data.template != "" else DEFAULT_RAG_TEMPLATE
)
app.state.config.TOP_K = form_data.k if form_data.k else 4 app.state.config.TOP_K = form_data.k if form_data.k else 4
app.state.config.RELEVANCE_THRESHOLD = form_data.r if form_data.r else 0.0 app.state.config.RELEVANCE_THRESHOLD = form_data.r if form_data.r else 0.0
app.state.config.ENABLE_RAG_HYBRID_SEARCH = ( app.state.config.ENABLE_RAG_HYBRID_SEARCH = (
form_data.hybrid if form_data.hybrid else False form_data.hybrid if form_data.hybrid else False
) )

View File

@@ -19,6 +19,7 @@ from open_webui.apps.retrieval.vector.connector import VECTOR_DB_CLIENT
from open_webui.utils.misc import get_last_user_message from open_webui.utils.misc import get_last_user_message
from open_webui.env import SRC_LOG_LEVELS from open_webui.env import SRC_LOG_LEVELS
from open_webui.config import DEFAULT_RAG_TEMPLATE
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@@ -239,6 +240,9 @@ def query_collection_with_hybrid_search(
def rag_template(template: str, context: str, query: str): def rag_template(template: str, context: str, query: str):
if template == "":
template = DEFAULT_RAG_TEMPLATE
if "[context]" not in template and "{{CONTEXT}}" not in template: if "[context]" not in template and "{{CONTEXT}}" not in template:
log.debug( log.debug(
"WARNING: The RAG template does not contain the '[context]' or '{{CONTEXT}}' placeholder." "WARNING: The RAG template does not contain the '[context]' or '{{CONTEXT}}' placeholder."

View File

@@ -27,6 +27,7 @@
import SensitiveInput from '$lib/components/common/SensitiveInput.svelte'; import SensitiveInput from '$lib/components/common/SensitiveInput.svelte';
import Tooltip from '$lib/components/common/Tooltip.svelte'; import Tooltip from '$lib/components/common/Tooltip.svelte';
import Switch from '$lib/components/common/Switch.svelte'; import Switch from '$lib/components/common/Switch.svelte';
import { text } from '@sveltejs/kit';
const i18n = getContext('i18n'); const i18n = getContext('i18n');
@@ -49,6 +50,7 @@
let tikaServerUrl = ''; let tikaServerUrl = '';
let showTikaServerUrl = false; let showTikaServerUrl = false;
let textSplitter = '';
let chunkSize = 0; let chunkSize = 0;
let chunkOverlap = 0; let chunkOverlap = 0;
let pdfExtractImages = true; let pdfExtractImages = true;
@@ -178,6 +180,7 @@
max_count: fileMaxCount === '' ? null : fileMaxCount max_count: fileMaxCount === '' ? null : fileMaxCount
}, },
chunk: { chunk: {
text_splitter: textSplitter,
chunk_overlap: chunkOverlap, chunk_overlap: chunkOverlap,
chunk_size: chunkSize chunk_size: chunkSize
}, },
@@ -223,11 +226,13 @@
await setRerankingConfig(); await setRerankingConfig();
querySettings = await getQuerySettings(localStorage.token); querySettings = await getQuerySettings(localStorage.token);
const res = await getRAGConfig(localStorage.token); const res = await getRAGConfig(localStorage.token);
if (res) { if (res) {
pdfExtractImages = res.pdf_extract_images; pdfExtractImages = res.pdf_extract_images;
textSplitter = res.chunk.text_splitter;
chunkSize = res.chunk.chunk_size; chunkSize = res.chunk.chunk_size;
chunkOverlap = res.chunk.chunk_overlap; chunkOverlap = res.chunk.chunk_overlap;
@@ -639,6 +644,19 @@
<div class=" "> <div class=" ">
<div class="mb-1 text-sm font-medium">{$i18n.t('Chunk Params')}</div> <div class="mb-1 text-sm font-medium">{$i18n.t('Chunk Params')}</div>
<div class="flex w-full justify-between mb-1.5">
<div class="self-center text-xs font-medium">{$i18n.t('Text Splitter')}</div>
<div class="flex items-center relative">
<select
class="dark:bg-gray-900 w-fit pr-8 rounded px-2 text-xs bg-transparent outline-none text-right"
bind:value={textSplitter}
>
<option value="">{$i18n.t('Default (Character)')} </option>
<option value="token">{$i18n.t('Token (Tiktoken)')}</option>
</select>
</div>
</div>
<div class=" flex gap-1.5"> <div class=" flex gap-1.5">
<div class=" w-full justify-between"> <div class=" w-full justify-between">
<div class="self-center text-xs font-medium min-w-fit mb-1">{$i18n.t('Chunk Size')}</div> <div class="self-center text-xs font-medium min-w-fit mb-1">{$i18n.t('Chunk Size')}</div>