mirror of
https://github.com/open-webui/open-webui
synced 2025-01-31 06:49:03 +00:00
enh: token text splitter support
This commit is contained in:
parent
8a0da6d376
commit
586e005f0f
@ -392,18 +392,19 @@ async def get_rag_config(user=Depends(get_admin_user)):
|
|||||||
return {
|
return {
|
||||||
"status": True,
|
"status": True,
|
||||||
"pdf_extract_images": app.state.config.PDF_EXTRACT_IMAGES,
|
"pdf_extract_images": app.state.config.PDF_EXTRACT_IMAGES,
|
||||||
"file": {
|
|
||||||
"max_size": app.state.config.FILE_MAX_SIZE,
|
|
||||||
"max_count": app.state.config.FILE_MAX_COUNT,
|
|
||||||
},
|
|
||||||
"content_extraction": {
|
"content_extraction": {
|
||||||
"engine": app.state.config.CONTENT_EXTRACTION_ENGINE,
|
"engine": app.state.config.CONTENT_EXTRACTION_ENGINE,
|
||||||
"tika_server_url": app.state.config.TIKA_SERVER_URL,
|
"tika_server_url": app.state.config.TIKA_SERVER_URL,
|
||||||
},
|
},
|
||||||
"chunk": {
|
"chunk": {
|
||||||
|
"text_splitter": app.state.config.TEXT_SPLITTER,
|
||||||
"chunk_size": app.state.config.CHUNK_SIZE,
|
"chunk_size": app.state.config.CHUNK_SIZE,
|
||||||
"chunk_overlap": app.state.config.CHUNK_OVERLAP,
|
"chunk_overlap": app.state.config.CHUNK_OVERLAP,
|
||||||
},
|
},
|
||||||
|
"file": {
|
||||||
|
"max_size": app.state.config.FILE_MAX_SIZE,
|
||||||
|
"max_count": app.state.config.FILE_MAX_COUNT,
|
||||||
|
},
|
||||||
"youtube": {
|
"youtube": {
|
||||||
"language": app.state.config.YOUTUBE_LOADER_LANGUAGE,
|
"language": app.state.config.YOUTUBE_LOADER_LANGUAGE,
|
||||||
"translation": app.state.YOUTUBE_LOADER_TRANSLATION,
|
"translation": app.state.YOUTUBE_LOADER_TRANSLATION,
|
||||||
@ -442,6 +443,7 @@ class ContentExtractionConfig(BaseModel):
|
|||||||
|
|
||||||
|
|
||||||
class ChunkParamUpdateForm(BaseModel):
|
class ChunkParamUpdateForm(BaseModel):
|
||||||
|
text_splitter: Optional[str] = None
|
||||||
chunk_size: int
|
chunk_size: int
|
||||||
chunk_overlap: int
|
chunk_overlap: int
|
||||||
|
|
||||||
@ -501,6 +503,7 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
|
|||||||
app.state.config.TIKA_SERVER_URL = form_data.content_extraction.tika_server_url
|
app.state.config.TIKA_SERVER_URL = form_data.content_extraction.tika_server_url
|
||||||
|
|
||||||
if form_data.chunk is not None:
|
if form_data.chunk is not None:
|
||||||
|
app.state.config.TEXT_SPLITTER = form_data.chunk.text_splitter
|
||||||
app.state.config.CHUNK_SIZE = form_data.chunk.chunk_size
|
app.state.config.CHUNK_SIZE = form_data.chunk.chunk_size
|
||||||
app.state.config.CHUNK_OVERLAP = form_data.chunk.chunk_overlap
|
app.state.config.CHUNK_OVERLAP = form_data.chunk.chunk_overlap
|
||||||
|
|
||||||
@ -547,6 +550,7 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
|
|||||||
"tika_server_url": app.state.config.TIKA_SERVER_URL,
|
"tika_server_url": app.state.config.TIKA_SERVER_URL,
|
||||||
},
|
},
|
||||||
"chunk": {
|
"chunk": {
|
||||||
|
"text_splitter": app.state.config.TEXT_SPLITTER,
|
||||||
"chunk_size": app.state.config.CHUNK_SIZE,
|
"chunk_size": app.state.config.CHUNK_SIZE,
|
||||||
"chunk_overlap": app.state.config.CHUNK_OVERLAP,
|
"chunk_overlap": app.state.config.CHUNK_OVERLAP,
|
||||||
},
|
},
|
||||||
@ -607,11 +611,10 @@ class QuerySettingsForm(BaseModel):
|
|||||||
async def update_query_settings(
|
async def update_query_settings(
|
||||||
form_data: QuerySettingsForm, user=Depends(get_admin_user)
|
form_data: QuerySettingsForm, user=Depends(get_admin_user)
|
||||||
):
|
):
|
||||||
app.state.config.RAG_TEMPLATE = (
|
app.state.config.RAG_TEMPLATE = form_data.template
|
||||||
form_data.template if form_data.template != "" else DEFAULT_RAG_TEMPLATE
|
|
||||||
)
|
|
||||||
app.state.config.TOP_K = form_data.k if form_data.k else 4
|
app.state.config.TOP_K = form_data.k if form_data.k else 4
|
||||||
app.state.config.RELEVANCE_THRESHOLD = form_data.r if form_data.r else 0.0
|
app.state.config.RELEVANCE_THRESHOLD = form_data.r if form_data.r else 0.0
|
||||||
|
|
||||||
app.state.config.ENABLE_RAG_HYBRID_SEARCH = (
|
app.state.config.ENABLE_RAG_HYBRID_SEARCH = (
|
||||||
form_data.hybrid if form_data.hybrid else False
|
form_data.hybrid if form_data.hybrid else False
|
||||||
)
|
)
|
||||||
|
@ -19,6 +19,7 @@ from open_webui.apps.retrieval.vector.connector import VECTOR_DB_CLIENT
|
|||||||
from open_webui.utils.misc import get_last_user_message
|
from open_webui.utils.misc import get_last_user_message
|
||||||
|
|
||||||
from open_webui.env import SRC_LOG_LEVELS
|
from open_webui.env import SRC_LOG_LEVELS
|
||||||
|
from open_webui.config import DEFAULT_RAG_TEMPLATE
|
||||||
|
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
@ -239,6 +240,9 @@ def query_collection_with_hybrid_search(
|
|||||||
|
|
||||||
|
|
||||||
def rag_template(template: str, context: str, query: str):
|
def rag_template(template: str, context: str, query: str):
|
||||||
|
if template == "":
|
||||||
|
template = DEFAULT_RAG_TEMPLATE
|
||||||
|
|
||||||
if "[context]" not in template and "{{CONTEXT}}" not in template:
|
if "[context]" not in template and "{{CONTEXT}}" not in template:
|
||||||
log.debug(
|
log.debug(
|
||||||
"WARNING: The RAG template does not contain the '[context]' or '{{CONTEXT}}' placeholder."
|
"WARNING: The RAG template does not contain the '[context]' or '{{CONTEXT}}' placeholder."
|
||||||
|
@ -27,6 +27,7 @@
|
|||||||
import SensitiveInput from '$lib/components/common/SensitiveInput.svelte';
|
import SensitiveInput from '$lib/components/common/SensitiveInput.svelte';
|
||||||
import Tooltip from '$lib/components/common/Tooltip.svelte';
|
import Tooltip from '$lib/components/common/Tooltip.svelte';
|
||||||
import Switch from '$lib/components/common/Switch.svelte';
|
import Switch from '$lib/components/common/Switch.svelte';
|
||||||
|
import { text } from '@sveltejs/kit';
|
||||||
|
|
||||||
const i18n = getContext('i18n');
|
const i18n = getContext('i18n');
|
||||||
|
|
||||||
@ -49,6 +50,7 @@
|
|||||||
let tikaServerUrl = '';
|
let tikaServerUrl = '';
|
||||||
let showTikaServerUrl = false;
|
let showTikaServerUrl = false;
|
||||||
|
|
||||||
|
let textSplitter = '';
|
||||||
let chunkSize = 0;
|
let chunkSize = 0;
|
||||||
let chunkOverlap = 0;
|
let chunkOverlap = 0;
|
||||||
let pdfExtractImages = true;
|
let pdfExtractImages = true;
|
||||||
@ -178,6 +180,7 @@
|
|||||||
max_count: fileMaxCount === '' ? null : fileMaxCount
|
max_count: fileMaxCount === '' ? null : fileMaxCount
|
||||||
},
|
},
|
||||||
chunk: {
|
chunk: {
|
||||||
|
text_splitter: textSplitter,
|
||||||
chunk_overlap: chunkOverlap,
|
chunk_overlap: chunkOverlap,
|
||||||
chunk_size: chunkSize
|
chunk_size: chunkSize
|
||||||
},
|
},
|
||||||
@ -223,11 +226,13 @@
|
|||||||
await setRerankingConfig();
|
await setRerankingConfig();
|
||||||
|
|
||||||
querySettings = await getQuerySettings(localStorage.token);
|
querySettings = await getQuerySettings(localStorage.token);
|
||||||
|
|
||||||
const res = await getRAGConfig(localStorage.token);
|
const res = await getRAGConfig(localStorage.token);
|
||||||
|
|
||||||
if (res) {
|
if (res) {
|
||||||
pdfExtractImages = res.pdf_extract_images;
|
pdfExtractImages = res.pdf_extract_images;
|
||||||
|
|
||||||
|
textSplitter = res.chunk.text_splitter;
|
||||||
chunkSize = res.chunk.chunk_size;
|
chunkSize = res.chunk.chunk_size;
|
||||||
chunkOverlap = res.chunk.chunk_overlap;
|
chunkOverlap = res.chunk.chunk_overlap;
|
||||||
|
|
||||||
@ -639,6 +644,19 @@
|
|||||||
<div class=" ">
|
<div class=" ">
|
||||||
<div class="mb-1 text-sm font-medium">{$i18n.t('Chunk Params')}</div>
|
<div class="mb-1 text-sm font-medium">{$i18n.t('Chunk Params')}</div>
|
||||||
|
|
||||||
|
<div class="flex w-full justify-between mb-1.5">
|
||||||
|
<div class="self-center text-xs font-medium">{$i18n.t('Text Splitter')}</div>
|
||||||
|
<div class="flex items-center relative">
|
||||||
|
<select
|
||||||
|
class="dark:bg-gray-900 w-fit pr-8 rounded px-2 text-xs bg-transparent outline-none text-right"
|
||||||
|
bind:value={textSplitter}
|
||||||
|
>
|
||||||
|
<option value="">{$i18n.t('Default (Character)')} </option>
|
||||||
|
<option value="token">{$i18n.t('Token (Tiktoken)')}</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div class=" flex gap-1.5">
|
<div class=" flex gap-1.5">
|
||||||
<div class=" w-full justify-between">
|
<div class=" w-full justify-between">
|
||||||
<div class="self-center text-xs font-medium min-w-fit mb-1">{$i18n.t('Chunk Size')}</div>
|
<div class="self-center text-xs font-medium min-w-fit mb-1">{$i18n.t('Chunk Size')}</div>
|
||||||
|
Loading…
Reference in New Issue
Block a user