mirror of
https://github.com/open-webui/open-webui
synced 2025-06-26 18:26:48 +00:00
Update retrieval.py
Added semantic chunking for better document structure preservation
This commit is contained in:
committed by
GitHub
parent
bf7a18a0f8
commit
6168310ec7
@@ -4,7 +4,8 @@ import mimetypes
|
||||
import os
|
||||
import shutil
|
||||
import asyncio
|
||||
|
||||
import re
|
||||
from typing import List as TypingList
|
||||
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
@@ -984,28 +985,37 @@ def save_docs_to_vector_db(
|
||||
raise ValueError(ERROR_MESSAGES.DUPLICATE_CONTENT)
|
||||
|
||||
if split:
|
||||
if request.app.state.config.TEXT_SPLITTER in ["", "character"]:
|
||||
text_splitter = RecursiveCharacterTextSplitter(
|
||||
chunk_size=request.app.state.config.CHUNK_SIZE,
|
||||
chunk_overlap=request.app.state.config.CHUNK_OVERLAP,
|
||||
add_start_index=True,
|
||||
# Apply advanced content-aware splitting and text cleaning
|
||||
processed_docs = []
|
||||
|
||||
for doc in docs:
|
||||
# Clean the text content before chunking
|
||||
if not doc.page_content:
|
||||
continue
|
||||
|
||||
# Apply text cleaning before chunking
|
||||
cleaned_content = clean_text_content(doc.page_content)
|
||||
|
||||
# Create semantic chunks from cleaned content
|
||||
chunks = create_semantic_chunks(
|
||||
cleaned_content,
|
||||
request.app.state.config.CHUNK_SIZE,
|
||||
request.app.state.config.CHUNK_OVERLAP
|
||||
)
|
||||
elif request.app.state.config.TEXT_SPLITTER == "token":
|
||||
log.info(
|
||||
f"Using token text splitter: {request.app.state.config.TIKTOKEN_ENCODING_NAME}"
|
||||
)
|
||||
|
||||
tiktoken.get_encoding(str(request.app.state.config.TIKTOKEN_ENCODING_NAME))
|
||||
text_splitter = TokenTextSplitter(
|
||||
encoding_name=str(request.app.state.config.TIKTOKEN_ENCODING_NAME),
|
||||
chunk_size=request.app.state.config.CHUNK_SIZE,
|
||||
chunk_overlap=request.app.state.config.CHUNK_OVERLAP,
|
||||
add_start_index=True,
|
||||
)
|
||||
else:
|
||||
raise ValueError(ERROR_MESSAGES.DEFAULT("Invalid text splitter"))
|
||||
|
||||
docs = text_splitter.split_documents(docs)
|
||||
|
||||
# Create new documents for each chunk
|
||||
for i, chunk in enumerate(chunks):
|
||||
chunk_metadata = {
|
||||
**doc.metadata,
|
||||
"chunk_index": i,
|
||||
"total_chunks": len(chunks)
|
||||
}
|
||||
processed_docs.append(Document(
|
||||
page_content=chunk,
|
||||
metadata=chunk_metadata
|
||||
))
|
||||
|
||||
docs = processed_docs
|
||||
|
||||
if len(docs) == 0:
|
||||
raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT)
|
||||
@@ -1067,21 +1077,46 @@ def save_docs_to_vector_db(
|
||||
request.app.state.config.RAG_EMBEDDING_BATCH_SIZE,
|
||||
)
|
||||
|
||||
# Apply final text cleaning for embedding (text already cleaned during chunking)
|
||||
cleaned_texts = []
|
||||
for i, text in enumerate(texts):
|
||||
# Text is already cleaned, just flatten for embedding (convert line breaks to spaces)
|
||||
cleaned_text = re.sub(r'\n+', ' ', text)
|
||||
cleaned_text = re.sub(r'\s+', ' ', cleaned_text) # Final whitespace normalization
|
||||
cleaned_text = cleaned_text.strip()
|
||||
cleaned_texts.append(cleaned_text)
|
||||
|
||||
embeddings = embedding_function(
|
||||
list(map(lambda x: x.replace("\n", " "), texts)),
|
||||
cleaned_texts,
|
||||
prefix=RAG_EMBEDDING_CONTENT_PREFIX,
|
||||
user=user,
|
||||
)
|
||||
|
||||
items = [
|
||||
{
|
||||
# Store the fully cleaned text - apply final aggressive cleaning for storage
|
||||
items = []
|
||||
for idx in range(len(texts)):
|
||||
# Apply final aggressive cleaning specifically for storage
|
||||
text_to_store = texts[idx]
|
||||
|
||||
# Convert ALL newlines to spaces for storage (preserve readability but remove line breaks)
|
||||
text_to_store = re.sub(r'\n+', ' ', text_to_store)
|
||||
text_to_store = re.sub(r'\s+', ' ', text_to_store) # Normalize all whitespace
|
||||
|
||||
# Final aggressive quote cleaning for storage
|
||||
text_to_store = re.sub(r'\\+"', '"', text_to_store) # Multiple backslashes before quotes
|
||||
text_to_store = re.sub(r'\\"', '"', text_to_store) # Any escaped quotes
|
||||
text_to_store = re.sub(r"\\'", "'", text_to_store) # Any escaped single quotes
|
||||
text_to_store = re.sub(r'\\&', '&', text_to_store) # Escaped ampersands
|
||||
text_to_store = re.sub(r'\\([^a-zA-Z0-9\s])', r'\1', text_to_store) # Any other escaped special chars
|
||||
|
||||
text_to_store = text_to_store.strip()
|
||||
|
||||
items.append({
|
||||
"id": str(uuid.uuid4()),
|
||||
"text": text,
|
||||
"text": text_to_store,
|
||||
"vector": embeddings[idx],
|
||||
"metadata": metadatas[idx],
|
||||
}
|
||||
for idx, text in enumerate(texts)
|
||||
]
|
||||
})
|
||||
|
||||
VECTOR_DB_CLIENT.insert(
|
||||
collection_name=collection_name,
|
||||
@@ -1127,7 +1162,7 @@ def process_file(
|
||||
|
||||
docs = [
|
||||
Document(
|
||||
page_content=form_data.content.replace("<br/>", "\n"),
|
||||
page_content=clean_text_content(form_data.content.replace("<br/>", "\n")),
|
||||
metadata={
|
||||
**file.meta,
|
||||
"name": file.filename,
|
||||
@@ -1150,7 +1185,7 @@ def process_file(
|
||||
if result is not None and len(result.ids[0]) > 0:
|
||||
docs = [
|
||||
Document(
|
||||
page_content=result.documents[0][idx],
|
||||
page_content=clean_text_content(result.documents[0][idx]),
|
||||
metadata=result.metadatas[0][idx],
|
||||
)
|
||||
for idx, id in enumerate(result.ids[0])
|
||||
@@ -1158,7 +1193,7 @@ def process_file(
|
||||
else:
|
||||
docs = [
|
||||
Document(
|
||||
page_content=file.data.get("content", ""),
|
||||
page_content=clean_text_content(file.data.get("content", "")),
|
||||
metadata={
|
||||
**file.meta,
|
||||
"name": file.filename,
|
||||
@@ -1194,9 +1229,13 @@ def process_file(
|
||||
file.filename, file.meta.get("content_type"), file_path
|
||||
)
|
||||
|
||||
docs = [
|
||||
Document(
|
||||
page_content=doc.page_content,
|
||||
# Clean the loaded documents before processing
|
||||
cleaned_docs = []
|
||||
for doc in docs:
|
||||
cleaned_content = clean_text_content(doc.page_content)
|
||||
|
||||
cleaned_docs.append(Document(
|
||||
page_content=cleaned_content,
|
||||
metadata={
|
||||
**doc.metadata,
|
||||
"name": file.filename,
|
||||
@@ -1204,13 +1243,12 @@ def process_file(
|
||||
"file_id": file.id,
|
||||
"source": file.filename,
|
||||
},
|
||||
)
|
||||
for doc in docs
|
||||
]
|
||||
))
|
||||
docs = cleaned_docs
|
||||
else:
|
||||
docs = [
|
||||
Document(
|
||||
page_content=file.data.get("content", ""),
|
||||
page_content=clean_text_content(file.data.get("content", "")),
|
||||
metadata={
|
||||
**file.meta,
|
||||
"name": file.filename,
|
||||
@@ -1302,7 +1340,7 @@ def process_text(
|
||||
|
||||
docs = [
|
||||
Document(
|
||||
page_content=form_data.content,
|
||||
page_content=clean_text_content(form_data.content),
|
||||
metadata={"name": form_data.name, "created_by": user.id},
|
||||
)
|
||||
]
|
||||
@@ -1955,6 +1993,7 @@ if ENV == "dev":
|
||||
}
|
||||
|
||||
|
||||
|
||||
class BatchProcessFilesForm(BaseModel):
|
||||
files: List[FileModel]
|
||||
collection_name: str
|
||||
@@ -1992,7 +2031,7 @@ def process_files_batch(
|
||||
|
||||
docs: List[Document] = [
|
||||
Document(
|
||||
page_content=text_content.replace("<br/>", "\n"),
|
||||
page_content=clean_text_content(text_content.replace("<br/>", "\n")),
|
||||
metadata={
|
||||
**file.meta,
|
||||
"name": file.filename,
|
||||
@@ -2045,3 +2084,167 @@ def process_files_batch(
|
||||
)
|
||||
|
||||
return BatchProcessFilesResponse(results=results, errors=errors)
|
||||
|
||||
|
||||
def clean_text_content(text: str) -> str:
    """Normalize extracted document text before chunking and embedding.

    Repairs escape-sequence artifacts left behind by document loaders
    (double-escaped sequences show up notably in PPTX extraction), maps
    common typographic Unicode punctuation to ASCII, and normalizes
    whitespace and orphaned punctuation.

    Args:
        text: Raw extracted text; may be empty.

    Returns:
        The cleaned text, or the input unchanged if it is falsy.
    """
    if not text:
        return text

    # Step 1: PPTX-specific cleaning — handle double-escaped sequences first
    # so they are not half-consumed by the single-escape pass below.
    text = text.replace('\\\\n', '\n')   # double-escaped newlines
    text = text.replace('\\\\t', ' ')    # double-escaped tabs
    text = text.replace('\\\\"', '"')    # double-escaped quotes

    # Step 2: standard escape sequences.
    text = text.replace('\\n', '\n')     # escaped newlines
    text = text.replace('\\t', ' ')      # escaped tabs to spaces
    text = text.replace('\\"', '"')      # escaped double quotes
    text = text.replace("\\'", "'")      # escaped single quotes
    text = text.replace('\\r', '')       # drop escaped carriage returns
    text = text.replace('\\/', '/')      # escaped slashes
    text = text.replace('\\\\', '\\')    # double backslashes to single

    # Step 3: drop any remaining backslash-escape artifacts entirely.
    text = re.sub(r'\\[a-zA-Z]', '', text)        # \letter patterns
    text = re.sub(r'\\[0-9]', '', text)           # \digit patterns
    text = re.sub(r'\\[^a-zA-Z0-9\s]', '', text)  # \symbol patterns

    # Step 4: strip whatever backslashes survived (e.g. a backslash before
    # whitespace or at end-of-string).  NOTE(review): the original code
    # also re-ran a "\s*\\n\s*" substitution and three escaped-quote
    # substitutions after this point; none of them could ever match once
    # Step 3 removed every "\letter" pair and this line removed every
    # remaining backslash, so that dead code has been deleted.
    text = re.sub(r'\\+', '', text)

    # Step 5: map typographic Unicode punctuation to ASCII equivalents.
    unicode_replacements = [
        ('\u2013', '-'),    # en dash
        ('\u2014', '-'),    # em dash
        ('\u2019', "'"),    # right smart single quote
        ('\u2018', "'"),    # left smart single quote
        ('\u201c', '"'),    # left smart double quote
        ('\u201d', '"'),    # right smart double quote
        ('\u2026', '...'),  # horizontal ellipsis
    ]
    for old_char, new_char in unicode_replacements:
        if old_char in text:
            text = text.replace(old_char, new_char)

    # Step 6: whitespace normalization.
    text = re.sub(r'[ \t]+', ' ', text)            # runs of spaces/tabs -> one space
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)  # 3+ blank-ish lines -> one blank line
    text = re.sub(r'^\s+|\s+$', '', text)          # trim leading/trailing whitespace

    # Step 7: fix orphaned punctuation left behind by extraction.
    text = re.sub(r'^\s*[)\]}]+\s*', '', text)         # closers at start of text
    text = re.sub(r'\n\s*[)\]}]+\s*\n', '\n\n', text)  # closers alone on a line

    return text
|
||||
|
||||
def create_semantic_chunks(text: str, max_chunk_size: int, overlap_size: int) -> TypingList[str]:
    """Split *text* into chunks that respect paragraph boundaries.

    Paragraphs (separated by blank lines) are packed greedily into chunks
    of at most ``max_chunk_size`` characters; a buffer that still exceeds
    the limit is handed to sentence-level splitting. When ``overlap_size``
    is positive, each new chunk begins with the tail of the previous one.
    """
    if not text:
        return []
    if len(text) <= max_chunk_size:
        return [text]

    chunks = []

    def _flush(buf):
        # Oversized buffers fall back to sentence-level splitting;
        # otherwise the buffer becomes a chunk as-is.
        if len(buf) > max_chunk_size:
            chunks.extend(split_by_sentences(buf, max_chunk_size, overlap_size))
        else:
            chunks.append(buf.strip())

    buf = ""
    for raw_para in text.split('\n\n'):
        para = raw_para.strip()
        if not para:
            continue

        fits = not buf or len(buf) + len(para) + 2 <= max_chunk_size
        if fits:
            # Paragraph fits: append it (with a blank-line separator).
            buf = f"{buf}\n\n{para}" if buf else para
            continue

        # Buffer is full: emit it, then seed the next buffer with overlap
        # text taken from the chunk just emitted (when requested).
        _flush(buf)
        if chunks and overlap_size > 0:
            tail = get_text_overlap(chunks[-1], overlap_size)
            buf = f"{tail}\n\n{para}" if tail else para
        else:
            buf = para

    if buf:
        _flush(buf)

    return [chunk for chunk in chunks if chunk.strip()]
|
||||
|
||||
def split_by_sentences(text: str, max_chunk_size: int, overlap_size: int) -> TypingList[str]:
    """Split *text* at sentence boundaries into chunks of at most
    ``max_chunk_size`` characters, optionally carrying ``overlap_size``
    characters of trailing context into each new chunk.

    Used when a paragraph-level buffer is itself too large.
    """
    # Break after sentence-ending punctuation followed by whitespace.
    pieces = re.split(r'(?<=[.!?])\s+', text)

    chunks = []
    buf = ""

    for piece in pieces:
        sent = piece.strip()
        if not sent:
            continue

        if buf and len(buf) + len(sent) + 1 > max_chunk_size:
            # Buffer full: emit it and start a new one, seeded with
            # overlap text from the buffer we just emitted.
            chunks.append(buf.strip())
            if overlap_size > 0:
                tail = get_text_overlap(buf, overlap_size)
                buf = f"{tail} {sent}" if tail else sent
            else:
                buf = sent
        else:
            # Sentence fits: append with a single-space separator.
            buf = f"{buf} {sent}" if buf else sent

    if buf:
        chunks.append(buf.strip())

    return [chunk for chunk in chunks if chunk.strip()]
|
||||
|
||||
def get_text_overlap(text: str, overlap_size: int) -> str:
    """Return up to the trailing ``overlap_size`` characters of *text*,
    preferring to start at a word boundary so a word is not cut in half.

    Returns "" when *text* is empty or ``overlap_size`` is non-positive;
    returns *text* unchanged when it already fits within the window.
    """
    if overlap_size <= 0 or not text:
        return ""
    if len(text) <= overlap_size:
        return text

    tail = text[-overlap_size:]

    # Skip the (possibly partial) leading word: restart just past the
    # first interior space, when one exists beyond position 0.
    cut = tail.find(' ')
    if cut > 0:
        return tail[cut:].strip()
    return tail.strip()
|
||||
|
||||
Reference in New Issue
Block a user