From 6168310ec7c82d1363b21dd1efa6ec9ed6edb796 Mon Sep 17 00:00:00 2001 From: PVBLIC Foundation Date: Tue, 27 May 2025 18:07:04 -0700 Subject: [PATCH] Update retrieval.py Added semantic chunking for better document structure preservation --- backend/open_webui/routers/retrieval.py | 285 ++++++++++++++++++++---- 1 file changed, 244 insertions(+), 41 deletions(-) diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index 98f79c7fe..947b7ed49 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -4,7 +4,8 @@ import mimetypes import os import shutil import asyncio - +import re +from typing import List as TypingList import uuid from datetime import datetime @@ -984,28 +985,37 @@ def save_docs_to_vector_db( raise ValueError(ERROR_MESSAGES.DUPLICATE_CONTENT) if split: - if request.app.state.config.TEXT_SPLITTER in ["", "character"]: - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=request.app.state.config.CHUNK_SIZE, - chunk_overlap=request.app.state.config.CHUNK_OVERLAP, - add_start_index=True, + # Apply advanced content-aware splitting and text cleaning + processed_docs = [] + + for doc in docs: + # Clean the text content before chunking + if not doc.page_content: + continue + + # Apply text cleaning before chunking + cleaned_content = clean_text_content(doc.page_content) + + # Create semantic chunks from cleaned content + chunks = create_semantic_chunks( + cleaned_content, + request.app.state.config.CHUNK_SIZE, + request.app.state.config.CHUNK_OVERLAP ) - elif request.app.state.config.TEXT_SPLITTER == "token": - log.info( - f"Using token text splitter: {request.app.state.config.TIKTOKEN_ENCODING_NAME}" - ) - - tiktoken.get_encoding(str(request.app.state.config.TIKTOKEN_ENCODING_NAME)) - text_splitter = TokenTextSplitter( - encoding_name=str(request.app.state.config.TIKTOKEN_ENCODING_NAME), - chunk_size=request.app.state.config.CHUNK_SIZE, - 
chunk_overlap=request.app.state.config.CHUNK_OVERLAP, - add_start_index=True, - ) - else: - raise ValueError(ERROR_MESSAGES.DEFAULT("Invalid text splitter")) - - docs = text_splitter.split_documents(docs) + + # Create new documents for each chunk + for i, chunk in enumerate(chunks): + chunk_metadata = { + **doc.metadata, + "chunk_index": i, + "total_chunks": len(chunks) + } + processed_docs.append(Document( + page_content=chunk, + metadata=chunk_metadata + )) + + docs = processed_docs if len(docs) == 0: raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT) @@ -1067,21 +1077,46 @@ def save_docs_to_vector_db( request.app.state.config.RAG_EMBEDDING_BATCH_SIZE, ) + # Apply final text cleaning for embedding (text already cleaned during chunking) + cleaned_texts = [] + for i, text in enumerate(texts): + # Text is already cleaned, just flatten for embedding (convert line breaks to spaces) + cleaned_text = re.sub(r'\n+', ' ', text) + cleaned_text = re.sub(r'\s+', ' ', cleaned_text) # Final whitespace normalization + cleaned_text = cleaned_text.strip() + cleaned_texts.append(cleaned_text) + embeddings = embedding_function( - list(map(lambda x: x.replace("\n", " "), texts)), + cleaned_texts, prefix=RAG_EMBEDDING_CONTENT_PREFIX, user=user, ) - items = [ - { + # Store the fully cleaned text - apply final aggressive cleaning for storage + items = [] + for idx in range(len(texts)): + # Apply final aggressive cleaning specifically for storage + text_to_store = texts[idx] + + # Convert ALL newlines to spaces for storage (preserve readability but remove line breaks) + text_to_store = re.sub(r'\n+', ' ', text_to_store) + text_to_store = re.sub(r'\s+', ' ', text_to_store) # Normalize all whitespace + + # Final aggressive quote cleaning for storage + text_to_store = re.sub(r'\\+"', '"', text_to_store) # Multiple backslashes before quotes + text_to_store = re.sub(r'\\"', '"', text_to_store) # Any escaped quotes + text_to_store = re.sub(r"\\'", "'", text_to_store) # Any escaped single quotes + 
text_to_store = re.sub(r'\\&', '&', text_to_store) # Escaped ampersands + text_to_store = re.sub(r'\\([^a-zA-Z0-9\s])', r'\1', text_to_store) # Any other escaped special chars + + text_to_store = text_to_store.strip() + + items.append({ "id": str(uuid.uuid4()), - "text": text, + "text": text_to_store, "vector": embeddings[idx], "metadata": metadatas[idx], - } - for idx, text in enumerate(texts) - ] + }) VECTOR_DB_CLIENT.insert( collection_name=collection_name, @@ -1127,7 +1162,7 @@ def process_file( docs = [ Document( - page_content=form_data.content.replace("
", "\n"), + page_content=clean_text_content(form_data.content.replace("
", "\n")), metadata={ **file.meta, "name": file.filename, @@ -1150,7 +1185,7 @@ def process_file( if result is not None and len(result.ids[0]) > 0: docs = [ Document( - page_content=result.documents[0][idx], + page_content=clean_text_content(result.documents[0][idx]), metadata=result.metadatas[0][idx], ) for idx, id in enumerate(result.ids[0]) @@ -1158,7 +1193,7 @@ def process_file( else: docs = [ Document( - page_content=file.data.get("content", ""), + page_content=clean_text_content(file.data.get("content", "")), metadata={ **file.meta, "name": file.filename, @@ -1194,9 +1229,13 @@ def process_file( file.filename, file.meta.get("content_type"), file_path ) - docs = [ - Document( - page_content=doc.page_content, + # Clean the loaded documents before processing + cleaned_docs = [] + for doc in docs: + cleaned_content = clean_text_content(doc.page_content) + + cleaned_docs.append(Document( + page_content=cleaned_content, metadata={ **doc.metadata, "name": file.filename, @@ -1204,13 +1243,12 @@ def process_file( "file_id": file.id, "source": file.filename, }, - ) - for doc in docs - ] + )) + docs = cleaned_docs else: docs = [ Document( - page_content=file.data.get("content", ""), + page_content=clean_text_content(file.data.get("content", "")), metadata={ **file.meta, "name": file.filename, @@ -1302,7 +1340,7 @@ def process_text( docs = [ Document( - page_content=form_data.content, + page_content=clean_text_content(form_data.content), metadata={"name": form_data.name, "created_by": user.id}, ) ] @@ -1955,6 +1993,7 @@ if ENV == "dev": } + class BatchProcessFilesForm(BaseModel): files: List[FileModel] collection_name: str @@ -1992,7 +2031,7 @@ def process_files_batch( docs: List[Document] = [ Document( - page_content=text_content.replace("
", "\n"), + page_content=clean_text_content(text_content.replace("
# Typographic punctuation mapped to plain ASCII. Written with explicit code
# points so the table survives any re-encoding of this source file.
_SMART_PUNCTUATION = str.maketrans(
    {
        "\u2013": "-",  # en dash
        "\u2014": "-",  # em dash
        "\u2018": "'",  # left single quotation mark
        "\u2019": "'",  # right single quotation mark
        "\u201c": '"',  # left double quotation mark
        "\u201d": '"',  # right double quotation mark
        "\u2026": "...",  # horizontal ellipsis
    }
)


def clean_text_content(text: str) -> str:
    """Normalize extracted document text before chunking and embedding.

    The cleaning passes run in a fixed order:

    1. Double-escaped ``\\\\n`` / ``\\\\t`` / ``\\\\"`` sequences are decoded.
    2. Single-escaped ``\\n``, ``\\t``, ``\\"``, ``\\'``, ``\\r``, ``\\/`` and
       ``\\\\`` sequences are decoded.
    3. Leftover backslash sequences are handled: backslash + letter or digit
       is removed entirely, backslash + symbol keeps the symbol.
    4. Every remaining backslash is dropped.
    5. Smart quotes, dashes and the ellipsis character become ASCII.
    6. Runs of spaces/tabs collapse to one space, three or more line breaks
       collapse to one blank line, and the text is stripped.
    7. Closing brackets left at the very start or alone on a line are removed.

    Args:
        text: Raw text. Falsy values (``None`` or ``""``) are returned as-is.

    Returns:
        The cleaned text. It never contains a backslash.
    """
    if not text:
        return text

    # Step 1: double-escaped sequences first, so the single-escape pass below
    # does not leave a stray backslash in front of the decoded character.
    text = text.replace('\\\\n', '\n')
    text = text.replace('\\\\t', ' ')
    text = text.replace('\\\\"', '"')

    # Step 2: standard single-escaped sequences.
    text = text.replace('\\n', '\n')
    text = text.replace('\\t', ' ')
    text = text.replace('\\"', '"')
    text = text.replace("\\'", "'")
    text = text.replace('\\r', '')
    text = text.replace('\\/', '/')
    text = text.replace('\\\\', '\\')

    # Step 3: remaining backslash artifacts. Unknown letter/digit escapes are
    # removed whole; an escaped symbol (e.g. "\_" or "\&") keeps the symbol so
    # real punctuation is not lost.
    text = re.sub(r'\\[a-zA-Z]', '', text)
    text = re.sub(r'\\[0-9]', '', text)
    text = re.sub(r'\\([^a-zA-Z0-9\s])', r'\1', text)

    # Step 4: an escaped newline exposed by the removals above becomes a real
    # line break; after that no backslash is allowed to survive.
    text = re.sub(r'\s*\\n\s*', '\n', text)
    text = re.sub(r'\\+', '', text)

    # Step 5: typographic punctuation -> ASCII, in a single pass.
    text = text.translate(_SMART_PUNCTUATION)

    # Step 6: spacing and blank-line normalization.
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
    text = text.strip()

    # Step 7: orphaned closing brackets at the start or alone on a line.
    text = re.sub(r'^\s*[)\]}]+\s*', '', text)
    text = re.sub(r'\n\s*[)\]}]+\s*\n', '\n\n', text)

    return text


def _split_oversized(text: str, max_chunk_size: int) -> TypingList[str]:
    """Break text into word-aligned pieces no longer than max_chunk_size."""
    width = max(max_chunk_size, 1)
    pieces = []
    current = ""

    for word in text.split():
        # A single token longer than the limit has no boundary to respect,
        # so it is sliced into fixed-width pieces.
        while len(word) > width:
            if current:
                pieces.append(current)
                current = ""
            pieces.append(word[:width])
            word = word[width:]

        if not current:
            current = word
        elif len(current) + 1 + len(word) <= width:
            current += " " + word
        else:
            pieces.append(current)
            current = word

    if current:
        pieces.append(current)

    return pieces


def _with_overlap(
    previous: str,
    piece: str,
    separator: str,
    max_chunk_size: int,
    overlap_size: int,
) -> str:
    """Prefix piece with the tail of previous when the result still fits."""
    if previous and overlap_size > 0:
        overlap_text = get_text_overlap(previous, overlap_size)
        combined_length = len(overlap_text) + len(separator) + len(piece)
        if overlap_text and combined_length <= max_chunk_size:
            return overlap_text + separator + piece
    return piece


def create_semantic_chunks(text: str, max_chunk_size: int, overlap_size: int) -> TypingList[str]:
    """Split text into chunks that follow paragraph and sentence boundaries.

    Paragraphs (separated by a blank line) are packed greedily into chunks
    joined by a blank line. A paragraph that is larger than the limit on its
    own is delegated to ``split_by_sentences``. When a new chunk is started,
    the tail of the previous chunk is prepended as overlap, but only if the
    chunk still fits within ``max_chunk_size``.

    Args:
        text: Text to split.
        max_chunk_size: Maximum chunk length in characters.
        overlap_size: Number of trailing characters of the previous chunk to
            repeat at the start of the next one; ``0`` disables overlap.

    Returns:
        The chunks in document order. Empty input yields ``[]``; text that
        already fits is returned unchanged as a single chunk. For a positive
        ``max_chunk_size`` no chunk is longer than ``max_chunk_size``.
    """
    if not text or len(text) <= max_chunk_size:
        return [text] if text else []

    chunks = []
    current_chunk = ""

    for paragraph in text.split('\n\n'):
        paragraph = paragraph.strip()
        if not paragraph:
            continue

        # A paragraph that cannot fit in any chunk is split by sentences.
        if len(paragraph) > max_chunk_size:
            if current_chunk:
                chunks.append(current_chunk)
                current_chunk = ""
            chunks.extend(split_by_sentences(paragraph, max_chunk_size, overlap_size))
            continue

        if current_chunk and len(current_chunk) + len(paragraph) + 2 <= max_chunk_size:
            current_chunk += "\n\n" + paragraph
            continue

        if current_chunk:
            chunks.append(current_chunk)

        previous_chunk = chunks[-1] if chunks else ""
        current_chunk = _with_overlap(
            previous_chunk, paragraph, "\n\n", max_chunk_size, overlap_size
        )

    if current_chunk:
        chunks.append(current_chunk)

    return [chunk for chunk in chunks if chunk.strip()]


def split_by_sentences(text: str, max_chunk_size: int, overlap_size: int) -> TypingList[str]:
    """Split text into chunks along sentence boundaries.

    Sentences end at ``.``, ``!`` or ``?`` followed by whitespace. They are
    packed greedily and joined by a single space. A sentence longer than
    ``max_chunk_size`` is broken at word boundaries first (whitespace inside
    it is collapsed to single spaces).

    Args:
        text: Text to split.
        max_chunk_size: Maximum chunk length in characters. Oversized
            sentences are only broken up when this is positive.
        overlap_size: Number of trailing characters of the previous chunk to
            repeat at the start of the next one, when that still fits.

    Returns:
        The non-empty chunks in order.
    """
    pieces = []
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        sentence = sentence.strip()
        if not sentence:
            continue
        if 0 < max_chunk_size < len(sentence):
            pieces.extend(_split_oversized(sentence, max_chunk_size))
        else:
            pieces.append(sentence)

    chunks = []
    current_chunk = ""

    for piece in pieces:
        if not current_chunk:
            current_chunk = piece
        elif len(current_chunk) + len(piece) + 1 <= max_chunk_size:
            current_chunk += " " + piece
        else:
            chunks.append(current_chunk)
            current_chunk = _with_overlap(
                current_chunk, piece, " ", max_chunk_size, overlap_size
            )

    if current_chunk:
        chunks.append(current_chunk)

    return [chunk for chunk in chunks if chunk.strip()]


def get_text_overlap(text: str, overlap_size: int) -> str:
    """Return roughly the last overlap_size characters of text.

    The window is trimmed forward to the next whitespace so it does not start
    in the middle of a word, unless it already starts on a word boundary or
    contains no whitespace at all.

    Args:
        text: Text to take the overlap from.
        overlap_size: Maximum overlap length in characters.

    Returns:
        ``""`` for empty text or a non-positive size, ``text`` itself when it
        is no longer than ``overlap_size``, otherwise the stripped window.
    """
    if not text or overlap_size <= 0:
        return ""

    if len(text) <= overlap_size:
        return text

    window = text[-overlap_size:]

    # The window is already word-aligned when it begins with whitespace or
    # the character just before it is whitespace.
    word_aligned = window[0].isspace() or text[-overlap_size - 1].isspace()
    if not word_aligned:
        boundary = re.search(r'\s', window)
        if boundary:
            window = window[boundary.end():]

    return window.strip()