From 6168310ec7c82d1363b21dd1efa6ec9ed6edb796 Mon Sep 17 00:00:00 2001
From: PVBLIC Foundation <saas@pvblic.org>
Date: Tue, 27 May 2025 18:07:04 -0700
Subject: [PATCH] Update retrieval.py

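Add semantic (paragraph- and sentence-aware) chunking for better document
structure preservation, and clean extracted text before chunking, embedding
and storage. This replaces the TEXT_SPLITTER-selected character/token splitters.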
---
 backend/open_webui/routers/retrieval.py | 285 ++++++++++++++++++++----
 1 file changed, 244 insertions(+), 41 deletions(-)

diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py
index 98f79c7fe..947b7ed49 100644
--- a/backend/open_webui/routers/retrieval.py
+++ b/backend/open_webui/routers/retrieval.py
@@ -4,7 +4,8 @@ import mimetypes
 import os
 import shutil
 import asyncio
-
+import re
+from typing import List as TypingList
 
 import uuid
 from datetime import datetime
@@ -984,28 +985,37 @@ def save_docs_to_vector_db(
                 raise ValueError(ERROR_MESSAGES.DUPLICATE_CONTENT)
 
     if split:
-        if request.app.state.config.TEXT_SPLITTER in ["", "character"]:
-            text_splitter = RecursiveCharacterTextSplitter(
-                chunk_size=request.app.state.config.CHUNK_SIZE,
-                chunk_overlap=request.app.state.config.CHUNK_OVERLAP,
-                add_start_index=True,
+        # Apply advanced content-aware splitting and text cleaning
+        processed_docs = []
+        
+        for doc in docs:
+            # Clean the text content before chunking
+            if not doc.page_content:
+                continue
+            
+            # Apply text cleaning before chunking
+            cleaned_content = clean_text_content(doc.page_content)
+            
+            # Create semantic chunks from cleaned content
+            chunks = create_semantic_chunks(
+                cleaned_content,
+                request.app.state.config.CHUNK_SIZE,
+                request.app.state.config.CHUNK_OVERLAP
             )
-        elif request.app.state.config.TEXT_SPLITTER == "token":
-            log.info(
-                f"Using token text splitter: {request.app.state.config.TIKTOKEN_ENCODING_NAME}"
-            )
-
-            tiktoken.get_encoding(str(request.app.state.config.TIKTOKEN_ENCODING_NAME))
-            text_splitter = TokenTextSplitter(
-                encoding_name=str(request.app.state.config.TIKTOKEN_ENCODING_NAME),
-                chunk_size=request.app.state.config.CHUNK_SIZE,
-                chunk_overlap=request.app.state.config.CHUNK_OVERLAP,
-                add_start_index=True,
-            )
-        else:
-            raise ValueError(ERROR_MESSAGES.DEFAULT("Invalid text splitter"))
-
-        docs = text_splitter.split_documents(docs)
+            
+            # Create new documents for each chunk
+            for i, chunk in enumerate(chunks):
+                chunk_metadata = {
+                    **doc.metadata,
+                    "chunk_index": i,
+                    "total_chunks": len(chunks)
+                }
+                processed_docs.append(Document(
+                    page_content=chunk,
+                    metadata=chunk_metadata
+                ))
+        
+        docs = processed_docs
 
     if len(docs) == 0:
         raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT)
@@ -1067,21 +1077,46 @@ def save_docs_to_vector_db(
             request.app.state.config.RAG_EMBEDDING_BATCH_SIZE,
         )
 
+        # Apply final text cleaning for embedding (text already cleaned during chunking)
+        cleaned_texts = []
+        for i, text in enumerate(texts):
+            # Text is already cleaned, just flatten for embedding (convert line breaks to spaces)
+            cleaned_text = re.sub(r'\n+', ' ', text)
+            cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Final whitespace normalization
+            cleaned_text = cleaned_text.strip()
+            cleaned_texts.append(cleaned_text)
+        
         embeddings = embedding_function(
-            list(map(lambda x: x.replace("\n", " "), texts)),
+            cleaned_texts,
             prefix=RAG_EMBEDDING_CONTENT_PREFIX,
             user=user,
         )
 
-        items = [
-            {
+        # Store the fully cleaned text - apply final aggressive cleaning for storage
+        items = []
+        for idx in range(len(texts)):
+            # Apply final aggressive cleaning specifically for storage
+            text_to_store = texts[idx]
+            
+            # Convert ALL newlines to spaces for storage (preserve readability but remove line breaks)
+            text_to_store = re.sub(r'\n+', ' ', text_to_store)
+            text_to_store = re.sub(r'\s+', ' ', text_to_store)  # Normalize all whitespace
+            
+            # Final aggressive quote cleaning for storage
+            text_to_store = re.sub(r'\\+"', '"', text_to_store)     # Multiple backslashes before quotes
+            text_to_store = re.sub(r'\\"', '"', text_to_store)      # Any escaped quotes
+            text_to_store = re.sub(r"\\'", "'", text_to_store)      # Any escaped single quotes
+            text_to_store = re.sub(r'\\&', '&', text_to_store)      # Escaped ampersands
+            text_to_store = re.sub(r'\\([^a-zA-Z0-9\s])', r'\1', text_to_store)  # Any other escaped special chars
+            
+            text_to_store = text_to_store.strip()
+            
+            items.append({
                 "id": str(uuid.uuid4()),
-                "text": text,
+                "text": text_to_store,
                 "vector": embeddings[idx],
                 "metadata": metadatas[idx],
-            }
-            for idx, text in enumerate(texts)
-        ]
+            })
 
         VECTOR_DB_CLIENT.insert(
             collection_name=collection_name,
@@ -1127,7 +1162,7 @@ def process_file(
 
             docs = [
                 Document(
-                    page_content=form_data.content.replace("<br/>", "\n"),
+                    page_content=clean_text_content(form_data.content.replace("<br/>", "\n")),
                     metadata={
                         **file.meta,
                         "name": file.filename,
@@ -1150,7 +1185,7 @@ def process_file(
             if result is not None and len(result.ids[0]) > 0:
                 docs = [
                     Document(
-                        page_content=result.documents[0][idx],
+                        page_content=clean_text_content(result.documents[0][idx]),
                         metadata=result.metadatas[0][idx],
                     )
                     for idx, id in enumerate(result.ids[0])
@@ -1158,7 +1193,7 @@ def process_file(
             else:
                 docs = [
                     Document(
-                        page_content=file.data.get("content", ""),
+                        page_content=clean_text_content(file.data.get("content", "")),
                         metadata={
                             **file.meta,
                             "name": file.filename,
@@ -1194,9 +1229,13 @@ def process_file(
                     file.filename, file.meta.get("content_type"), file_path
                 )
 
-                docs = [
-                    Document(
-                        page_content=doc.page_content,
+                # Clean the loaded documents before processing
+                cleaned_docs = []
+                for doc in docs:
+                    cleaned_content = clean_text_content(doc.page_content)
+                    
+                    cleaned_docs.append(Document(
+                        page_content=cleaned_content,
                         metadata={
                             **doc.metadata,
                             "name": file.filename,
@@ -1204,13 +1243,12 @@ def process_file(
                             "file_id": file.id,
                             "source": file.filename,
                         },
-                    )
-                    for doc in docs
-                ]
+                    ))
+                docs = cleaned_docs
             else:
                 docs = [
                     Document(
-                        page_content=file.data.get("content", ""),
+                        page_content=clean_text_content(file.data.get("content", "")),
                         metadata={
                             **file.meta,
                             "name": file.filename,
@@ -1302,7 +1340,7 @@ def process_text(
 
     docs = [
         Document(
-            page_content=form_data.content,
+            page_content=clean_text_content(form_data.content),
             metadata={"name": form_data.name, "created_by": user.id},
         )
     ]
@@ -1955,6 +1993,7 @@ if ENV == "dev":
         }
 
 
+
 class BatchProcessFilesForm(BaseModel):
     files: List[FileModel]
     collection_name: str
@@ -1992,7 +2031,7 @@ def process_files_batch(
 
             docs: List[Document] = [
                 Document(
-                    page_content=text_content.replace("<br/>", "\n"),
+                    page_content=clean_text_content(text_content.replace("<br/>", "\n")),
                     metadata={
                         **file.meta,
                         "name": file.filename,
@@ -2045,3 +2084,167 @@ def process_files_batch(
                 )
 
     return BatchProcessFilesResponse(results=results, errors=errors)
+
+
+def clean_text_content(text: str) -> str:
+    """Normalize extracted text by decoding escape artifacts and tidying whitespace.
+
+    Literal escape sequences (seen in PPTX extractions) are decoded or dropped,
+    every remaining backslash is removed, typographic dashes, quotes and
+    ellipses become ASCII, runs of spaces or blank lines are collapsed, and
+    stray closing brackets are removed. Falsy input is returned unchanged.
+    """
+    if not text:
+        return text
+
+    # Step 1: PPTX-specific cleaning - handle double-escaped sequences first
+    text = text.replace('\\\\n', '\n')  # Double-escaped newlines in PPTX
+    text = text.replace('\\\\t', ' ')   # Double-escaped tabs in PPTX
+    text = text.replace('\\\\"', '"')   # Double-escaped quotes in PPTX
+
+    # Step 2: Standard escape sequences
+    text = text.replace('\\n', '\n')    # Single-escaped newlines
+    text = text.replace('\\t', ' ')     # Single-escaped tabs to spaces
+    text = text.replace('\\"', '"')     # Single-escaped quotes
+    text = text.replace('\\\'', "'")    # Single-escaped single quotes
+    text = text.replace('\\r', '')      # Remove escaped carriage returns
+    text = text.replace('\\/', '/')     # Convert escaped slashes
+    text = text.replace('\\\\', '\\')   # Convert double backslashes
+
+    # Step 3: Remove any remaining backslash artifacts
+    text = re.sub(r'\\[a-zA-Z]', '', text)  # Remove \letter patterns
+    text = re.sub(r'\\[0-9]', '', text)     # Remove \number patterns
+    text = re.sub(r'\\[^a-zA-Z0-9\s]', '', text)  # Remove \symbol patterns
+
+    # Step 4: PPTX-specific artifacts cleanup
+    text = re.sub(r'\s*\\n\s*', '\n', text)  # Clean up any remaining \\n with spaces
+    text = re.sub(r'\\+', '', text)          # Drop every backslash that is still left
+
+    # Step 5: Map typographic characters to ASCII. Code-point escapes are used so
+    # the curly quotes survive editors/tools that normalize them to straight quotes.
+    unicode_replacements = [
+        ('\u2013', '-'),    # En dash to hyphen
+        ('\u2014', '-'),    # Em dash to hyphen
+        ('\u2018', "'"),    # Left single quotation mark
+        ('\u2019', "'"),    # Right single quotation mark
+        ('\u201c', '"'),    # Left double quotation mark
+        ('\u201d', '"'),    # Right double quotation mark
+        ('\u2026', '...'),  # Ellipsis to three dots
+    ]
+    for old_char, new_char in unicode_replacements:
+        text = text.replace(old_char, new_char)
+
+    # Step 6: Clean up spacing and formatting
+    text = re.sub(r'[ \t]+', ' ', text)           # Multiple spaces/tabs -> single space
+    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text) # Multiple empty lines -> double line break
+    text = re.sub(r'^\s+|\s+$', '', text)         # Remove leading/trailing whitespace
+
+    # Step 7: Fix orphaned punctuation (no escaped quotes can remain after step 4)
+    text = re.sub(r'^\s*[)\]}]+\s*', '', text)    # Remove orphaned closing brackets/parens at start
+    text = re.sub(r'\n\s*[)\]}]+\s*\n', '\n\n', text)  # Remove orphaned closing brackets on their own lines
+
+    return text
+
+def create_semantic_chunks(text: str, max_chunk_size: int, overlap_size: int) -> TypingList[str]:
+    """Break text into chunks along paragraph boundaries, with optional overlap.
+
+    Paragraphs (separated by blank lines) are packed greedily while the running
+    chunk stays within ``max_chunk_size`` characters. A pending chunk that is
+    still too long is handed to ``split_by_sentences``; when ``overlap_size`` is
+    positive, the tail of the last emitted chunk is prepended to the next one.
+    """
+    if not text:
+        return []
+    if len(text) <= max_chunk_size:
+        return [text]
+
+    def emit(candidate: str) -> None:
+        # Oversized pending text is re-split on sentence boundaries.
+        if len(candidate) > max_chunk_size:
+            result.extend(split_by_sentences(candidate, max_chunk_size, overlap_size))
+        else:
+            result.append(candidate.strip())
+
+    result = []
+    pending = ""
+    for raw_paragraph in text.split('\n\n'):
+        block = raw_paragraph.strip()
+        if not block:
+            continue
+
+        if not pending:
+            # The first paragraph of a chunk is always accepted as-is.
+            pending = block
+            continue
+
+        # The +2 accounts for the blank-line separator between paragraphs.
+        if len(pending) + len(block) + 2 <= max_chunk_size:
+            pending = pending + "\n\n" + block
+            continue
+
+        emit(pending)
+        # Seed the next chunk with the tail of the chunk that was just emitted.
+        if result and overlap_size > 0:
+            tail = get_text_overlap(result[-1], overlap_size)
+        else:
+            tail = ""
+        pending = tail + "\n\n" + block if tail else block
+
+    if pending:
+        emit(pending)
+
+    return [piece for piece in result if piece.strip()]
+
+def split_by_sentences(text: str, max_chunk_size: int, overlap_size: int) -> TypingList[str]:
+    """Split text at sentence boundaries into chunks of about max_chunk_size characters.
+
+    Sentences are packed greedily. With a positive overlap_size every new chunk
+    starts with the tail of the previous one, so a chunk can exceed the limit by
+    that overlap plus one separator. A sentence longer than max_chunk_size is
+    wrapped at word boundaries (whitespace inside it collapses to single spaces)
+    so text without sentence punctuation can no longer yield an unbounded chunk.
+    """
+    import textwrap  # local import: only needed for the oversized-sentence fallback
+
+    pieces = []
+    for sentence in re.split(r'(?<=[.!?])\s+', text):
+        sentence = sentence.strip()
+        if 0 < max_chunk_size < len(sentence):
+            # Oversized sentence: fall back to word wrapping so the piece stays bounded.
+            pieces.extend(textwrap.wrap(sentence, width=max_chunk_size, break_on_hyphens=False))
+        elif sentence:
+            pieces.append(sentence)
+
+    chunks = []
+    current_chunk = ""
+    for piece in pieces:
+        if not current_chunk:
+            current_chunk = piece
+        elif len(current_chunk) + len(piece) + 1 <= max_chunk_size:
+            current_chunk += " " + piece
+        else:
+            # Flush the full chunk and seed the next one with its tail.
+            chunks.append(current_chunk.strip())
+            overlap_text = get_text_overlap(current_chunk, overlap_size) if overlap_size > 0 else ""
+            current_chunk = overlap_text + " " + piece if overlap_text else piece
+    if current_chunk:
+        chunks.append(current_chunk.strip())
+    return [chunk for chunk in chunks if chunk.strip()]
+
+def get_text_overlap(text: str, overlap_size: int) -> str:
+    """Return the trailing part of text to carry over into the next chunk.
+
+    At most overlap_size characters are taken from the end of text. If that
+    window contains a space after its first character, everything before the
+    space is dropped so the result does not begin with a partial word.
+    """
+    if not text or overlap_size <= 0:
+        return ""
+    if overlap_size >= len(text):
+        # The whole text fits inside the window; hand it back untouched.
+        return text
+
+    window = text[len(text) - overlap_size:]
+    first_space = window.find(' ')
+    # find() yields -1 (no space) or 0 (leading space): keep the whole window then.
+    return (window[first_space:] if first_space > 0 else window).strip()