diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py
index 343b0513c..caabddd03 100644
--- a/backend/open_webui/routers/retrieval.py
+++ b/backend/open_webui/routers/retrieval.py
@@ -4,6 +4,8 @@ import mimetypes
import os
import shutil
import asyncio
+import re
+from typing import List as TypingList
import uuid
@@ -188,6 +190,297 @@ def get_rf(
return rf
+##########################################
+#
+# Text cleaning and processing functions
+#
+##########################################
+
+
+class TextCleaner:
+ """Modular text cleaning system for document processing and embedding preparation."""
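+
+    # The three entry points build on one another:
+    #   clean_for_chunking  - normalizes escapes, unicode, quotes and artifacts, keeps paragraphs
+    #   clean_for_embedding - chunking cleanup plus flattened whitespace for vector similarity
+    #   clean_for_storage   - embedding cleanup plus a final unescaping pass for the database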
+
+ @staticmethod
+ def normalize_escape_sequences(text: str) -> str:
+ """Normalize escape sequences from various document formats."""
+ if not text:
+ return ""
+
+ # Handle double-escaped sequences (common in PPTX)
+ replacements = [
+ ('\\\\n', '\n'), # Double-escaped newlines
+ ('\\\\t', ' '), # Double-escaped tabs
+ ('\\\\"', '"'), # Double-escaped quotes
+ ('\\\\r', ''), # Double-escaped carriage returns
+ ('\\\\/', '/'), # Double-escaped slashes
+ ('\\\\', '\\'), # Convert double backslashes to single
+ ]
+
+ for old, new in replacements:
+ text = text.replace(old, new)
+
+ # Handle single-escaped sequences
+ single_replacements = [
+ ('\\n', '\n'), # Single-escaped newlines
+ ('\\t', ' '), # Single-escaped tabs
+ ('\\"', '"'), # Single-escaped quotes
+ ('\\\'', "'"), # Single-escaped single quotes
+ ('\\r', ''), # Single-escaped carriage returns
+ ('\\/', '/'), # Single-escaped slashes
+ ]
+
+ for old, new in single_replacements:
+ text = text.replace(old, new)
+
+ # Remove any remaining backslash artifacts
+ text = re.sub(r'\\[a-zA-Z]', '', text) # Remove \letter patterns
+ text = re.sub(r'\\[0-9]', '', text) # Remove \number patterns
+ text = re.sub(r'\\[^a-zA-Z0-9\s]', '', text) # Remove \symbol patterns
+ text = re.sub(r'\\+', '', text) # Remove remaining backslashes
+
+ return text
+
+ @staticmethod
+ def normalize_unicode(text: str) -> str:
+ """Convert special Unicode characters to ASCII equivalents."""
+ if not text:
+ return ""
+
+ unicode_map = {
+ '–': '-', # En dash
+ '—': '-', # Em dash
+            '\u2018': "'",  # Smart single quote left
+            '\u2019': "'",  # Smart single quote right
+            '\u201c': '"',  # Smart double quote left
+            '\u201d': '"',  # Smart double quote right
+ '…': '...', # Ellipsis
+ '™': ' TM', # Trademark
+ '®': ' R', # Registered
+ '©': ' C', # Copyright
+ '°': ' deg', # Degree symbol
+ }
+
+ for unicode_char, ascii_char in unicode_map.items():
+ text = text.replace(unicode_char, ascii_char)
+
+ return text
+
+ @staticmethod
+ def normalize_quotes(text: str) -> str:
+ """Clean up quote-related artifacts and normalize quote marks."""
+ if not text:
+ return ""
+
+ # Remove quote artifacts
+ quote_patterns = [
+ (r'\\+"', '"'), # Multiple backslashes before quotes
+ (r'\\"', '"'), # Escaped double quotes
+ (r"\\'", "'"), # Escaped single quotes
+ (r'\\&', '&'), # Escaped ampersands
+ (r'""', '"'), # Double quotes
+ (r"''", "'"), # Double single quotes
+ ]
+
+ for pattern, replacement in quote_patterns:
+ text = re.sub(pattern, replacement, text)
+
+ return text
+
+ @staticmethod
+ def normalize_whitespace(text: str, preserve_paragraphs: bool = True) -> str:
+ """Normalize whitespace while optionally preserving paragraph structure."""
+ if not text:
+ return ""
+
+ if preserve_paragraphs:
+ # Preserve paragraph breaks (double newlines) but clean up excessive spacing
+ text = re.sub(r'[ \t]+', ' ', text) # Multiple spaces/tabs -> single space
+ text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text) # Multiple empty lines -> double line break
+ text = re.sub(r'^\s+|\s+$', '', text, flags=re.MULTILINE) # Trim line-level whitespace
+ else:
+ # Flatten all whitespace for embedding
+ text = re.sub(r'\n+', ' ', text) # All newlines to spaces
+ text = re.sub(r'\s+', ' ', text) # All whitespace to single spaces
+
+ return text.strip()
+
+ @staticmethod
+ def remove_artifacts(text: str) -> str:
+ """Remove document format artifacts and orphaned elements."""
+ if not text:
+ return ""
+
+ # Remove orphaned punctuation
+ text = re.sub(r'^\s*[)\]}]+\s*', '', text) # Orphaned closing brackets at start
+ text = re.sub(r'\n\s*[)\]}]+\s*\n', '\n\n', text) # Orphaned closing brackets on own lines
+
+ # Remove excessive punctuation
+ text = re.sub(r'[.]{3,}', '...', text) # Multiple dots to ellipsis
+ text = re.sub(r'[-]{3,}', '---', text) # Multiple dashes
+
+ # Remove empty parentheses and brackets
+ text = re.sub(r'\(\s*\)', '', text) # Empty parentheses
+ text = re.sub(r'\[\s*\]', '', text) # Empty square brackets
+ text = re.sub(r'\{\s*\}', '', text) # Empty curly brackets
+
+ return text
+
+ @classmethod
+ def clean_for_chunking(cls, text: str) -> str:
+ """Clean text for semantic chunking - preserves structure but normalizes content."""
+ if not text:
+ return ""
+
+ # Apply all cleaning steps while preserving paragraph structure
+ text = cls.normalize_escape_sequences(text)
+ text = cls.normalize_unicode(text)
+ text = cls.normalize_quotes(text)
+ text = cls.remove_artifacts(text)
+ text = cls.normalize_whitespace(text, preserve_paragraphs=True)
+
+ return text
+
+ @classmethod
+ def clean_for_embedding(cls, text: str) -> str:
+ """Clean text for embedding - flattens structure and optimizes for vector similarity."""
+ if not text:
+ return ""
+
+ # Start with chunking-level cleaning
+ text = cls.clean_for_chunking(text)
+
+ # Flatten for embedding
+ text = cls.normalize_whitespace(text, preserve_paragraphs=False)
+
+ return text
+
+ @classmethod
+ def clean_for_storage(cls, text: str) -> str:
+ """Clean text for storage - most aggressive cleaning for database storage."""
+ if not text:
+ return ""
+
+ # Start with embedding-level cleaning
+ text = cls.clean_for_embedding(text)
+
+ # Additional aggressive cleaning for storage
+        text = re.sub(r'\\([^a-zA-Z0-9\s])', r'\1', text)  # Strip the backslash from any remaining escaped punctuation
+
+ return text
+
+
+def clean_text_content(text: str) -> str:
+ """Legacy function wrapper for backward compatibility."""
+ return TextCleaner.clean_for_chunking(text)
+
+
+def create_semantic_chunks(text: str, max_chunk_size: int, overlap_size: int) -> TypingList[str]:
+ """Create semantically aware chunks that respect document structure"""
+ if not text or len(text) <= max_chunk_size:
+ return [text] if text else []
+
+ chunks = []
+
+ # Split by double line breaks (paragraphs) first
+ paragraphs = text.split('\n\n')
+
+ current_chunk = ""
+
+ for paragraph in paragraphs:
+ paragraph = paragraph.strip()
+ if not paragraph:
+ continue
+
+ # If adding this paragraph would exceed chunk size
+ if current_chunk and len(current_chunk) + len(paragraph) + 2 > max_chunk_size:
+ # Try to split the current chunk at sentence boundaries if it's too long
+ if len(current_chunk) > max_chunk_size:
+ sentence_chunks = split_by_sentences(current_chunk, max_chunk_size, overlap_size)
+ chunks.extend(sentence_chunks)
+ else:
+ chunks.append(current_chunk.strip())
+
+ # Start new chunk with overlap from previous chunk if applicable
+ if chunks and overlap_size > 0:
+ prev_chunk = chunks[-1]
+ overlap_text = get_text_overlap(prev_chunk, overlap_size)
+ current_chunk = overlap_text + "\n\n" + paragraph if overlap_text else paragraph
+ else:
+ current_chunk = paragraph
+ else:
+ # Add paragraph to current chunk
+ if current_chunk:
+ current_chunk += "\n\n" + paragraph
+ else:
+ current_chunk = paragraph
+
+ # Add the last chunk
+ if current_chunk:
+ if len(current_chunk) > max_chunk_size:
+ sentence_chunks = split_by_sentences(current_chunk, max_chunk_size, overlap_size)
+ chunks.extend(sentence_chunks)
+ else:
+ chunks.append(current_chunk.strip())
+
+ return [chunk for chunk in chunks if chunk.strip()]
+
+
+def split_by_sentences(text: str, max_chunk_size: int, overlap_size: int) -> TypingList[str]:
+ """Split text by sentences when paragraph-level splitting isn't sufficient"""
+    # Split on whitespace following sentence-ending punctuation (the lookbehind keeps the punctuation attached)
+ sentences = re.split(r'(?<=[.!?])\s+', text)
+
+ chunks = []
+ current_chunk = ""
+
+ for sentence in sentences:
+ sentence = sentence.strip()
+ if not sentence:
+ continue
+
+ # If adding this sentence would exceed chunk size
+ if current_chunk and len(current_chunk) + len(sentence) + 1 > max_chunk_size:
+ chunks.append(current_chunk.strip())
+
+ # Start new chunk with overlap
+ if overlap_size > 0:
+ overlap_text = get_text_overlap(current_chunk, overlap_size)
+ current_chunk = overlap_text + " " + sentence if overlap_text else sentence
+ else:
+ current_chunk = sentence
+ else:
+ # Add sentence to current chunk
+ if current_chunk:
+ current_chunk += " " + sentence
+ else:
+ current_chunk = sentence
+
+ # Add the last chunk
+ if current_chunk:
+ chunks.append(current_chunk.strip())
+
+ return [chunk for chunk in chunks if chunk.strip()]
+
+
+def get_text_overlap(text: str, overlap_size: int) -> str:
+ """Get the last overlap_size characters from text, preferring word boundaries"""
+ if not text or overlap_size <= 0:
+ return ""
+
+ if len(text) <= overlap_size:
+ return text
+
+ # Try to find a good word boundary within the overlap region
+ overlap_text = text[-overlap_size:]
+
+    # Drop the leading partial word so the overlap starts on a word boundary
+ space_index = overlap_text.find(' ')
+ if space_index > 0:
+ return overlap_text[space_index:].strip()
+
+ return overlap_text.strip()
+
+
##########################################
#
# API routes
@@ -1101,28 +1394,37 @@ def save_docs_to_vector_db(
raise ValueError(ERROR_MESSAGES.DUPLICATE_CONTENT)
if split:
- if request.app.state.config.TEXT_SPLITTER in ["", "character"]:
- text_splitter = RecursiveCharacterTextSplitter(
- chunk_size=request.app.state.config.CHUNK_SIZE,
- chunk_overlap=request.app.state.config.CHUNK_OVERLAP,
- add_start_index=True,
+        # Clean each document and apply content-aware (paragraph/sentence) chunking
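+        # NOTE: this path replaces the configurable character/token text splitters;
+        # CHUNK_SIZE and CHUNK_OVERLAP are interpreted here as character counts.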
+ processed_docs = []
+
+ for doc in docs:
+            # Skip documents with no text content
+ if not doc.page_content:
+ continue
+
+            # Normalize the text before chunking
+ cleaned_content = TextCleaner.clean_for_chunking(doc.page_content)
+
+ # Create semantic chunks from cleaned content
+ chunks = create_semantic_chunks(
+ cleaned_content,
+ request.app.state.config.CHUNK_SIZE,
+ request.app.state.config.CHUNK_OVERLAP
)
- elif request.app.state.config.TEXT_SPLITTER == "token":
- log.info(
- f"Using token text splitter: {request.app.state.config.TIKTOKEN_ENCODING_NAME}"
- )
-
- tiktoken.get_encoding(str(request.app.state.config.TIKTOKEN_ENCODING_NAME))
- text_splitter = TokenTextSplitter(
- encoding_name=str(request.app.state.config.TIKTOKEN_ENCODING_NAME),
- chunk_size=request.app.state.config.CHUNK_SIZE,
- chunk_overlap=request.app.state.config.CHUNK_OVERLAP,
- add_start_index=True,
- )
- else:
- raise ValueError(ERROR_MESSAGES.DEFAULT("Invalid text splitter"))
-
- docs = text_splitter.split_documents(docs)
+
+ # Create new documents for each chunk
+ for i, chunk in enumerate(chunks):
+ chunk_metadata = {
+ **doc.metadata,
+ "chunk_index": i,
+ "total_chunks": len(chunks)
+ }
+ processed_docs.append(Document(
+ page_content=chunk,
+ metadata=chunk_metadata
+ ))
+
+ docs = processed_docs
if len(docs) == 0:
raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT)
@@ -1197,21 +1499,27 @@ def save_docs_to_vector_db(
),
)
+    # Clean and flatten texts before computing embeddings
+ cleaned_texts = [TextCleaner.clean_for_embedding(text) for text in texts]
+
embeddings = embedding_function(
- list(map(lambda x: x.replace("\n", " "), texts)),
+ cleaned_texts,
prefix=RAG_EMBEDDING_CONTENT_PREFIX,
user=user,
)
- items = [
- {
+    # Store storage-level cleaned text alongside each embedding vector
+ items = []
+ for idx in range(len(texts)):
+ # Apply consistent storage-level cleaning
+ text_to_store = TextCleaner.clean_for_storage(texts[idx])
+
+ items.append({
"id": str(uuid.uuid4()),
- "text": text,
+ "text": text_to_store,
"vector": embeddings[idx],
"metadata": metadatas[idx],
- }
- for idx, text in enumerate(texts)
- ]
+ })
VECTOR_DB_CLIENT.insert(
collection_name=collection_name,
@@ -1257,7 +1565,7 @@ def process_file(
docs = [
Document(
-            page_content=form_data.content.replace("<br/>", "\n"),
+            page_content=TextCleaner.clean_for_chunking(form_data.content.replace("<br/>", "\n")),
metadata={
**file.meta,
"name": file.filename,
@@ -1280,7 +1588,7 @@ def process_file(
if result is not None and len(result.ids[0]) > 0:
docs = [
Document(
- page_content=result.documents[0][idx],
+ page_content=TextCleaner.clean_for_chunking(result.documents[0][idx]),
metadata=result.metadatas[0][idx],
)
for idx, id in enumerate(result.ids[0])
@@ -1288,7 +1596,7 @@ def process_file(
else:
docs = [
Document(
- page_content=file.data.get("content", ""),
+ page_content=TextCleaner.clean_for_chunking(file.data.get("content", "")),
metadata={
**file.meta,
"name": file.filename,
@@ -1333,9 +1641,13 @@ def process_file(
file.filename, file.meta.get("content_type"), file_path
)
- docs = [
- Document(
- page_content=doc.page_content,
+ # Clean the loaded documents before processing
+ cleaned_docs = []
+ for doc in docs:
+ cleaned_content = TextCleaner.clean_for_chunking(doc.page_content)
+
+ cleaned_docs.append(Document(
+ page_content=cleaned_content,
metadata={
**doc.metadata,
"name": file.filename,
@@ -1343,13 +1655,12 @@ def process_file(
"file_id": file.id,
"source": file.filename,
},
- )
- for doc in docs
- ]
+ ))
+ docs = cleaned_docs
else:
docs = [
Document(
- page_content=file.data.get("content", ""),
+ page_content=TextCleaner.clean_for_chunking(file.data.get("content", "")),
metadata={
**file.meta,
"name": file.filename,
@@ -1359,7 +1670,11 @@ def process_file(
},
)
]
- text_content = " ".join([doc.page_content for doc in docs])
+ text_content = " ".join([doc.page_content for doc in docs if doc.page_content])
log.debug(f"text_content: {text_content}")
Files.update_file_data_by_id(
@@ -1367,7 +1682,9 @@ def process_file(
{"content": text_content},
)
- hash = calculate_sha256_string(text_content)
+    hash = calculate_sha256_string(text_content or "")
Files.update_file_hash_by_id(file.id, hash)
if not request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL:
@@ -1441,7 +1758,7 @@ def process_text(
docs = [
Document(
- page_content=form_data.content,
+ page_content=TextCleaner.clean_for_chunking(form_data.content),
metadata={"name": form_data.name, "created_by": user.id},
)
]
@@ -2132,7 +2449,7 @@ def process_files_batch(
docs: List[Document] = [
Document(
-                    page_content=text_content.replace("<br/>", "\n"),
+                    page_content=TextCleaner.clean_for_chunking(text_content.replace("<br/>", "\n")),
metadata={
**file.meta,
"name": file.filename,
@@ -2143,7 +2460,7 @@ def process_files_batch(
)
]
- hash = calculate_sha256_string(text_content)
+ hash = calculate_sha256_string(text_content or "")
Files.update_file_hash_by_id(file.id, hash)
Files.update_file_data_by_id(file.id, {"content": text_content})
@@ -2185,3 +2502,100 @@ def process_files_batch(
)
return BatchProcessFilesResponse(results=results, errors=errors)
+
+
+def delete_file_from_vector_db(file_id: str) -> bool:
+ """
+ Delete all vector embeddings for a specific file from the vector database.
+    Works with any configured vector database backend (Pinecone, ChromaDB, etc.)
+    and handles cleanup when a file is deleted from a chat.
+
+ Args:
+ file_id (str): The ID of the file to delete from vector database
+
+ Returns:
+ bool: True if deletion was successful, False otherwise
+ """
+ try:
+ # Get the file record to access its hash and collection info
+ file = Files.get_file_by_id(file_id)
+ if not file:
+ return False
+
+ # Get the file hash for vector deletion
+ file_hash = file.hash
+ if not file_hash:
+ return False
+
+ # Try to get collection name from file metadata
+ collection_name = None
+ if hasattr(file, 'meta') and file.meta:
+ collection_name = file.meta.get('collection_name')
+
+ # If no collection name in metadata, try common patterns used by Open WebUI
+ if not collection_name:
+ # Open WebUI typically uses these patterns:
+ possible_collections = [
+ f"open-webui_file-{file_id}", # Most common pattern
+ f"file-{file_id}", # Alternative pattern
+ f"open-webui_{file_id}", # Another possible pattern
+ ]
+
+ # Try each possible collection name
+ for possible_collection in possible_collections:
+ try:
+ if VECTOR_DB_CLIENT.has_collection(collection_name=possible_collection):
+ result = VECTOR_DB_CLIENT.delete(
+ collection_name=possible_collection,
+ filter={"hash": file_hash},
+ )
+ # Pinecone returns None on successful deletion
+ return True
+ except Exception as e:
+ continue
+
+ # If none of the standard patterns work, try searching through all collections
+ try:
+ deleted_count = 0
+
+ # Get all collections (this method varies by vector DB implementation)
+ if hasattr(VECTOR_DB_CLIENT, 'list_collections'):
+ try:
+ collections = VECTOR_DB_CLIENT.list_collections()
+
+ for collection in collections:
+ try:
+ if VECTOR_DB_CLIENT.has_collection(collection_name=collection):
+ result = VECTOR_DB_CLIENT.delete(
+ collection_name=collection,
+ filter={"hash": file_hash},
+ )
+ # Pinecone returns None on successful deletion, so any non-exception means success
+ deleted_count += 1
+ except Exception as e:
+ continue
+ except Exception as e:
+ pass
+
+ return deleted_count > 0
+
+ except Exception as e:
+ return False
+
+ # Delete from the specific collection found in metadata
+ if collection_name and VECTOR_DB_CLIENT.has_collection(collection_name=collection_name):
+ try:
+ result = VECTOR_DB_CLIENT.delete(
+ collection_name=collection_name,
+ filter={"hash": file_hash},
+ )
+ # Pinecone returns None on successful deletion, so we check for no exception
+ # rather than checking the return value
+ return True
+ except Exception as e:
+ return False
+ else:
+ return False
+
+ except Exception as e:
+ return False