diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py
index caabddd03..b88677bb2 100644
--- a/backend/open_webui/routers/retrieval.py
+++ b/backend/open_webui/routers/retrieval.py
@@ -199,173 +199,181 @@ def get_rf(
class TextCleaner:
"""Modular text cleaning system for document processing and embedding preparation."""
-
+
@staticmethod
def normalize_escape_sequences(text: str) -> str:
"""Normalize escape sequences from various document formats."""
if not text:
return ""
-
+
# Handle double-escaped sequences (common in PPTX)
replacements = [
- ('\\\\n', '\n'), # Double-escaped newlines
- ('\\\\t', ' '), # Double-escaped tabs
- ('\\\\"', '"'), # Double-escaped quotes
- ('\\\\r', ''), # Double-escaped carriage returns
- ('\\\\/', '/'), # Double-escaped slashes
- ('\\\\', '\\'), # Convert double backslashes to single
+ ("\\\\n", "\n"), # Double-escaped newlines
+ ("\\\\t", " "), # Double-escaped tabs
+ ('\\\\"', '"'), # Double-escaped quotes
+ ("\\\\r", ""), # Double-escaped carriage returns
+ ("\\\\/", "/"), # Double-escaped slashes
+ ("\\\\", "\\"), # Convert double backslashes to single
]
-
+
for old, new in replacements:
text = text.replace(old, new)
-
+
# Handle single-escaped sequences
single_replacements = [
- ('\\n', '\n'), # Single-escaped newlines
- ('\\t', ' '), # Single-escaped tabs
- ('\\"', '"'), # Single-escaped quotes
- ('\\\'', "'"), # Single-escaped single quotes
- ('\\r', ''), # Single-escaped carriage returns
- ('\\/', '/'), # Single-escaped slashes
+ ("\\n", "\n"), # Single-escaped newlines
+ ("\\t", " "), # Single-escaped tabs
+ ('\\"', '"'), # Single-escaped quotes
+ ("\\'", "'"), # Single-escaped single quotes
+ ("\\r", ""), # Single-escaped carriage returns
+ ("\\/", "/"), # Single-escaped slashes
]
-
+
for old, new in single_replacements:
text = text.replace(old, new)
-
+
# Remove any remaining backslash artifacts
- text = re.sub(r'\\[a-zA-Z]', '', text) # Remove \letter patterns
- text = re.sub(r'\\[0-9]', '', text) # Remove \number patterns
- text = re.sub(r'\\[^a-zA-Z0-9\s]', '', text) # Remove \symbol patterns
- text = re.sub(r'\\+', '', text) # Remove remaining backslashes
-
+ text = re.sub(r"\\[a-zA-Z]", "", text) # Remove \letter patterns
+ text = re.sub(r"\\[0-9]", "", text) # Remove \number patterns
+ text = re.sub(r"\\[^a-zA-Z0-9\s]", "", text) # Remove \symbol patterns
+ text = re.sub(r"\\+", "", text) # Remove remaining backslashes
+
return text
-
+
@staticmethod
def normalize_unicode(text: str) -> str:
"""Convert special Unicode characters to ASCII equivalents."""
if not text:
return ""
-
+
unicode_map = {
- '–': '-', # En dash
- '—': '-', # Em dash
-            '‘': "'",      # Smart single quote left
-            '’': "'",      # Smart single quote right
-            '“': '"',      # Smart double quote left
-            '”': '"',      # Smart double quote right
- '…': '...', # Ellipsis
- '™': ' TM', # Trademark
- '®': ' R', # Registered
- '©': ' C', # Copyright
- '°': ' deg', # Degree symbol
+ "–": "-", # En dash
+ "—": "-", # Em dash
+            "‘": "'",  # Smart single quote left
+            "’": "'",  # Smart single quote right
+            "“": '"',  # Smart double quote left
+            "”": '"',  # Smart double quote right
+ "…": "...", # Ellipsis
+ "™": " TM", # Trademark
+ "®": " R", # Registered
+ "©": " C", # Copyright
+ "°": " deg", # Degree symbol
}
-
+
for unicode_char, ascii_char in unicode_map.items():
text = text.replace(unicode_char, ascii_char)
-
+
return text
-
+
@staticmethod
def normalize_quotes(text: str) -> str:
"""Clean up quote-related artifacts and normalize quote marks."""
if not text:
return ""
-
+
# Remove quote artifacts
quote_patterns = [
- (r'\\+"', '"'), # Multiple backslashes before quotes
- (r'\\"', '"'), # Escaped double quotes
- (r"\\'", "'"), # Escaped single quotes
- (r'\\&', '&'), # Escaped ampersands
- (r'""', '"'), # Double quotes
- (r"''", "'"), # Double single quotes
+ (r'\\+"', '"'), # Multiple backslashes before quotes
+ (r'\\"', '"'), # Escaped double quotes
+ (r"\\'", "'"), # Escaped single quotes
+ (r"\\&", "&"), # Escaped ampersands
+            (r'""', '"'),  # Doubled double quotes -> single
+            (r"''", "'"),  # Doubled single quotes -> single
]
-
+
for pattern, replacement in quote_patterns:
text = re.sub(pattern, replacement, text)
-
+
return text
-
+
@staticmethod
def normalize_whitespace(text: str, preserve_paragraphs: bool = True) -> str:
"""Normalize whitespace while optionally preserving paragraph structure."""
if not text:
return ""
-
+
if preserve_paragraphs:
# Preserve paragraph breaks (double newlines) but clean up excessive spacing
- text = re.sub(r'[ \t]+', ' ', text) # Multiple spaces/tabs -> single space
- text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text) # Multiple empty lines -> double line break
- text = re.sub(r'^\s+|\s+$', '', text, flags=re.MULTILINE) # Trim line-level whitespace
+ text = re.sub(r"[ \t]+", " ", text) # Multiple spaces/tabs -> single space
+ text = re.sub(
+ r"\n\s*\n\s*\n+", "\n\n", text
+ ) # Multiple empty lines -> double line break
+ text = re.sub(
+ r"^\s+|\s+$", "", text, flags=re.MULTILINE
+ ) # Trim line-level whitespace
else:
# Flatten all whitespace for embedding
- text = re.sub(r'\n+', ' ', text) # All newlines to spaces
- text = re.sub(r'\s+', ' ', text) # All whitespace to single spaces
-
+ text = re.sub(r"\n+", " ", text) # All newlines to spaces
+ text = re.sub(r"\s+", " ", text) # All whitespace to single spaces
+
return text.strip()
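+
+    # Illustrative (assumed input): normalize_whitespace("a   b\n\n\n\nc", preserve_paragraphs=True)
+    # is expected to give "a b\n\nc", while preserve_paragraphs=False flattens it to "a b c".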
-
+
@staticmethod
def remove_artifacts(text: str) -> str:
"""Remove document format artifacts and orphaned elements."""
if not text:
return ""
-
+
# Remove orphaned punctuation
- text = re.sub(r'^\s*[)\]}]+\s*', '', text) # Orphaned closing brackets at start
- text = re.sub(r'\n\s*[)\]}]+\s*\n', '\n\n', text) # Orphaned closing brackets on own lines
-
+ text = re.sub(r"^\s*[)\]}]+\s*", "", text) # Orphaned closing brackets at start
+ text = re.sub(
+ r"\n\s*[)\]}]+\s*\n", "\n\n", text
+ ) # Orphaned closing brackets on own lines
+
# Remove excessive punctuation
- text = re.sub(r'[.]{3,}', '...', text) # Multiple dots to ellipsis
- text = re.sub(r'[-]{3,}', '---', text) # Multiple dashes
-
+ text = re.sub(r"[.]{3,}", "...", text) # Multiple dots to ellipsis
+ text = re.sub(r"[-]{3,}", "---", text) # Multiple dashes
+
# Remove empty parentheses and brackets
- text = re.sub(r'\(\s*\)', '', text) # Empty parentheses
- text = re.sub(r'\[\s*\]', '', text) # Empty square brackets
- text = re.sub(r'\{\s*\}', '', text) # Empty curly brackets
-
+ text = re.sub(r"\(\s*\)", "", text) # Empty parentheses
+ text = re.sub(r"\[\s*\]", "", text) # Empty square brackets
+ text = re.sub(r"\{\s*\}", "", text) # Empty curly brackets
+
return text
-
+
@classmethod
def clean_for_chunking(cls, text: str) -> str:
"""Clean text for semantic chunking - preserves structure but normalizes content."""
if not text:
return ""
-
+
# Apply all cleaning steps while preserving paragraph structure
text = cls.normalize_escape_sequences(text)
text = cls.normalize_unicode(text)
text = cls.normalize_quotes(text)
text = cls.remove_artifacts(text)
text = cls.normalize_whitespace(text, preserve_paragraphs=True)
-
+
return text
-
+
@classmethod
def clean_for_embedding(cls, text: str) -> str:
"""Clean text for embedding - flattens structure and optimizes for vector similarity."""
if not text:
return ""
-
+
# Start with chunking-level cleaning
text = cls.clean_for_chunking(text)
-
+
# Flatten for embedding
text = cls.normalize_whitespace(text, preserve_paragraphs=False)
-
+
return text
-
+
@classmethod
def clean_for_storage(cls, text: str) -> str:
"""Clean text for storage - most aggressive cleaning for database storage."""
if not text:
return ""
-
+
# Start with embedding-level cleaning
text = cls.clean_for_embedding(text)
-
+
# Additional aggressive cleaning for storage
- text = re.sub(r'\\([^a-zA-Z0-9\s])', r'\1', text) # Remove any remaining escape sequences
-
+ text = re.sub(
+ r"\\([^a-zA-Z0-9\s])", r"\1", text
+ ) # Remove any remaining escape sequences
+
return text
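+
+
+# Illustrative sketch (assumed input, not part of the change itself): the three
+# cleaning levels build on one another.
+#
+#   raw = "Hello\\n\\nWorld\\t…"              # escaped text, e.g. from a PPTX extract
+#   TextCleaner.clean_for_chunking(raw)   # -> "Hello\n\nWorld ..."  (paragraph break kept)
+#   TextCleaner.clean_for_embedding(raw)  # -> "Hello World ..."     (flattened to one line)
+#   TextCleaner.clean_for_storage(raw)    # -> "Hello World ..."     (plus a final de-escaping pass)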
@@ -374,37 +382,43 @@ def clean_text_content(text: str) -> str:
return TextCleaner.clean_for_chunking(text)
-def create_semantic_chunks(text: str, max_chunk_size: int, overlap_size: int) -> TypingList[str]:
+def create_semantic_chunks(
+ text: str, max_chunk_size: int, overlap_size: int
+) -> TypingList[str]:
"""Create semantically aware chunks that respect document structure"""
if not text or len(text) <= max_chunk_size:
return [text] if text else []
-
+
chunks = []
-
+
# Split by double line breaks (paragraphs) first
- paragraphs = text.split('\n\n')
-
+ paragraphs = text.split("\n\n")
+
current_chunk = ""
-
+
for paragraph in paragraphs:
paragraph = paragraph.strip()
if not paragraph:
continue
-
+
# If adding this paragraph would exceed chunk size
if current_chunk and len(current_chunk) + len(paragraph) + 2 > max_chunk_size:
# Try to split the current chunk at sentence boundaries if it's too long
if len(current_chunk) > max_chunk_size:
- sentence_chunks = split_by_sentences(current_chunk, max_chunk_size, overlap_size)
+ sentence_chunks = split_by_sentences(
+ current_chunk, max_chunk_size, overlap_size
+ )
chunks.extend(sentence_chunks)
else:
chunks.append(current_chunk.strip())
-
+
# Start new chunk with overlap from previous chunk if applicable
if chunks and overlap_size > 0:
prev_chunk = chunks[-1]
overlap_text = get_text_overlap(prev_chunk, overlap_size)
- current_chunk = overlap_text + "\n\n" + paragraph if overlap_text else paragraph
+ current_chunk = (
+ overlap_text + "\n\n" + paragraph if overlap_text else paragraph
+ )
else:
current_chunk = paragraph
else:
@@ -413,39 +427,45 @@ def create_semantic_chunks(text: str, max_chunk_size: int, overlap_size: int) ->
current_chunk += "\n\n" + paragraph
else:
current_chunk = paragraph
-
+
# Add the last chunk
if current_chunk:
if len(current_chunk) > max_chunk_size:
- sentence_chunks = split_by_sentences(current_chunk, max_chunk_size, overlap_size)
+ sentence_chunks = split_by_sentences(
+ current_chunk, max_chunk_size, overlap_size
+ )
chunks.extend(sentence_chunks)
else:
chunks.append(current_chunk.strip())
-
+
return [chunk for chunk in chunks if chunk.strip()]
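+
+
+# Illustrative sketch (assumed sizes): with max_chunk_size=40 and overlap_size=0,
+# three short paragraphs yield one chunk per paragraph, since each fits on its
+# own but no two fit together:
+#
+#   create_semantic_chunks(
+#       "First paragraph about apples.\n\nSecond paragraph about oranges.\n\nThird paragraph about pears.",
+#       40, 0,
+#   )
+#   # -> ["First paragraph about apples.", "Second paragraph about oranges.", "Third paragraph about pears."]
+#
+# With overlap_size > 0, each new chunk is prefixed with the word-aligned tail of
+# the previous chunk (see get_text_overlap below).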
-def split_by_sentences(text: str, max_chunk_size: int, overlap_size: int) -> TypingList[str]:
+def split_by_sentences(
+ text: str, max_chunk_size: int, overlap_size: int
+) -> TypingList[str]:
"""Split text by sentences when paragraph-level splitting isn't sufficient"""
# Split by sentence endings
- sentences = re.split(r'(?<=[.!?])\s+', text)
-
+ sentences = re.split(r"(?<=[.!?])\s+", text)
+
chunks = []
current_chunk = ""
-
+
for sentence in sentences:
sentence = sentence.strip()
if not sentence:
continue
-
+
# If adding this sentence would exceed chunk size
if current_chunk and len(current_chunk) + len(sentence) + 1 > max_chunk_size:
chunks.append(current_chunk.strip())
-
+
# Start new chunk with overlap
if overlap_size > 0:
overlap_text = get_text_overlap(current_chunk, overlap_size)
- current_chunk = overlap_text + " " + sentence if overlap_text else sentence
+ current_chunk = (
+ overlap_text + " " + sentence if overlap_text else sentence
+ )
else:
current_chunk = sentence
else:
@@ -454,11 +474,11 @@ def split_by_sentences(text: str, max_chunk_size: int, overlap_size: int) -> Typ
current_chunk += " " + sentence
else:
current_chunk = sentence
-
+
# Add the last chunk
if current_chunk:
chunks.append(current_chunk.strip())
-
+
return [chunk for chunk in chunks if chunk.strip()]
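+
+
+# Illustrative sketch (assumed sizes): an over-long paragraph is re-split at
+# sentence boundaries, packing sentences greedily up to max_chunk_size:
+#
+#   split_by_sentences("One sentence here. Another sentence here. A third one.", 40, 0)
+#   # -> ["One sentence here.", "Another sentence here. A third one."]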
@@ -466,18 +486,18 @@ def get_text_overlap(text: str, overlap_size: int) -> str:
"""Get the last overlap_size characters from text, preferring word boundaries"""
if not text or overlap_size <= 0:
return ""
-
+
if len(text) <= overlap_size:
return text
-
+
# Try to find a good word boundary within the overlap region
overlap_text = text[-overlap_size:]
-
+
# Find the first space to avoid cutting words
- space_index = overlap_text.find(' ')
+ space_index = overlap_text.find(" ")
if space_index > 0:
return overlap_text[space_index:].strip()
-
+
return overlap_text.strip()
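+
+
+# Illustrative sketch: the overlap window is trimmed to a word boundary so that
+# an overlapped chunk never starts mid-word.
+#
+#   get_text_overlap("chunking keeps neighbouring context", 12)
+#   # last 12 chars are "ring context"; the partial word is dropped -> "context"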
@@ -570,7 +590,8 @@ async def update_embedding_config(
request: Request, form_data: EmbeddingModelUpdateForm, user=Depends(get_admin_user)
):
log.info(
- f"Updating embedding model: {request.app.state.config.RAG_EMBEDDING_MODEL} to {form_data.embedding_model}"
+ f"Updating embedding model: {request.app.state.config.RAG_EMBEDDING_MODEL} "
+ f"to {form_data.embedding_model}"
)
try:
request.app.state.config.RAG_EMBEDDING_ENGINE = form_data.embedding_engine
@@ -1396,34 +1417,33 @@ def save_docs_to_vector_db(
if split:
# Apply advanced content-aware splitting and text cleaning
processed_docs = []
-
+
for doc in docs:
# Clean the text content before chunking
if not doc.page_content:
continue
-
+
# Apply text cleaning before chunking using new modular system
cleaned_content = TextCleaner.clean_for_chunking(doc.page_content)
-
+
# Create semantic chunks from cleaned content
chunks = create_semantic_chunks(
cleaned_content,
request.app.state.config.CHUNK_SIZE,
- request.app.state.config.CHUNK_OVERLAP
+ request.app.state.config.CHUNK_OVERLAP,
)
-
+
# Create new documents for each chunk
for i, chunk in enumerate(chunks):
chunk_metadata = {
**doc.metadata,
"chunk_index": i,
- "total_chunks": len(chunks)
+ "total_chunks": len(chunks),
}
- processed_docs.append(Document(
- page_content=chunk,
- metadata=chunk_metadata
- ))
-
+ processed_docs.append(
+ Document(page_content=chunk, metadata=chunk_metadata)
+ )
+
docs = processed_docs
if len(docs) == 0:
@@ -1501,7 +1521,7 @@ def save_docs_to_vector_db(
# Prepare texts for embedding using the new modular cleaning system
cleaned_texts = [TextCleaner.clean_for_embedding(text) for text in texts]
-
+
embeddings = embedding_function(
cleaned_texts,
prefix=RAG_EMBEDDING_CONTENT_PREFIX,
@@ -1513,13 +1533,15 @@ def save_docs_to_vector_db(
for idx in range(len(texts)):
# Apply consistent storage-level cleaning
text_to_store = TextCleaner.clean_for_storage(texts[idx])
-
- items.append({
- "id": str(uuid.uuid4()),
- "text": text_to_store,
- "vector": embeddings[idx],
- "metadata": metadatas[idx],
- })
+
+ items.append(
+ {
+ "id": str(uuid.uuid4()),
+ "text": text_to_store,
+ "vector": embeddings[idx],
+ "metadata": metadatas[idx],
+ }
+ )
VECTOR_DB_CLIENT.insert(
collection_name=collection_name,
@@ -1565,7 +1587,9 @@ def process_file(
docs = [
Document(
-            page_content=TextCleaner.clean_for_chunking(form_data.content.replace("<br/>", "\n")),
+ page_content=TextCleaner.clean_for_chunking(
+                form_data.content.replace("<br/>", "\n")
+ ),
metadata={
**file.meta,
"name": file.filename,
@@ -1588,7 +1612,9 @@ def process_file(
if result is not None and len(result.ids[0]) > 0:
docs = [
Document(
- page_content=TextCleaner.clean_for_chunking(result.documents[0][idx]),
+ page_content=TextCleaner.clean_for_chunking(
+ result.documents[0][idx]
+ ),
metadata=result.metadatas[0][idx],
)
for idx, id in enumerate(result.ids[0])
@@ -1596,7 +1622,9 @@ def process_file(
else:
docs = [
Document(
- page_content=TextCleaner.clean_for_chunking(file.data.get("content", "")),
+ page_content=TextCleaner.clean_for_chunking(
+ file.data.get("content", "")
+ ),
metadata={
**file.meta,
"name": file.filename,
@@ -1645,22 +1673,26 @@ def process_file(
cleaned_docs = []
for doc in docs:
cleaned_content = TextCleaner.clean_for_chunking(doc.page_content)
-
- cleaned_docs.append(Document(
- page_content=cleaned_content,
- metadata={
- **doc.metadata,
- "name": file.filename,
- "created_by": file.user_id,
- "file_id": file.id,
- "source": file.filename,
- },
- ))
+
+ cleaned_docs.append(
+ Document(
+ page_content=cleaned_content,
+ metadata={
+ **doc.metadata,
+ "name": file.filename,
+ "created_by": file.user_id,
+ "file_id": file.id,
+ "source": file.filename,
+ },
+ )
+ )
docs = cleaned_docs
else:
docs = [
Document(
- page_content=TextCleaner.clean_for_chunking(file.data.get("content", "")),
+ page_content=TextCleaner.clean_for_chunking(
+ file.data.get("content", "")
+ ),
metadata={
**file.meta,
"name": file.filename,
@@ -1670,7 +1702,9 @@ def process_file(
},
)
]
- text_content = " ".join([doc.page_content for doc in docs if doc.page_content])
+ text_content = " ".join(
+ [doc.page_content for doc in docs if doc.page_content]
+ )
# Ensure text_content is never None or empty for hash calculation
if not text_content:
@@ -2449,7 +2483,9 @@ def process_files_batch(
docs: List[Document] = [
Document(
-        page_content=TextCleaner.clean_for_chunking(text_content.replace("<br/>", "\n")),
+ page_content=TextCleaner.clean_for_chunking(
+                text_content.replace("<br/>", "\n")
+ ),
metadata={
**file.meta,
"name": file.filename,
@@ -2509,10 +2545,10 @@ def delete_file_from_vector_db(file_id: str) -> bool:
Delete all vector embeddings for a specific file from the vector database.
This function works with any vector database (Pinecone, ChromaDB, etc.) and
handles the cleanup when a file is deleted from the chat.
-
+
Args:
file_id (str): The ID of the file to delete from vector database
-
+
Returns:
bool: True if deletion was successful, False otherwise
"""
@@ -2521,30 +2557,32 @@ def delete_file_from_vector_db(file_id: str) -> bool:
file = Files.get_file_by_id(file_id)
if not file:
return False
-
+
# Get the file hash for vector deletion
file_hash = file.hash
if not file_hash:
return False
-
+
# Try to get collection name from file metadata
collection_name = None
- if hasattr(file, 'meta') and file.meta:
- collection_name = file.meta.get('collection_name')
-
+ if hasattr(file, "meta") and file.meta:
+ collection_name = file.meta.get("collection_name")
+
# If no collection name in metadata, try common patterns used by Open WebUI
if not collection_name:
# Open WebUI typically uses these patterns:
possible_collections = [
f"open-webui_file-{file_id}", # Most common pattern
- f"file-{file_id}", # Alternative pattern
- f"open-webui_{file_id}", # Another possible pattern
+ f"file-{file_id}", # Alternative pattern
+ f"open-webui_{file_id}", # Another possible pattern
]
-
+
# Try each possible collection name
for possible_collection in possible_collections:
try:
- if VECTOR_DB_CLIENT.has_collection(collection_name=possible_collection):
+ if VECTOR_DB_CLIENT.has_collection(
+ collection_name=possible_collection
+ ):
result = VECTOR_DB_CLIENT.delete(
collection_name=possible_collection,
filter={"hash": file_hash},
@@ -2553,19 +2591,21 @@ def delete_file_from_vector_db(file_id: str) -> bool:
return True
except Exception as e:
continue
-
+
# If none of the standard patterns work, try searching through all collections
try:
deleted_count = 0
-
+
# Get all collections (this method varies by vector DB implementation)
- if hasattr(VECTOR_DB_CLIENT, 'list_collections'):
+ if hasattr(VECTOR_DB_CLIENT, "list_collections"):
try:
collections = VECTOR_DB_CLIENT.list_collections()
-
+
for collection in collections:
try:
- if VECTOR_DB_CLIENT.has_collection(collection_name=collection):
+ if VECTOR_DB_CLIENT.has_collection(
+ collection_name=collection
+ ):
result = VECTOR_DB_CLIENT.delete(
collection_name=collection,
filter={"hash": file_hash},
@@ -2576,14 +2616,16 @@ def delete_file_from_vector_db(file_id: str) -> bool:
continue
except Exception as e:
pass
-
+
return deleted_count > 0
-
+
except Exception as e:
return False
-
+
# Delete from the specific collection found in metadata
- if collection_name and VECTOR_DB_CLIENT.has_collection(collection_name=collection_name):
+ if collection_name and VECTOR_DB_CLIENT.has_collection(
+ collection_name=collection_name
+ ):
try:
result = VECTOR_DB_CLIENT.delete(
collection_name=collection_name,
@@ -2596,6 +2638,6 @@ def delete_file_from_vector_db(file_id: str) -> bool:
return False
else:
return False
-
+
except Exception as e:
return False
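+
+
+# Illustrative caller-side sketch (hypothetical wiring, not part of this change):
+#
+#   if not delete_file_from_vector_db(file.id):
+#       log.warning(f"No vector entries removed for file {file.id}")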