From 48ccb1e1707cbc04c14cab98d1527a20c1c81fec Mon Sep 17 00:00:00 2001 From: Classic298 <27028174+Classic298@users.noreply.github.com> Date: Sun, 21 Dec 2025 13:14:29 +0100 Subject: [PATCH] fix: consolidate psql cleanup logic and fix web add with cleanup (#20072) * sequential * consolidate logic and fix for web add * Update WebSearch.svelte * Update retrieval.py * Update retrieval.py * Update WebSearch.svelte --- backend/open_webui/models/chats.py | 19 +++++------------- backend/open_webui/routers/retrieval.py | 3 ++- backend/open_webui/utils/misc.py | 26 +++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/backend/open_webui/models/chats.py b/backend/open_webui/models/chats.py index b4e3deaee..3c9507e0f 100644 --- a/backend/open_webui/models/chats.py +++ b/backend/open_webui/models/chats.py @@ -7,6 +7,7 @@ from typing import Optional from open_webui.internal.db import Base, get_db from open_webui.models.tags import TagModel, Tag, Tags from open_webui.models.folders import Folders +from open_webui.utils.misc import sanitize_data_for_db, sanitize_text_for_db from pydantic import BaseModel, ConfigDict from sqlalchemy import BigInteger, Boolean, Column, String, Text, JSON, Index @@ -169,18 +170,8 @@ class ChatUsageStatsListResponse(BaseModel): class ChatTable: def _clean_null_bytes(self, obj): - """ - Recursively remove actual null bytes (\x00) and unicode escape \\u0000 - from strings inside dict/list structures. - Safe for JSON objects. 
- """ - if isinstance(obj, str): - return obj.replace("\x00", "").replace("\u0000", "") - elif isinstance(obj, dict): - return {k: self._clean_null_bytes(v) for k, v in obj.items()} - elif isinstance(obj, list): - return [self._clean_null_bytes(v) for v in obj] - return obj + """Recursively remove null bytes from strings in dict/list structures.""" + return sanitize_data_for_db(obj) def _sanitize_chat_row(self, chat_item): """ @@ -351,7 +342,7 @@ class ChatTable: # Sanitize message content for null characters before upserting if isinstance(message.get("content"), str): - message["content"] = message["content"].replace("\x00", "") + message["content"] = sanitize_text_for_db(message["content"]) chat = chat.chat history = chat.get("history", {}) @@ -771,7 +762,7 @@ class ChatTable: """ Filters chats based on a search query using Python, allowing pagination using skip and limit. """ - search_text = search_text.replace("\u0000", "").lower().strip() + search_text = sanitize_text_for_db(search_text).lower().strip() if not search_text: return self.get_chat_list_by_user_id( diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index 8f0c76775..43839d010 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -87,6 +87,7 @@ from open_webui.retrieval.utils import ( from open_webui.retrieval.vector.utils import filter_metadata from open_webui.utils.misc import ( calculate_sha256_string, + sanitize_text_for_db, ) from open_webui.utils.auth import get_admin_user, get_verified_user @@ -1378,7 +1379,7 @@ def save_docs_to_vector_db( if len(docs) == 0: raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT) - texts = [doc.page_content for doc in docs] + texts = [sanitize_text_for_db(doc.page_content) for doc in docs] metadatas = [ { **doc.metadata, diff --git a/backend/open_webui/utils/misc.py b/backend/open_webui/utils/misc.py index 85890e5af..5ee0627f3 100644 --- a/backend/open_webui/utils/misc.py +++ 
def sanitize_text_for_db(text: str) -> str:
    """Strip characters from ``text`` that PostgreSQL text fields cannot store.

    Removes NUL characters and lone surrogate code points (U+D800-U+DFFF),
    which cannot be encoded as valid UTF-8.

    Args:
        text: The value to sanitize.

    Returns:
        The sanitized string. A non-``str`` argument is returned unchanged,
        so callers may pass optional or mixed-type values without checking.
    """
    if not isinstance(text, str):
        return text
    # PostgreSQL cannot store \x00 in text fields.
    text = text.replace("\x00", "")
    # Surrogates are the only code points the UTF-8 codec refuses to encode,
    # so errors="ignore" drops exactly those. The strict decode that follows
    # cannot fail because the encoder only ever emits well-formed UTF-8.
    return text.encode("utf-8", errors="ignore").decode("utf-8")


def sanitize_data_for_db(obj):
    """Recursively sanitize every string in a nested structure for DB storage.

    Walks dicts, lists and tuples and applies ``sanitize_text_for_db`` to each
    string value and to each dict key. Values of any other type are returned
    unchanged. A new container is built at every level; the input is never
    mutated.

    Note:
        If two keys of the same dict differ only by characters that get
        stripped, they collapse into a single entry and the later one wins.

    Args:
        obj: A string, dict, list, tuple, or any other value.

    Returns:
        A sanitized copy of ``obj`` (or ``obj`` itself for non-container,
        non-string values).
    """
    if isinstance(obj, str):
        return sanitize_text_for_db(obj)
    if isinstance(obj, dict):
        # Keys are serialized as strings too, so they need the same cleanup;
        # sanitize_text_for_db passes non-string keys through untouched.
        return {
            sanitize_text_for_db(key): sanitize_data_for_db(value)
            for key, value in obj.items()
        }
    if isinstance(obj, list):
        return [sanitize_data_for_db(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(sanitize_data_for_db(item) for item in obj)
    return obj