Merge branch 'open-webui:main' into routellm-pipeline

2025-06-26 18:15:58 +00:00 · 2024-07-30 08:50:46 -04:00 · 2024-07-30 08:50:46 -04:00 · 7168ea3d08
commit 7168ea3d08
parent 0afe4aa849 cd6c092a53
2 changed files with 90 additions and 9 deletions
--- a/examples/filters/presidio_filter_pipeline.py
+++ b/examples/filters/presidio_filter_pipeline.py
@ -0,0 +1,81 @@
+"""
+title: Presidio PII Redaction Pipeline
+author: justinh-rahb
+date: 2024-07-07
+version: 0.1.0
+license: MIT
+description: A pipeline for redacting personally identifiable information (PII) using the Presidio library.
+requirements: presidio-analyzer, presidio-anonymizer
+"""
+
+import os
+from typing import List, Optional
+from pydantic import BaseModel
+from schemas import OpenAIChatMessage
+from presidio_analyzer import AnalyzerEngine
+from presidio_anonymizer import AnonymizerEngine
+from presidio_anonymizer.entities import OperatorConfig
+
+class Pipeline:
+    """Filter pipeline that redacts PII from user chat messages using Presidio.
+
+    Configuration is read once, at construction time, from the
+    ``PII_REDACT_*`` environment variables and stored in ``self.valves``.
+    """
+
+    class Valves(BaseModel):
+        # Pipeline ids this filter is attached to ("*" by default).
+        # NOTE(review): matching semantics are defined by the host framework.
+        pipelines: List[str] = ["*"]
+        # NOTE(review): presumably the ordering among filters - confirm
+        # against the host framework.
+        priority: int = 0
+        # When False, messages sent by users whose role is "admin" are left
+        # untouched by ``inlet``.
+        enabled_for_admins: bool = False
+        # Presidio entity types handed to the analyzer for detection.
+        entities_to_redact: List[str] = [
+            "PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER", "US_SSN",
+            "CREDIT_CARD", "IP_ADDRESS", "US_PASSPORT", "LOCATION",
+            "DATE_TIME", "NRP", "MEDICAL_LICENSE", "URL"
+        ]
+        # Language code handed to the Presidio analyzer.
+        language: str = "en"
+
+    @staticmethod
+    def _split_csv(raw: str) -> List[str]:
+        """Split a comma-separated setting, trimming whitespace around items."""
+        return [item.strip() for item in raw.split(",")]
+
+    def __init__(self):
+        """Load the valves from the environment and create the Presidio engines."""
+        self.type = "filter"
+        self.name = "Presidio PII Redaction Pipeline"
+
+        # The default entity list lives on Valves; reuse it as the fallback
+        # for the environment variable so the two cannot drift apart.
+        default_entities = ",".join(self.Valves().entities_to_redact)
+
+        self.valves = self.Valves(
+            **{
+                "pipelines": self._split_csv(os.getenv("PII_REDACT_PIPELINES", "*")),
+                "enabled_for_admins": os.getenv("PII_REDACT_ENABLED_FOR_ADMINS", "false").strip().lower() == "true",
+                "entities_to_redact": self._split_csv(os.getenv("PII_REDACT_ENTITIES", default_entities)),
+                "language": os.getenv("PII_REDACT_LANGUAGE", "en").strip(),
+            }
+        )
+
+        self.analyzer = AnalyzerEngine()
+        self.anonymizer = AnonymizerEngine()
+
+    async def on_startup(self):
+        """Log that the pipeline module has started."""
+        print(f"on_startup:{__name__}")
+
+    async def on_shutdown(self):
+        """Log that the pipeline module is shutting down."""
+        print(f"on_shutdown:{__name__}")
+
+    def redact_pii(self, text: str) -> str:
+        """Return ``text`` with every detected entity replaced by ``[REDACTED]``.
+
+        Detection uses the configured language and entity list from
+        ``self.valves``. Empty input is returned unchanged.
+        """
+        if not text:
+            return text
+
+        results = self.analyzer.analyze(
+            text=text,
+            language=self.valves.language,
+            entities=self.valves.entities_to_redact
+        )
+
+        anonymized_text = self.anonymizer.anonymize(
+            text=text,
+            analyzer_results=results,
+            operators={
+                # One operator for every entity type: blanket replacement.
+                "DEFAULT": OperatorConfig("replace", {"new_value": "[REDACTED]"})
+            }
+        )
+
+        return anonymized_text.text
+
+    def _redact_content(self, content):
+        """Return ``content`` with PII redacted from any text it carries.
+
+        Plain strings are redacted directly. For list-valued content
+        (OpenAI-style content parts) only parts of type ``"text"`` are
+        redacted; other parts are passed through. Any other value
+        (e.g. ``None``) is returned unchanged.
+        """
+        if isinstance(content, str):
+            return self.redact_pii(content)
+
+        if isinstance(content, list):
+            redacted_parts = []
+            for part in content:
+                if (
+                    isinstance(part, dict)
+                    and part.get("type") == "text"
+                    and isinstance(part.get("text"), str)
+                ):
+                    part = {**part, "text": self.redact_pii(part["text"])}
+                redacted_parts.append(part)
+            return redacted_parts
+
+        return content
+
+    async def inlet(self, body: dict, user: Optional[dict] = None) -> dict:
+        """Redact PII from the user messages of an incoming request body.
+
+        Args:
+            body: Request payload; its ``"messages"`` entries are modified
+                in place.
+            user: Requesting user, if known. Users whose ``"role"`` is
+                ``"admin"`` are skipped unless ``enabled_for_admins`` is set.
+
+        Returns:
+            The same ``body`` object, with user message content redacted.
+        """
+        # Log only the hook name: dumping body/user here would write the raw,
+        # not-yet-redacted PII to the logs.
+        print(f"pipe:{__name__}")
+
+        is_admin = user is not None and user.get("role") == "admin"
+        if is_admin and not self.valves.enabled_for_admins:
+            return body
+
+        # "messages" may be absent or explicitly null.
+        for message in body.get("messages") or []:
+            if message.get("role") == "user" and "content" in message:
+                message["content"] = self._redact_content(message["content"])
+
+        return body
--- a/examples/pipelines/rag/text_to_sql_pipeline.py
+++ b/examples/pipelines/rag/text_to_sql_pipeline.py
@ -38,15 +38,15 @@ class Pipeline:
        # Initialize
        self.valves = self.Valves(
            **{
-                "pipelines": ["*"],                                 # Connect to all pipelines
-                "DB_HOST": os.environ["PG_HOST"],                   # Database hostname
-                "DB_PORT": os.environ["PG_PORT"],                   # Database port 
-                "DB_USER": os.environ["PG_USER"],                   # User to connect to the database with
-                "DB_PASSWORD": os.environ["PG_PASSWORD"],           # Password to connect to the database with
-                "DB_DATABASE": os.environ["PG_DB"],                 # Database to select on the DB instance
-                "DB_TABLES": ["albums"],                            # Table(s) to run queries against 
-                "OLLAMA_HOST": "http://host.docker.internal:11434", # Make sure to update with the URL of your Ollama host, such as http://localhost:11434 or remote server address
-                "TEXT_TO_SQL_MODEL": "phi3:latest"                  # Model to use for text-to-SQL generation      
+                "pipelines": ["*"],                                                           # Connect to all pipelines
+                "DB_HOST": os.getenv("PG_HOST", "http://localhost:5432"),                     # Database hostname
+                "DB_PORT": os.getenv("PG_PORT", 5432),                                        # Database port 
+                "DB_USER": os.getenv("PG_USER", "postgres"),                                  # User to connect to the database with
+                "DB_PASSWORD": os.getenv("PG_PASSWORD", "password"),                          # Password to connect to the database with
+                "DB_DATABASE": os.getenv("PG_DB", "postgres"),                                # Database to select on the DB instance
+                "DB_TABLES": ["albums"],                                                      # Table(s) to run queries against 
+                "OLLAMA_HOST": os.getenv("OLLAMA_HOST", "http://host.docker.internal:11434"), # Make sure to update with the URL of your Ollama host, such as http://localhost:11434 or remote server address
+                "TEXT_TO_SQL_MODEL": "phi3:latest"                                            # Model to use for text-to-SQL generation      
            }
        )