diff --git a/examples/filters/presidio_filter_pipeline.py b/examples/filters/presidio_filter_pipeline.py
new file mode 100644
index 0000000..cca242d
--- /dev/null
+++ b/examples/filters/presidio_filter_pipeline.py
@@ -0,0 +1,132 @@
+"""
+title: Presidio PII Redaction Pipeline
+author: justinh-rahb
+date: 2024-07-07
+version: 0.1.0
+license: MIT
+description: A pipeline for redacting personally identifiable information (PII) using the Presidio library.
+requirements: presidio-analyzer, presidio-anonymizer
+"""
+
+import os
+from typing import List, Optional
+from pydantic import BaseModel
+from schemas import OpenAIChatMessage
+from presidio_analyzer import AnalyzerEngine
+from presidio_anonymizer import AnonymizerEngine
+from presidio_anonymizer.entities import OperatorConfig
+
+
+def _split_csv(raw: str) -> List[str]:
+    """Split a comma-separated string, trimming blanks and dropping empty items."""
+    return [item.strip() for item in raw.split(",") if item.strip()]
+
+
+class Pipeline:
+    """Filter pipeline that replaces PII in user messages with ``[REDACTED]``."""
+
+    class Valves(BaseModel):
+        """Runtime-configurable settings for the filter."""
+
+        # Pipelines this filter connects to; ["*"] means all pipelines.
+        pipelines: List[str] = ["*"]
+        # NOTE(review): presumably orders this filter relative to others - confirm.
+        priority: int = 0
+        # When False, requests from users with role "admin" are passed through unredacted.
+        enabled_for_admins: bool = False
+        # Presidio entity types handed to the analyzer for detection.
+        entities_to_redact: List[str] = [
+            "PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER", "US_SSN",
+            "CREDIT_CARD", "IP_ADDRESS", "US_PASSPORT", "LOCATION",
+            "DATE_TIME", "NRP", "MEDICAL_LICENSE", "URL"
+        ]
+        # Language code handed to the analyzer.
+        language: str = "en"
+
+    def __init__(self):
+        """Load valve values from ``PII_REDACT_*`` environment variables and build the engines."""
+        self.type = "filter"
+        self.name = "Presidio PII Redaction Pipeline"
+
+        defaults = self.Valves()
+        self.valves = self.Valves(
+            **{
+                # An unset, empty or whitespace-only variable falls back to the
+                # valve default instead of a list holding one empty string.
+                "pipelines": _split_csv(os.getenv("PII_REDACT_PIPELINES", "")) or defaults.pipelines,
+                "enabled_for_admins": os.getenv("PII_REDACT_ENABLED_FOR_ADMINS", "false").strip().lower() == "true",
+                "entities_to_redact": _split_csv(os.getenv("PII_REDACT_ENTITIES", "")) or defaults.entities_to_redact,
+                "language": os.getenv("PII_REDACT_LANGUAGE", "").strip() or defaults.language,
+            }
+        )
+
+        self.analyzer = AnalyzerEngine()
+        self.anonymizer = AnonymizerEngine()
+
+    async def on_startup(self):
+        """Print a startup marker for this module."""
+        print(f"on_startup:{__name__}")
+
+    async def on_shutdown(self):
+        """Print a shutdown marker for this module."""
+        print(f"on_shutdown:{__name__}")
+
+    def redact_pii(self, text: str) -> str:
+        """Return *text* with every detected entity replaced by ``[REDACTED]``.
+
+        Only the entity types in ``valves.entities_to_redact`` are searched for,
+        using ``valves.language``.
+        """
+        if not text:
+            # Nothing to analyze; also keeps None/"" from reaching the analyzer.
+            return text
+
+        results = self.analyzer.analyze(
+            text=text,
+            language=self.valves.language,
+            entities=self.valves.entities_to_redact,
+        )
+
+        anonymized_text = self.anonymizer.anonymize(
+            text=text,
+            analyzer_results=results,
+            operators={
+                "DEFAULT": OperatorConfig("replace", {"new_value": "[REDACTED]"})
+            },
+        )
+
+        return anonymized_text.text
+
+    async def inlet(self, body: dict, user: Optional[dict] = None) -> dict:
+        """Redact PII from the user messages in *body* and return *body*.
+
+        Messages are modified in place. Requests from users whose role is
+        ``"admin"`` are skipped unless ``valves.enabled_for_admins`` is set.
+        Message content may be a string or a list of parts; in a list, only
+        the ``text`` field of ``{"type": "text"}`` parts is redacted.
+        """
+        # Deliberately do not print body/user here: they carry the very PII
+        # this filter exists to remove, and would end up in the logs.
+        print(f"pipe:{__name__}")
+
+        is_admin = user is not None and user.get("role") == "admin"
+        if is_admin and not self.valves.enabled_for_admins:
+            return body
+
+        for message in body.get("messages", []):
+            if message.get("role") != "user":
+                continue
+
+            content = message.get("content")
+            if isinstance(content, str):
+                message["content"] = self.redact_pii(content)
+            elif isinstance(content, list):
+                for part in content:
+                    if (
+                        isinstance(part, dict)
+                        and part.get("type") == "text"
+                        and isinstance(part.get("text"), str)
+                    ):
+                        part["text"] = self.redact_pii(part["text"])
+
+        return body
diff --git a/examples/pipelines/rag/text_to_sql_pipeline.py b/examples/pipelines/rag/text_to_sql_pipeline.py
index ab30e72..22f936f 100644
--- a/examples/pipelines/rag/text_to_sql_pipeline.py
+++ b/examples/pipelines/rag/text_to_sql_pipeline.py
@@ -38,15 +38,15 @@ class Pipeline:
         # Initialize
         self.valves = self.Valves(
             **{
-                "pipelines": ["*"], # Connect to all pipelines
-                "DB_HOST": os.environ["PG_HOST"], # Database hostname
-                "DB_PORT": os.environ["PG_PORT"], # Database port
-                "DB_USER": os.environ["PG_USER"], # User to connect to the database with
-                "DB_PASSWORD": os.environ["PG_PASSWORD"], # Password to connect to the database with
-                "DB_DATABASE": os.environ["PG_DB"], # Database to select on the DB instance
-                "DB_TABLES": ["albums"], # Table(s) to run queries against
-                "OLLAMA_HOST": "http://host.docker.internal:11434", # Make sure to update with the URL of your Ollama host, such as http://localhost:11434 or remote server address
-                "TEXT_TO_SQL_MODEL": "phi3:latest" # Model to use for text-to-SQL generation
+                "pipelines": ["*"],  # Connect to all pipelines
+                "DB_HOST": os.getenv("PG_HOST", "localhost"),  # Database hostname (bare host: no scheme, no port)
+                "DB_PORT": os.getenv("PG_PORT", "5432"),  # Database port (string default, same type os.getenv returns when the variable is set)
+                "DB_USER": os.getenv("PG_USER", "postgres"),  # User to connect to the database with
+                "DB_PASSWORD": os.getenv("PG_PASSWORD", "password"),  # Password to connect to the database with
+                "DB_DATABASE": os.getenv("PG_DB", "postgres"),  # Database to select on the DB instance
+                "DB_TABLES": ["albums"],  # Table(s) to run queries against
+                "OLLAMA_HOST": os.getenv("OLLAMA_HOST", "http://host.docker.internal:11434"),  # Make sure to update with the URL of your Ollama host, such as http://localhost:11434 or remote server address
+                "TEXT_TO_SQL_MODEL": "phi3:latest"  # Model to use for text-to-SQL generation
             }
         )
 