LLM Output Guardrails

Implementing input/output guardrails for production AI applications

LLM Output Guardrails

Overview

This guide covers practical strategies for implementing input and output guardrails in production AI applications.

Why It Matters

Core Implementation

python
from typing import Optional, Tuple
import logging
logger = logging.getLogger("ai_safety")
class SafetyCheck:
    """Base class for a single guardrail check on a piece of text.

    The base implementation reports no issues; concrete checks are expected
    to override ``_run_checks``. A check can be switched off through the
    ``"enabled"`` key of its config dict (defaults to enabled).
    """

    def __init__(self, config: dict = None):
        # A missing (or empty) config falls back to a fresh empty dict.
        settings = config if config else {}
        self.config = settings
        self.enabled = settings.get("enabled", True)

    def validate(self, text: str, context: dict = None) -> Tuple[bool, Optional[str]]:
        """Check ``text`` and report whether it is considered safe.

        Args:
            text: The text to inspect.
            context: Optional extra data handed to ``_run_checks``; an empty
                dict is substituted when it is omitted.

        Returns:
            ``(True, None)`` when the check is disabled or nothing was
            found; otherwise ``(False, reason)`` where ``reason`` is the
            found issues joined with ``"; "``.
        """
        if self.enabled:
            found = self._run_checks(text, context if context else {})
            if found:
                logger.warning(f"Safety issue detected: {found}")
                return False, "; ".join(found)

        # Disabled check, or an enabled check that found nothing.
        return True, None

    def _run_checks(self, text: str, context: dict) -> list[str]:
        """Return the list of issue descriptions for ``text``.

        Hook for subclasses; the base class finds nothing.
        """
        return []
class SafetyPipeline:
    """Chain multiple safety checks together.

    Checks run in the order given; the pipeline stops at the first check
    that reports the text as unsafe.
    """

    def __init__(self, checks: "list[SafetyCheck]"):
        """Store the ordered list of checks to run.

        Args:
            checks: Objects exposing ``validate(text, context=...)`` that
                return ``(is_safe, reason)``.
        """
        self.checks = checks

    def run(self, text: str, stage: str = "input") -> Tuple[bool, list[str]]:
        """Run every check against ``text`` until one fails.

        Args:
            text: The text to validate.
            stage: Label for where in the request flow the text comes from
                (e.g. ``"input"`` or ``"output"``). It is forwarded to each
                check as ``context={"stage": stage}`` so a check can behave
                differently per stage.

        Returns:
            ``(True, [])`` when every check passes, otherwise
            ``(False, [reason])`` with the reason from the first failing
            check. Later checks are not executed.
        """
        all_issues = []
        for check in self.checks:
            # A fresh dict per check so one check cannot leak mutations
            # of the context into the next.
            is_safe, reason = check.validate(text, context={"stage": stage})
            if not is_safe:
                all_issues.append(reason)
                # Block on first failure.
                return False, all_issues
        return True, all_issues
# Usage in LLM application
def safe_completion(user_input: str, llm_fn, pipeline: "SafetyPipeline") -> str:
    """Call an LLM with safety checks applied before and after the call.

    Args:
        user_input: The raw user prompt.
        llm_fn: Callable taking the prompt and returning the model response.
        pipeline: Object whose ``run(text, stage=...)`` returns
            ``(is_safe, issues)``.

    Returns:
        The model response when both checks pass; otherwise a fixed refusal
        message. ``llm_fn`` is not called when the input is rejected.
    """
    # Check input. The issue list is deliberately not echoed to the caller.
    input_ok, _ = pipeline.run(user_input, stage="input")
    if not input_ok:
        return "Request cannot be processed: safety policy violation."

    # Get LLM response
    response = llm_fn(user_input)

    # Check output
    output_ok, _ = pipeline.run(response, stage="output")
    if not output_ok:
        return "Response filtered: content policy."

    return response

Monitoring

python
from datetime import datetime, timezone


class SafetyMonitor:
    """In-memory log of safety incidents with a simple summary."""

    def __init__(self):
        # Each entry is a dict with "type", "timestamp" and "details" keys.
        self.incidents = []

    def record(self, incident_type: str, details: dict):
        """Append an incident and log a warning if it was a block.

        Args:
            incident_type: Category label; ``"BLOCKED"`` additionally
                triggers a warning log line.
            details: Arbitrary data describing the incident.
        """
        self.incidents.append({
            "type": incident_type,
            # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated
            # and returns a naive value. The ISO string carries "+00:00".
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "details": details,
        })
        if incident_type == "BLOCKED":
            logger.warning("Safety block: %s", details)

    def summary(self) -> dict:
        """Return total incidents, blocked incidents and the blocked rate.

        The rate is ``blocked / total``; with no incidents recorded the
        divisor is clamped to 1 so the rate is 0 rather than an error.
        """
        total = len(self.incidents)
        blocked = sum(1 for i in self.incidents if i["type"] == "BLOCKED")
        return {"total": total, "blocked": blocked, "rate": blocked / max(total, 1)}

Testing Safety Measures

python
# Always test with adversarial examples
test_cases = [
    # (name, prompt, expected is_safe result)
    ("Normal question", "What is machine learning?", True),
    ("Boundary test", "How do I pick a lock?", True),
    # Add domain-specific tests
]
pipeline = SafetyPipeline([SafetyCheck()])
# Compare each pipeline verdict with the expectation and report PASS/FAIL.
for name, text, expected in test_cases:
    is_safe, _ = pipeline.run(text)
    result = "PASS" if is_safe == expected else "FAIL"
    print(f"{result}: {name}")

Best Practices

Resources

Also available in 中文.