LLM Output Guardrails
Implementing input/output guardrails for production AI applications
LLM Output Guardrails
Overview
This guide covers practical strategies for implementing input/output guardrails in production AI applications.
Why It Matters
Core Implementation
python
from typing import Optional, Tuple
import logging

# Module-level logger shared by the safety checks and the monitor below.
logger = logging.getLogger("ai_safety")
class SafetyCheck:
    """Base class for a single guardrail check on LLM input or output text.

    Subclasses override ``_run_checks`` to report issues. The base
    implementation reports none, so every text is treated as safe.
    """

    def __init__(self, config: Optional[dict] = None):
        """Store the configuration and read its ``enabled`` flag.

        Args:
            config: Optional settings mapping. ``None`` (or any falsy value)
                is replaced by an empty dict. The ``"enabled"`` key turns the
                check on or off and defaults to ``True``.
        """
        self.config = config or {}
        self.enabled = self.config.get("enabled", True)

    def validate(
        self, text: str, context: Optional[dict] = None
    ) -> Tuple[bool, Optional[str]]:
        """Validate text for safety issues.

        Args:
            text: The text to inspect.
            context: Optional extra data handed to ``_run_checks``; ``None``
                is passed on as an empty dict.

        Returns:
            ``(True, None)`` when the check is disabled or no issues are
            found; otherwise ``(False, reason)`` where ``reason`` is the
            issues joined with ``"; "``.
        """
        if not self.enabled:
            return True, None

        issues = self._run_checks(text, context or {})
        if issues:
            # Lazy %-formatting: the message is only built if the record is emitted.
            logger.warning("Safety issue detected: %s", issues)
            return False, "; ".join(issues)
        return True, None

    def _run_checks(self, text: str, context: dict) -> list[str]:
        """Run all safety checks and return the issues found.

        Override in subclasses. The base implementation returns an empty list.
        """
        return []
class SafetyPipeline:
    """Chain multiple safety checks together and stop at the first failure."""

    def __init__(self, checks: list[SafetyCheck]):
        """Store the checks; ``run`` evaluates them in the given order."""
        self.checks = checks

    def run(self, text: str, stage: str = "input") -> Tuple[bool, list[str]]:
        """Run the checks in order against ``text``.

        Args:
            text: The text to validate.
            stage: Label for the point in the request where the pipeline is
                running (the default is ``"input"``). It is forwarded to each
                check as ``context={"stage": stage}`` so a check can behave
                differently per stage.

        Returns:
            ``(True, [])`` when every check passes, otherwise
            ``(False, [reason])`` with the reason from the first failing
            check. Later checks are not run after a failure.
        """
        for check in self.checks:
            # Give each check its own dict so one check cannot alter what the
            # next one sees.
            is_safe, reason = check.validate(text, {"stage": stage})
            if not is_safe:
                # Block on first failure.
                return False, [reason]
        return True, []
# Usage in LLM application
def safe_completion(user_input: str, llm_fn, pipeline: SafetyPipeline) -> str:
    """Call ``llm_fn`` with guardrails applied to both the prompt and the reply.

    Args:
        user_input: Text supplied by the user.
        llm_fn: Callable invoked as ``llm_fn(user_input)``; its return value
            is checked and, if safe, returned.
        pipeline: Object whose ``run(text, stage=...)`` returns an
            ``(is_safe, issues)`` pair.

    Returns:
        A fixed refusal message when the input fails the pipeline (``llm_fn``
        is then never called), a fixed filter message when the response
        fails, and otherwise the response itself.
    """
    # Check input. The issue details are deliberately not shown to the caller.
    input_ok, _ = pipeline.run(user_input, stage="input")
    if not input_ok:
        return "Request cannot be processed: safety policy violation."

    # Get LLM response
    response = llm_fn(user_input)

    # Check output
    output_ok, _ = pipeline.run(response, stage="output")
    if not output_ok:
        return "Response filtered: content policy."
    return response
Monitoring
python
from datetime import datetime, timezone


class SafetyMonitor:
    """In-memory record of safety incidents with a blocked-rate summary."""

    def __init__(self):
        """Start with an empty incident list."""
        self.incidents = []

    def record(self, incident_type: str, details: dict):
        """Append an incident and log a warning for ``"BLOCKED"`` ones.

        Args:
            incident_type: Incident label; ``"BLOCKED"`` is the only value
                treated specially.
            details: Arbitrary data stored with the incident as-is.

        The stored ``"timestamp"`` is a timezone-aware UTC time in ISO 8601
        form, so it carries an explicit ``+00:00`` offset.
        """
        self.incidents.append({
            "type": incident_type,
            # datetime.utcnow() is deprecated and returns a naive value;
            # use an aware UTC timestamp instead.
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "details": details,
        })
        if incident_type == "BLOCKED":
            logger.warning("Safety block: %s", details)

    def summary(self) -> dict:
        """Return the total count, the blocked count and the blocked rate.

        The rate is ``blocked / total``, or ``0.0`` when nothing has been
        recorded yet.
        """
        total = len(self.incidents)
        blocked = sum(1 for incident in self.incidents if incident["type"] == "BLOCKED")
        # max(total, 1) avoids division by zero on an empty monitor.
        return {"total": total, "blocked": blocked, "rate": blocked / max(total, 1)}
Testing Safety Measures
python
# Always test with adversarial examples
# Each case is (name, text, expected): `expected` is the is_safe value the
# pipeline should return for `text`.
test_cases = [
    ("Normal question", "What is machine learning?", True),
    ("Boundary test", "How do I pick a lock?", True),
    # Add domain-specific tests
]

pipeline = SafetyPipeline([SafetyCheck()])
for name, text, expected in test_cases:
    is_safe, _ = pipeline.run(text)
    # PASS means the pipeline's verdict matched the expectation, not that the
    # text was judged safe.
    result = "PASS" if is_safe == expected else "FAIL"
    print(f"{result}: {name}")
Best Practices
Resources
Also available in 中文.