← Back to tutorials

AI Output Verification

Verifying correctness and safety of AI-generated content

AI Output Verification

Overview

This guide explains how to verify the correctness and safety of AI-generated content, and covers practical implementation strategies for production AI systems.

Why It Matters

Core Implementation

python
from typing import Optional, Tuple
import logging

logger = logging.getLogger("ai_safety")


class SafetyCheck:
    """Base class for a single safety check on a piece of text.

    Subclasses override ``_run_checks`` to report issues; the base
    implementation reports none, so every text is considered safe.

    Args:
        config: Optional settings. The only key read here is ``"enabled"``
            (default ``True``); when false, ``validate`` accepts everything.
    """

    def __init__(self, config: Optional[dict] = None):
        self.config = config or {}
        self.enabled = self.config.get("enabled", True)

    def validate(
        self, text: str, context: Optional[dict] = None
    ) -> Tuple[bool, Optional[str]]:
        """Validate ``text`` for safety issues.

        Args:
            text: The text to check.
            context: Optional extra information forwarded to ``_run_checks``;
                an empty dict is passed when omitted.

        Returns:
            ``(is_safe, reason_if_unsafe)``. ``reason_if_unsafe`` is ``None``
            when the text is safe, otherwise the issues joined with ``"; "``.
        """
        if not self.enabled:
            return True, None

        issues = self._run_checks(text, context or {})
        if not issues:
            return True, None

        # Lazy %-formatting: the message is only built if the record is emitted.
        logger.warning("Safety issue detected: %s", issues)
        return False, "; ".join(issues)

    def _run_checks(self, text: str, context: dict) -> list[str]:
        """Run all safety checks and return the issues found.

        Override in subclasses; the default finds no issues.
        """
        return []

class SafetyPipeline:
    """Chain multiple safety checks and stop at the first one that fails.

    Args:
        checks: Objects exposing ``validate(text, context=None)`` that
            return ``(is_safe, reason)``; they are run in the given order.
    """

    def __init__(self, checks: "list[SafetyCheck]"):
        self.checks = checks

    def run(self, text: str, stage: str = "input") -> Tuple[bool, list[str]]:
        """Run the checks in order against ``text``.

        Args:
            text: The text to validate.
            stage: Label for where in the flow the text comes from (for
                example ``"input"`` or ``"output"``). It is forwarded to each
                check as ``context={"stage": stage}`` so that checks can
                behave differently per stage.

        Returns:
            ``(True, [])`` when every check passes, otherwise
            ``(False, [reason])`` holding the reason from the first failing
            check. Later checks are not run after a failure.
        """
        context = {"stage": stage}
        for check in self.checks:
            is_safe, reason = check.validate(text, context=context)
            if not is_safe:
                # Block on the first failure; remaining checks are skipped.
                return False, [reason]
        return True, []

# Usage in an LLM application

def safe_completion(user_input: str, llm_fn, pipeline: "SafetyPipeline") -> str:
    """Call an LLM with safety checks on both the prompt and the response.

    Args:
        user_input: The prompt to send to the model.
        llm_fn: Callable taking the prompt and returning the response text.
        pipeline: Object whose ``run(text, stage=...)`` returns
            ``(is_safe, issues)``.

    Returns:
        The model response when both checks pass; otherwise a fixed refusal
        message. ``llm_fn`` is not called when the input check fails.
    """
    # The issue lists are not surfaced to the caller, so they are discarded.
    input_ok, _ = pipeline.run(user_input, stage="input")
    if not input_ok:
        return "Request cannot be processed: safety policy violation."

    response = llm_fn(user_input)

    output_ok, _ = pipeline.run(response, stage="output")
    if not output_ok:
        return "Response filtered: content policy."

    return response

Monitoring

python
from datetime import datetime, timezone

class SafetyMonitor:
    """Collect safety incidents in memory and summarize them."""

    def __init__(self):
        self.incidents = []

    def record(self, incident_type: str, details: dict):
        """Append an incident and log a warning when it is a block.

        Args:
            incident_type: Incident label; ``"BLOCKED"`` incidents are also
                logged at warning level.
            details: Arbitrary data stored with the incident.
        """
        self.incidents.append({
            "type": incident_type,
            # Timezone-aware UTC; datetime.utcnow() is deprecated and naive.
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "details": details,
        })
        if incident_type == "BLOCKED":
            logger.warning("Safety block: %s", details)

    def summary(self) -> dict:
        """Return total incidents, blocked incidents, and the blocked rate.

        The rate is ``blocked / total``, or ``0.0`` when nothing has been
        recorded (the divisor is clamped to 1 to avoid division by zero).
        """
        total = len(self.incidents)
        blocked = sum(
            1 for incident in self.incidents if incident["type"] == "BLOCKED"
        )
        return {"total": total, "blocked": blocked, "rate": blocked / max(total, 1)}

Testing Safety Measures

python

# Always test with adversarial examples

# Each case is (name, text, should_pass).
test_cases = [
    ("Normal question", "What is machine learning?", True),
    ("Boundary test", "How do I pick a lock?", True),
    # Add domain-specific tests
]

# Run every test case through a pipeline holding one default check and
# print PASS when the verdict matches the expectation, FAIL otherwise.
pipeline = SafetyPipeline([SafetyCheck()])
for name, text, expected in test_cases:
    verdict = pipeline.run(text)
    is_safe = verdict[0]
    if is_safe == expected:
        result = "PASS"
    else:
        result = "FAIL"
    print(f"{result}: {name}")

Best Practices

Resources

Also available in 中文.