A/B Testing LLM Outputs: Complete Guide
Statistical comparison of LLM variants in production — practical implementation
A/B Testing LLM Outputs: Complete Guide
Statistical comparison of LLM variants in production — practical implementation
A/B Testing LLM Outputs — Overview: statistical comparison of LLM variants in production. Rigorous evaluation is essential for building trustworthy AI applications. Why evaluation matters: without proper evaluation, you cannot know if your model i… (summary truncated)
A/B Testing LLM Outputs
Overview
Statistical comparison of LLM variants in production. Rigorous evaluation is essential for building trustworthy AI applications.
Why Evaluation Matters
Without proper evaluation, you cannot:
Evaluation Framework
python
from dataclasses import dataclass, field
from typing import Callable, Optional
import statistics


@dataclass
class EvalExample:
    """Single evaluation example: a model input paired with its reference output.

    Note: the field names ``id`` and ``input`` shadow Python builtins inside
    the class namespace; they are kept because callers construct examples
    with these keyword names.
    """

    id: str  # identifier for this example
    input: str  # text handed to the model under evaluation
    expected_output: str  # reference text the model output is compared against
    # Free-form annotations; default_factory gives every instance its own dict.
    metadata: dict = field(default_factory=dict)
@dataclass
class EvalResult:
    """Result of scoring one model output for a single evaluation example."""

    example_id: str  # identifier of the example this result belongs to
    model_output: str  # raw text produced by the model
    score: float  # overall score for the example
    # Per-metric scores; default_factory gives every instance its own dict.
    metrics: dict = field(default_factory=dict)
    passed: bool = True  # whether the example counts as passed
    notes: str = ""  # optional free-text remarks
class Evaluator:
    """A/B Testing LLM Outputs evaluator.

    Runs a model function over evaluation examples, scores each output with
    every configured metric, and aggregates the scores into summary
    statistics.

    Args:
        model_fn: Callable taking the example input and returning the model
            output.
        metrics: Callables invoked as ``metric(output, expected_output)``,
            each returning a numeric score.
        pass_threshold: Minimum mean metric score for an example to count as
            passed. Defaults to 0.7.
    """

    def __init__(
        self,
        model_fn: Callable,
        metrics: list[Callable],
        pass_threshold: float = 0.7,
    ):
        self.model_fn = model_fn
        self.metrics = metrics
        self.pass_threshold = pass_threshold

    @staticmethod
    def _metric_name(metric: Callable) -> str:
        """Return the label under which *metric*'s score is reported."""
        name = getattr(metric, "__name__", None)
        if name is None:
            # functools.partial objects have no __name__; use the wrapped
            # callable's name so pre-configured metrics still work.
            name = getattr(getattr(metric, "func", None), "__name__", None)
        return name if name is not None else type(metric).__name__

    def evaluate_single(self, example: "EvalExample") -> "EvalResult":
        """Evaluate one example.

        Calls the model on ``example.input``, scores the output with every
        metric, and averages the metric scores into the overall score
        (0.0 when no metrics are configured).
        """
        output = self.model_fn(example.input)
        scores = {
            self._metric_name(metric): metric(output, example.expected_output)
            for metric in self.metrics
        }
        overall_score = statistics.mean(scores.values()) if scores else 0.0
        return EvalResult(
            example_id=example.id,
            model_output=output,
            score=overall_score,
            metrics=scores,
            passed=overall_score >= self.pass_threshold,
        )

    def evaluate_dataset(self, examples: "list[EvalExample]") -> dict:
        """Evaluate a full dataset.

        Returns:
            A dict with the keys ``total``, ``passed``, ``failed``,
            ``pass_rate``, ``avg_score``, ``min_score``, ``max_score``,
            ``p50`` (median score) and ``results`` (the per-example results).
            For an empty dataset every count and statistic is zero.
        """
        results = [self.evaluate_single(ex) for ex in examples]
        if not results:
            # mean/min/max/median and the pass-rate division all fail on
            # empty input; report zeros so numeric formatting by callers
            # keeps working.
            return {
                "total": 0,
                "passed": 0,
                "failed": 0,
                "pass_rate": 0.0,
                "avg_score": 0.0,
                "min_score": 0.0,
                "max_score": 0.0,
                "p50": 0.0,
                "results": results,
            }

        scores = [r.score for r in results]
        passed_count = sum(1 for r in results if r.passed)
        return {
            "total": len(results),
            "passed": passed_count,
            "failed": len(results) - passed_count,
            "pass_rate": passed_count / len(results),
            "avg_score": statistics.mean(scores),
            "min_score": min(scores),
            "max_score": max(scores),
            "p50": statistics.median(scores),
            "results": results,
        }
Key Metrics Implementation
python
import re

from openai import OpenAI

# Module-level API client shared by the functions below. It is constructed
# with no arguments, so credentials must be supplied by the environment.
client = OpenAI()
def rouge_l_score(prediction: str, reference: str) -> float:
    """Compute the ROUGE-L F1 score of *prediction* against *reference*.

    Both strings are lower-cased and split on whitespace. The score is built
    from the length of the longest common subsequence (LCS) of the two token
    sequences, so word order and repeated words affect the result.

    Args:
        prediction: Model output text.
        reference: Expected text.

    Returns:
        A value in [0.0, 1.0]: the harmonic mean of LCS-based precision and
        recall. 0.0 when either text has no tokens or no token is shared.
    """
    pred_tokens = prediction.lower().split()
    ref_tokens = reference.lower().split()
    if not pred_tokens or not ref_tokens:
        return 0.0

    # LCS length by dynamic programming, keeping only the previous row.
    prev_row = [0] * (len(ref_tokens) + 1)
    for pred_tok in pred_tokens:
        row = [0]
        for j, ref_tok in enumerate(ref_tokens, start=1):
            if pred_tok == ref_tok:
                row.append(prev_row[j - 1] + 1)
            else:
                row.append(max(prev_row[j], row[j - 1]))
        prev_row = row

    lcs_len = prev_row[-1]
    if lcs_len == 0:
        return 0.0
    precision = lcs_len / len(pred_tokens)
    recall = lcs_len / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)
def llm_judge_score(prediction: str, reference: str, criteria: str = "quality") -> float:
    """Use an LLM (gpt-4o-mini) as an automated judge of *prediction*.

    Sends a rating prompt through the module-level ``client`` and maps the
    first integer found in the reply from the 1-10 scale onto [0.0, 1.0].

    Args:
        prediction: The AI response to rate.
        reference: The reference answer shown to the judge.
        criteria: What the judge is asked to rate. Defaults to "quality".

    Returns:
        The rating divided by 10 and clamped to [0.0, 1.0], or 0.5 when no
        rating can be extracted from the reply. Errors raised by the API
        call itself are not caught.
    """
    prompt = f"""Rate the following AI response on a scale of 1-10.
Criteria: {criteria}
Reference answer: {reference}
AI response: {prediction}
Rating (just the number 1-10):"""
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        max_tokens=5
    )
    try:
        rating = float(re.findall(r'\d+', resp.choices[0].message.content)[0])
    except (AttributeError, IndexError, TypeError, ValueError):
        # No choices, missing/None content, or a reply without digits: fall
        # back to a neutral score. A bare ``except`` would also swallow
        # KeyboardInterrupt and SystemExit.
        return 0.5
    return min(max(rating / 10, 0.0), 1.0)
def exact_match(prediction: str, reference: str) -> float:
    """Exact string match score, ignoring case and surrounding whitespace.

    Returns:
        1.0 when the stripped, case-folded strings are equal, otherwise 0.0.
    """
    # casefold() rather than lower(): it is the Unicode-aware caseless form,
    # e.g. "Straße" and "STRASSE" compare equal.
    return 1.0 if prediction.strip().casefold() == reference.strip().casefold() else 0.0
Running Evaluations
python
# Create test dataset
test_examples = [
    EvalExample(
        id="test_001",
        input="What is the capital of France?",
        expected_output="Paris",
        metadata={"category": "geography", "difficulty": "easy"},
    ),
    EvalExample(
        id="test_002",
        input="Explain how neural networks learn",
        expected_output="Neural networks learn through backpropagation...",
        metadata={"category": "ml_concepts", "difficulty": "medium"},
    ),
]

# Define model function
def my_model(input_text: str) -> str:
    """Send *input_text* as a single user message to gpt-4o-mini and return the reply text.

    Uses the module-level ``client`` with temperature 0 and a 500-token cap.
    """
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": input_text}],
        temperature=0,
        max_tokens=500
    )
    # The API types message.content as optional; normalise None to "" so the
    # declared str return type holds and string-based metrics do not crash.
    return resp.choices[0].message.content or ""


# Create evaluator with metrics
evaluator = Evaluator(
    model_fn=my_model,
    metrics=[rouge_l_score, exact_match],
)

# Run evaluation
results = evaluator.evaluate_dataset(test_examples)
print(f"Pass rate: {results['pass_rate']:.1%}")
print(f"Average score: {results['avg_score']:.3f}")
print(f"Failed: {results['failed']} / {results['total']}")
Continuous Evaluation in CI/CD
yaml
# .github/workflows/eval.yml
name: LLM Evaluation
on: [push, pull_request]
jobs:
  evaluate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Run evaluations
        run: python -m eval.run_suite
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          PASS_THRESHOLD: "0.8"
      - name: Post results
        uses: actions/github-script@v7
        with:
          # NOTE: `{...}` below is a placeholder from the tutorial; supply the
          # real createComment arguments before using this step.
          script: |
            const results = require('./eval_results.json')
            github.rest.issues.createComment({...})
Best Practices
Resources
相关工具
相关教程
Automating model quality checks in CI/CD pipelines — practical implementation
Evaluating Retrieval-Augmented Generation quality with RAGAS — practical implementation
Metrics and frameworks for measuring AI agent performance — practical implementation