AI in A/B Testing: Statistical Experimentation for ML Systems
Run rigorous experiments to improve AI model performance
AI in A/B Testing and Experimentation
Why Experimentation Matters for AI
AI systems need continuous evaluation; the techniques below show how to compare models safely in production.
Shadow Testing
Run the new model alongside the old one without affecting users (Python):
class ShadowTesting:
    """Serve the current model while recording what a shadow model would say.

    Each call to :meth:`predict` returns the current model's prediction.
    The shadow model is invoked on the same input and, when it succeeds,
    both predictions are appended to ``self.log`` for later comparison.
    """

    def __init__(self, current_model, shadow_model):
        """Store both models; each must expose ``predict(input_data)``."""
        self.current, self.shadow = current_model, shadow_model
        # One dict per request for which the shadow model produced a result.
        self.log = []

    def predict(self, input_data):
        """Return the current model's prediction for ``input_data``.

        A failure of the shadow model is reported through ``log_error`` and
        never changes the value returned to the caller.
        """
        # The user-facing answer always comes from the current model.
        served = self.current.predict(input_data)

        # NOTE: the shadow call runs inline, before this method returns, so
        # its latency is added to the request.
        try:
            candidate = self.shadow.predict(input_data)
            record = {
                "input": input_data,
                "current": served,
                "shadow": candidate,
            }
            self.log.append(record)
        except Exception as e:
            # log_error is not defined in this block; it has to be provided
            # by the surrounding module.
            log_error("shadow_model_error", e)

        return served
Canary Deployments
python
import hashlib
import random


class CanaryRouter:
    """Route a fixed fraction of users to the canary model.

    Assignment is a pure function of the user ID: the ID is hashed with
    SHA-256 and mapped to one of ``_BUCKETS`` buckets. The built-in
    ``hash()`` is deliberately not used, because string hashes are salted
    per interpreter process and would move users between models on every
    restart or across worker processes.
    """

    # 10,000 buckets give 0.01% granularity, so fractions below 1% still
    # route a non-empty share of users.
    _BUCKETS = 10_000

    def __init__(self, canary_percentage: float = 0.05):
        """Create a router.

        Args:
            canary_percentage: Fraction of users, in the range 0.0-1.0,
                that should receive the canary model (0.05 means 5%).
        """
        self.canary_percentage = canary_percentage

    def get_model(self, user_id: str):
        """Return the model name assigned to ``user_id``.

        The same ID always maps to the same bucket, so a user who is in the
        canary group stays in it when ``canary_percentage`` is increased.

        Returns:
            ``"new_model_v2"`` for canary users, ``"current_model_v1"``
            otherwise.
        """
        digest = hashlib.sha256(str(user_id).encode("utf-8")).digest()
        # The first 8 bytes are ample for a uniform spread over the buckets.
        bucket = int.from_bytes(digest[:8], "big") % self._BUCKETS
        if bucket < self.canary_percentage * self._BUCKETS:
            return "new_model_v2"
        return "current_model_v1"
Bayesian A/B Testing
python
import scipy.stats as stats
import numpy as np


def bayesian_ab_test(control_conversions, control_visitors,
                     treatment_conversions, treatment_visitors,
                     n_samples=100000, random_state=None):
    """Compare two conversion rates with a Beta-Binomial model.

    Each variant's conversion rate gets a uniform Beta(1, 1) prior, giving
    the posterior Beta(1 + conversions, 1 + visitors - conversions). The
    posteriors are compared by Monte Carlo sampling.

    Args:
        control_conversions: Number of conversions in the control group.
        control_visitors: Number of visitors in the control group.
        treatment_conversions: Number of conversions in the treatment group.
        treatment_visitors: Number of visitors in the treatment group.
        n_samples: Number of posterior draws per variant.
        random_state: Optional seed (or ``numpy.random.Generator``) for
            reproducible results. With ``None`` the draws come from
            NumPy's global random state.

    Returns:
        A dict with
        ``"prob_treatment_better"`` - share of draws where the treatment
        rate exceeds the control rate,
        ``"expected_lift"`` - mean of ``treatment / control - 1`` over the
        draws,
        ``"decision"`` - ``"ship"`` when the probability exceeds 0.95,
        otherwise ``"continue"``.

    Raises:
        ValueError: If a conversion count is negative or exceeds the
            matching visitor count.
    """
    for label, conversions, visitors in (
        ("control", control_conversions, control_visitors),
        ("treatment", treatment_conversions, treatment_visitors),
    ):
        if not 0 <= conversions <= visitors:
            raise ValueError(
                f"{label}: conversions must satisfy "
                f"0 <= conversions <= visitors "
                f"(got {conversions} conversions, {visitors} visitors)"
            )

    # Both draws must share one generator: seeding each rvs() call with the
    # same integer would replay the same random stream and correlate the
    # two sample sets.
    rng = None if random_state is None else np.random.default_rng(random_state)

    # Posterior distributions for each variant's conversion rate.
    control_posterior = stats.beta(
        1 + control_conversions,
        1 + control_visitors - control_conversions,
    )
    treatment_posterior = stats.beta(
        1 + treatment_conversions,
        1 + treatment_visitors - treatment_conversions,
    )

    # Sample from the posteriors.
    control_samples = control_posterior.rvs(n_samples, random_state=rng)
    treatment_samples = treatment_posterior.rvs(n_samples, random_state=rng)

    # Probability that the treatment is better, and its relative lift.
    prob_better = np.mean(treatment_samples > control_samples)
    expected_lift = np.mean(treatment_samples / control_samples - 1)

    return {
        "prob_treatment_better": prob_better,
        "expected_lift": expected_lift,
        "decision": "ship" if prob_better > 0.95 else "continue",
    }
Multi-Armed Bandit for Continuous Optimization
python
class EpsilonGreedy:
    """Epsilon-greedy multi-armed bandit.

    With probability ``epsilon`` a uniformly random arm is chosen;
    otherwise the arm with the highest running mean reward is chosen.
    """

    def __init__(self, n_arms, epsilon=0.1):
        """Start with zero pulls and a zero reward estimate for every arm."""
        self.n_arms = n_arms
        self.epsilon = epsilon
        self.counts = np.zeros(n_arms)  # pulls per arm
        self.values = np.zeros(n_arms)  # running mean reward per arm

    def select_arm(self):
        """Return the index of the arm to pull next."""
        exploring = np.random.random() < self.epsilon
        if not exploring:
            # Exploit: np.argmax returns the lowest index on ties.
            return np.argmax(self.values)
        # Explore: any arm, uniformly at random.
        return np.random.randint(self.n_arms)

    def update(self, arm, reward):
        """Fold ``reward`` into the running mean of ``arm``."""
        self.counts[arm] += 1
        pulls = self.counts[arm]
        # Incremental mean: new = old + (reward - old) / pulls.
        delta = reward - self.values[arm]
        self.values[arm] += delta / pulls
Sample Size Calculator
Determine the minimum sample size per group, which in turn sets the minimum experiment duration (Python):
from statsmodels.stats.power import TTestIndPower


def calculate_sample_size(effect_size=0.1, alpha=0.05, power=0.8):
    """Solve a two-sample t-test power analysis for the sample size.

    Args:
        effect_size: Effect size handed to the power analysis
            (presumably the standardized mean difference - confirm
            against the statsmodels documentation).
        alpha: Significance level of the test.
        power: Desired statistical power.

    Returns:
        The value ``TTestIndPower.solve_power`` computes for the one
        argument left unspecified, i.e. the number of observations
        (not rounded to an integer here).
    """
    solver = TTestIndPower()
    required = solver.solve_power(
        effect_size=effect_size,
        alpha=alpha,
        power=power,
    )
    return required
Also available in 中文.