Embedding Quality Metrics: Complete Guide
Evaluating embedding models with MTEB and custom benchmarks — practical implementation
Embedding Quality Metrics
Overview
This guide covers evaluating embedding models with MTEB and custom benchmarks. Rigorous evaluation is essential for building trustworthy AI applications.
Why Evaluation Matters
Without proper evaluation, you cannot:
Evaluation Framework
python
from dataclasses import dataclass, field
from typing import Callable, Optional
import statistics


@dataclass
class EvalExample:
    """Single evaluation example: one model input plus its expected output."""

    # Identifier copied onto the corresponding EvalResult as ``example_id``.
    id: str
    # Text handed to the model function under evaluation.
    input: str
    # Reference answer that each metric compares the model output against.
    expected_output: str
    # Free-form annotations (e.g. category, difficulty); default_factory
    # gives every example its own dict instead of one shared instance.
    metadata: dict = field(default_factory=dict)
@dataclass
class EvalResult:
    """Result of scoring the model output for a single evaluation example."""

    # Identifier of the example this result belongs to.
    example_id: str
    # Raw output returned by the model function.
    model_output: str
    # Overall score for the example.
    score: float
    # Individual metric scores; each result gets its own dict.
    metrics: dict = field(default_factory=dict)
    # Whether the example counts as passed; defaults to True when not given.
    passed: bool = True
    # Optional free-text remarks.
    notes: str = ""
class Evaluator:
    """Run a model function over evaluation examples and score its outputs.

    Each metric is called as ``metric(model_output, expected_output)`` and
    must return a number. An example's overall score is the mean of its
    metric scores (0.0 when no metrics are configured), and the example
    passes when that score is at least ``pass_threshold``.
    """

    def __init__(
        self,
        model_fn: Callable,
        metrics: list[Callable],
        pass_threshold: float = 0.7,
    ):
        """Store the model function, the metrics, and the pass threshold.

        Args:
            model_fn: Callable that maps an example's input to a model output.
            metrics: Callables of the form ``metric(output, expected)``.
            pass_threshold: Minimum overall score for an example to pass.
                Defaults to 0.7, the value that used to be hard-coded.
        """
        self.model_fn = model_fn
        self.metrics = metrics
        self.pass_threshold = pass_threshold

    def evaluate_single(self, example: "EvalExample") -> "EvalResult":
        """Run the model on one example and score it with every metric.

        Args:
            example: The example to evaluate.

        Returns:
            An EvalResult holding the model output, the per-metric scores
            keyed by metric name, their mean, and the pass/fail verdict.
        """
        output = self.model_fn(example.input)
        scores = {}
        for metric in self.metrics:
            # Callables such as functools.partial objects have no __name__;
            # fall back to repr() instead of raising AttributeError.
            name = getattr(metric, "__name__", None) or repr(metric)
            scores[name] = metric(output, example.expected_output)
        overall_score = statistics.mean(scores.values()) if scores else 0.0
        return EvalResult(
            example_id=example.id,
            model_output=output,
            score=overall_score,
            metrics=scores,
            passed=overall_score >= self.pass_threshold,
        )

    def evaluate_dataset(self, examples: list["EvalExample"]) -> dict:
        """Evaluate every example and summarise the scores.

        Args:
            examples: Examples to evaluate; may be empty.

        Returns:
            A dict with the keys ``total``, ``passed``, ``failed``,
            ``pass_rate``, ``avg_score``, ``min_score``, ``max_score``,
            ``p50`` (median score) and ``results`` (the per-example
            results). For an empty dataset all counts and statistics are
            zero and ``results`` is an empty list.
        """
        results = [self.evaluate_single(ex) for ex in examples]
        if not results:
            # mean(), median(), min() and the pass-rate division all fail
            # on empty input, so report an explicit all-zero summary.
            return {
                "total": 0,
                "passed": 0,
                "failed": 0,
                "pass_rate": 0.0,
                "avg_score": 0.0,
                "min_score": 0.0,
                "max_score": 0.0,
                "p50": 0.0,
                "results": [],
            }

        scores = [r.score for r in results]
        passed_count = sum(1 for r in results if r.passed)
        return {
            "total": len(results),
            "passed": passed_count,
            "failed": len(results) - passed_count,
            "pass_rate": passed_count / len(results),
            "avg_score": statistics.mean(scores),
            "min_score": min(scores),
            "max_score": max(scores),
            "p50": statistics.median(scores),
            "results": results,
        }
Key Metrics Implementation
python
from openai import OpenAI
import re

# Module-level OpenAI client used by the LLM-judge metric and the example
# model function. OpenAI() is called without arguments, so credentials come
# from the library's defaults (presumably the OPENAI_API_KEY environment
# variable, which the CI workflow in this guide sets).
client = OpenAI()
def rouge_l_score(prediction: str, reference: str) -> float:
    """Compute the ROUGE-L F1 score between a prediction and a reference.

    Both strings are lower-cased and split on whitespace. The score is the
    harmonic mean of LCS-based precision and recall, where LCS is the length
    of the longest common subsequence of the two token sequences. Unlike a
    set-overlap measure, this respects word order and repeated words.

    Args:
        prediction: Model output text.
        reference: Expected text.

    Returns:
        A value in [0.0, 1.0]; 0.0 when either text has no tokens or the
        texts share no token.
    """
    pred_tokens = prediction.lower().split()
    ref_tokens = reference.lower().split()
    if not pred_tokens or not ref_tokens:
        return 0.0

    # Classic LCS dynamic programme, keeping only the previous row so memory
    # stays proportional to the reference length.
    prev_row = [0] * (len(ref_tokens) + 1)
    for pred_tok in pred_tokens:
        curr_row = [0]
        for j, ref_tok in enumerate(ref_tokens, start=1):
            if pred_tok == ref_tok:
                curr_row.append(prev_row[j - 1] + 1)
            else:
                curr_row.append(max(prev_row[j], curr_row[j - 1]))
        prev_row = curr_row

    lcs_length = prev_row[-1]
    if lcs_length == 0:
        return 0.0

    precision = lcs_length / len(pred_tokens)
    recall = lcs_length / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)
def llm_judge_score(prediction: str, reference: str, criteria: str = "quality") -> float:
    """Use an LLM (gpt-4o-mini) as an automated judge of a response.

    The model is asked to rate the prediction against the reference on a
    1-10 scale; the first integer in its reply is divided by 10 and clamped
    to [0.0, 1.0].

    Args:
        prediction: The AI response to rate.
        reference: The reference answer shown to the judge.
        criteria: The rating criterion named in the prompt.

    Returns:
        The normalised rating, or 0.5 (a neutral fallback) when the reply
        contains no usable number. Errors raised by the API call itself are
        not caught here.
    """
    prompt = f"""Rate the following AI response on a scale of 1-10.
Criteria: {criteria}
Reference answer: {reference}
AI response: {prediction}
Rating (just the number 1-10):"""
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        max_tokens=5,
    )
    # Only the failure modes of reading the reply are handled; a bare
    # ``except`` would also swallow KeyboardInterrupt and real bugs.
    try:
        content = resp.choices[0].message.content
    except (IndexError, AttributeError):
        return 0.5

    # content may be None when the model returns no text.
    match = re.search(r"\d+", content or "")
    if match is None:
        return 0.5
    return min(max(int(match.group()) / 10, 0.0), 1.0)
def exact_match(prediction: str, reference: str) -> float:
    """Return 1.0 when the two strings match exactly, otherwise 0.0.

    The comparison ignores leading/trailing whitespace and letter case.
    """
    if prediction.strip().lower() == reference.strip().lower():
        return 1.0
    return 0.0
Running Evaluations
python
# Create test dataset
test_examples = [
    EvalExample(
        id="test_001",
        input="What is the capital of France?",
        expected_output="Paris",
        metadata={"category": "geography", "difficulty": "easy"},
    ),
    EvalExample(
        id="test_002",
        input="Explain how neural networks learn",
        expected_output="Neural networks learn through backpropagation...",
        metadata={"category": "ml_concepts", "difficulty": "medium"},
    ),
]


# Define model function
def my_model(input_text: str) -> str:
    """Send ``input_text`` to gpt-4o-mini as a user message and return the reply text."""
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": input_text}],
        temperature=0,
        max_tokens=500,
    )
    # content can be None when the model returns no text; the string-based
    # metrics would then fail, so normalise to an empty string.
    return resp.choices[0].message.content or ""


# Create evaluator with metrics
evaluator = Evaluator(
    model_fn=my_model,
    metrics=[rouge_l_score, exact_match],
)

# Run evaluation
results = evaluator.evaluate_dataset(test_examples)
print(f"Pass rate: {results['pass_rate']:.1%}")
print(f"Average score: {results['avg_score']:.3f}")
print(f"Failed: {results['failed']} / {results['total']}")
Continuous Evaluation in CI/CD
yaml
# .github/workflows/eval.yml
name: LLM Evaluation

# Run the evaluation suite on every push and pull request.
on: [push, pull_request]

jobs:
  evaluate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Run evaluations
        run: python -m eval.run_suite
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          PASS_THRESHOLD: "0.8"
      # The createComment({...}) argument below is a placeholder from the
      # guide; fill in the real request fields before using this step.
      - name: Post results
        uses: actions/github-script@v7
        with:
          script: |
            const results = require('./eval_results.json')
            github.rest.issues.createComment({...})
Best Practices
Resources
Also available in 中文.