“It looks good to me” is not evaluation. As LLM applications mature, rigorous evaluation becomes essential. Without it, you can’t improve systematically, detect regressions, or compare approaches. Moving beyond vibes requires a repeatable framework: test datasets, automated metrics, and regression checks wired into your development workflow.
Here’s how to evaluate LLM applications properly.
Why Evaluation Matters
The Problem with Vibes
vibes_based_evaluation:
  what_happens:
    - "Try a few examples, looks good"
    - Ship to production
    - Problems emerge at scale
    - No data to diagnose
  consequences:
    - Silent quality degradation
    - Can't compare approaches
    - No regression detection
    - Improvement is guesswork
What Good Evaluation Enables
evaluation_benefits:
  quality_assurance:
    - Catch issues before production
    - Detect regressions
    - Maintain quality bar
  improvement:
    - Compare approaches objectively
    - Identify weakness areas
    - Guide optimization
  confidence:
    - Ship with data-backed confidence
    - Communicate quality to stakeholders
    - Make informed trade-offs
Evaluation Framework
Types of Evaluation
evaluation_types:
  offline_evaluation:
    when: Before deployment
    how: Test datasets, automated metrics
    purpose: Catch issues early
  online_evaluation:
    when: In production
    how: A/B tests, user metrics
    purpose: Validate real-world performance
  human_evaluation:
    when: Periodically, for calibration
    how: Expert review, user feedback
    purpose: Ground truth for automated metrics
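Online evaluation in particular needs a capture point in the serving path. Below is a minimal sketch, assuming a JSONL log file and a simple thumbs-up/down feedback signal; the log_interaction helper and its field names are illustrative, not part of any library.

import json
import time
import uuid

def log_interaction(request: str, response: str, feedback: str | None,
                    path: str = "interactions.jsonl") -> None:
    """Append one production interaction (plus optional user feedback) to a JSONL log.

    The path and the feedback values ("up" / "down") are assumptions for this sketch;
    in practice this would feed your logging or analytics pipeline.
    """
    record = {
        "id": str(uuid.uuid4()),
        "timestamp": time.time(),
        "request": request,
        "response": response,
        "feedback": feedback,  # e.g. "up", "down", or None if the user gave no signal
    }
    with open(path, "a", encoding="utf-8") as f:
        f.write(json.dumps(record) + "\n")

Interactions with explicit feedback are the cheapest source of labeled cases for the offline test dataset described next.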
Building a Test Dataset
import uuid
from dataclasses import dataclass

@dataclass
class TestCase:
    id: str
    input: str
    expected_output: str  # or a set of acceptable outputs
    category: str
    difficulty: str
    metadata: dict

class TestDataset:
    def __init__(self):
        self.cases: list[TestCase] = []

    def add_case(self, case: TestCase):
        self.cases.append(case)

    def add_from_production(self, request, response, label):
        """Add cases from labeled production data."""
        case = TestCase(
            id=str(uuid.uuid4()),
            input=request,
            expected_output=response,
            category="production",
            difficulty="unknown",
            metadata={"source": "production", "label": label},
        )
        self.cases.append(case)

    def get_by_category(self, category: str) -> list[TestCase]:
        return [c for c in self.cases if c.category == category]

# Build dataset with diverse cases
dataset = TestDataset()
dataset.add_case(TestCase(
    id="simple-1",
    input="What is 2+2?",
    expected_output="4",
    category="math",
    difficulty="easy",
    metadata={}
))
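To version the dataset alongside your prompts and reuse it in CI, serialize it to disk. A minimal sketch using JSONL; the test_data.jsonl path and the save_dataset/load_dataset helpers are assumptions for illustration.

import json
from dataclasses import asdict

def save_dataset(dataset: TestDataset, path: str = "test_data.jsonl") -> None:
    """Write each test case as one JSON line so diffs stay readable in review."""
    with open(path, "w", encoding="utf-8") as f:
        for case in dataset.cases:
            f.write(json.dumps(asdict(case)) + "\n")

def load_dataset(path: str = "test_data.jsonl") -> TestDataset:
    """Rebuild a TestDataset from a JSONL file produced by save_dataset."""
    dataset = TestDataset()
    with open(path, encoding="utf-8") as f:
        for line in f:
            dataset.add_case(TestCase(**json.loads(line)))
    return dataset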
Automated Metrics
import json

import jsonschema
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

class EvaluationMetrics:
    """Automated metrics for LLM evaluation."""

    def __init__(self, embedding_model: str = "all-MiniLM-L6-v2"):
        # Any sentence-embedding model works; this default is one common choice.
        self.encoder = SentenceTransformer(embedding_model)

    def exact_match(self, predicted: str, expected: str) -> float:
        return 1.0 if predicted.strip() == expected.strip() else 0.0

    def contains_expected(self, predicted: str, expected: str) -> float:
        return 1.0 if expected.lower() in predicted.lower() else 0.0

    def semantic_similarity(self, predicted: str, expected: str) -> float:
        pred_emb = self.encoder.encode(predicted)
        exp_emb = self.encoder.encode(expected)
        return float(cosine_similarity([pred_emb], [exp_emb])[0][0])

    def format_compliance(self, predicted: str, schema: dict) -> float:
        """Check if output parses as JSON and matches the expected schema."""
        try:
            data = json.loads(predicted)
            jsonschema.validate(data, schema)
            return 1.0
        except (json.JSONDecodeError, jsonschema.ValidationError):
            return 0.0

    def length_ratio(self, predicted: str, expected: str) -> float:
        """Check if length is appropriate (1.0 means identical word counts)."""
        pred_len = len(predicted.split())
        exp_len = len(expected.split())
        if max(pred_len, exp_len) == 0:
            return 1.0
        return min(pred_len, exp_len) / max(pred_len, exp_len)
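A quick usage sketch with illustrative strings:

metrics = EvaluationMetrics()

predicted = "The answer is 4."
expected = "4"

print(metrics.exact_match(predicted, expected))        # 0.0 -- strings differ
print(metrics.contains_expected(predicted, expected))  # 1.0 -- "4" appears in the output
print(metrics.length_ratio(predicted, expected))       # 0.25 -- 1 word vs. 4 words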
LLM-as-Judge
import json

from openai import OpenAI

class LLMJudge:
    """Use an LLM to evaluate another LLM's output."""

    def __init__(self, judge_model: str = "gpt-4o"):
        # JSON mode requires a model that supports response_format={"type": "json_object"}.
        self.judge = OpenAI()
        self.model = judge_model

    def evaluate(self, question: str, response: str, criteria: list[str]) -> dict:
        prompt = f"""Evaluate this response to the question.
Question: {question}
Response: {response}
Rate on these criteria (1-5 scale):
{chr(10).join(f'- {c}' for c in criteria)}
Return JSON:
{{"criteria_name": score, ...}}
"""
        result = self.judge.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},
        )
        return json.loads(result.choices[0].message.content)

# Usage
judge = LLMJudge()
scores = judge.evaluate(
    question="Explain microservices",
    response="Microservices are...",
    criteria=["accuracy", "completeness", "clarity", "conciseness"],
)
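Judge scores are most useful in aggregate. Below is a sketch that averages per-criterion scores over a dataset; judge_dataset is a hypothetical helper and model.generate(prompt) is an assumed interface for the system under test.

from collections import defaultdict

def judge_dataset(model, dataset: TestDataset, judge: LLMJudge,
                  criteria: list[str]) -> dict[str, float]:
    """Average the judge's per-criterion scores across all test cases."""
    totals: dict[str, float] = defaultdict(float)
    for case in dataset.cases:
        response = model.generate(case.input)  # system under test (assumed interface)
        scores = judge.evaluate(case.input, response, criteria)
        for criterion in criteria:
            totals[criterion] += scores.get(criterion, 0)
    return {c: totals[c] / len(dataset.cases) for c in criteria}

Spot-check a sample of judge scores against human ratings periodically: the judge is itself a model and can drift or show systematic bias.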
Evaluation Pipeline
Automated Evaluation
from dataclasses import dataclass

@dataclass
class EvaluationReport:
    # Minimal report container: per-case results plus aggregate and per-category summaries.
    results: list
    aggregate: dict
    by_category: dict

class EvaluationPipeline:
    def __init__(self, model, dataset: TestDataset, metrics: EvaluationMetrics):
        self.model = model
        self.dataset = dataset
        self.metrics = metrics

    def run(self) -> EvaluationReport:
        results = []
        for case in self.dataset.cases:
            # Generate response
            response = self.model.generate(case.input)
            # Calculate metrics
            case_metrics = {
                "semantic_similarity": self.metrics.semantic_similarity(
                    response, case.expected_output
                ),
                "contains_expected": self.metrics.contains_expected(
                    response, case.expected_output
                ),
            }
            results.append({
                "case_id": case.id,
                "category": case.category,
                "input": case.input,
                "expected": case.expected_output,
                "actual": response,
                "metrics": case_metrics,
            })
        return EvaluationReport(
            results=results,
            aggregate=self._aggregate(results),
            by_category=self._by_category(results),
        )

    def _aggregate(self, results) -> dict:
        metrics = {}
        for metric in ["semantic_similarity", "contains_expected"]:
            values = [r["metrics"][metric] for r in results]
            metrics[metric] = {
                "mean": sum(values) / len(values),
                "min": min(values),
                "max": max(values),
            }
        return metrics

    def _by_category(self, results) -> dict:
        # Aggregate separately per category to surface weak areas.
        by_category = {}
        for category in {r["category"] for r in results}:
            subset = [r for r in results if r["category"] == category]
            by_category[category] = self._aggregate(subset)
        return by_category
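A usage sketch; StubModel is a hypothetical stand-in so the pipeline can be exercised without calling a real LLM.

class StubModel:
    """Trivial stand-in for the system under test (assumed generate() interface)."""
    def generate(self, prompt: str) -> str:
        return "4" if "2+2" in prompt else "I don't know."

report = EvaluationPipeline(StubModel(), dataset, EvaluationMetrics()).run()
print(report.aggregate["contains_expected"]["mean"])
print(report.by_category["math"])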
Regression Detection
class RegressionDetector:
    def __init__(self, threshold: float = 0.05):
        self.threshold = threshold
        self.baseline = None

    def set_baseline(self, report: EvaluationReport):
        self.baseline = report.aggregate

    def check(self, report: EvaluationReport) -> list[str]:
        if not self.baseline:
            return []
        regressions = []
        for metric, values in report.aggregate.items():
            baseline_mean = self.baseline[metric]["mean"]
            current_mean = values["mean"]
            if (baseline_mean - current_mean) > self.threshold:
                regressions.append(
                    f"{metric} regressed: {baseline_mean:.3f} → {current_mean:.3f}"
                )
        return regressions
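Continuing the sketch above: the baseline comes from one pipeline run, and every candidate change is checked against it (report and StubModel are from the previous example).

detector = RegressionDetector(threshold=0.05)
detector.set_baseline(report)  # baseline run from above

# After changing a prompt or swapping models, re-run the same dataset and compare.
candidate_report = EvaluationPipeline(StubModel(), dataset, EvaluationMetrics()).run()
for regression in detector.check(candidate_report):
    print(regression)  # one line per regressed metric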
CI/CD Integration
Evaluation in Pipeline
# .github/workflows/eval.yml
name: LLM Evaluation
on:
  pull_request:
    paths:
      - 'prompts/**'
      - 'src/ai/**'
jobs:
  evaluate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install Dependencies
        run: pip install -r requirements.txt  # adjust to your project's dependency setup
      - name: Run Evaluation
        run: python -m evaluation.run --dataset test_data.json
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}  # needed if the LLM judge runs in CI
      - name: Check for Regression
        run: python -m evaluation.regression_check --baseline baseline.json
      - name: Upload Report
        if: always()  # keep the report even when the regression check fails the job
        uses: actions/upload-artifact@v4
        with:
          name: evaluation-report
          path: evaluation_report.json
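The workflow assumes evaluation.regression_check exits non-zero when a regression is found, which is what fails the pull request. A minimal sketch of such an entry point; the baseline and report JSON layouts follow the pipeline above and are otherwise assumptions.

# evaluation/regression_check.py (sketch)
import argparse
import json
import sys

def main() -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument("--baseline", required=True)
    parser.add_argument("--report", default="evaluation_report.json")
    parser.add_argument("--threshold", type=float, default=0.05)
    args = parser.parse_args()

    with open(args.baseline) as f:
        baseline = json.load(f)          # expected: {metric: {"mean": ...}, ...}
    with open(args.report) as f:
        current = json.load(f)["aggregate"]

    regressions = [
        f"{metric} regressed: {baseline[metric]['mean']:.3f} -> {values['mean']:.3f}"
        for metric, values in current.items()
        if (baseline.get(metric, {}).get("mean", 0) - values["mean"]) > args.threshold
    ]
    for line in regressions:
        print(line)
    return 1 if regressions else 0       # non-zero exit fails the CI job

if __name__ == "__main__":
    sys.exit(main())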
Key Takeaways
- “Vibes-based” evaluation doesn’t scale
- Build test datasets with diverse, representative cases
- Use multiple metrics: exact match, semantic similarity, format compliance
- LLM-as-judge enables scalable quality assessment
- Run evaluation in CI/CD to catch regressions
- Combine offline (pre-deploy) and online (production) evaluation
- Human evaluation calibrates automated metrics
- Categorize test cases to identify weakness areas
- Baseline and regression detection prevent quality drift
Systematic evaluation enables systematic improvement.