AI systems behave differently in production than in testing. Real user inputs are messier, edge cases are weirder, and the distribution shifts over time. Safe production testing is essential for maintaining quality.
Here’s how to test AI systems in production without putting users at risk.
Why Production Testing
The Testing Gap
testing_gap:
  pre_production:
    - Clean test data
    - Known scenarios
    - Limited volume
    - Controlled environment
  production:
    - Messy real inputs
    - Unknown edge cases
    - Scale and load
    - Changing context
  implications:
    - Pre-prod passing doesn't guarantee prod quality
    - Issues emerge only at scale
    - User behavior differs from test scenarios
Production Testing Strategies
Shadow Mode
import asyncio


class ShadowTester:
    """Run a new model in shadow alongside production."""

    async def handle_request(
        self,
        request: Request
    ) -> Response:
        # Production model serves the user
        production_task = asyncio.create_task(
            self.production_model.generate(request)
        )
        # Shadow model runs on the same input but never serves the user
        shadow_task = asyncio.create_task(
            self.shadow_model.generate(request)
        )
        # Wait only for production, so user-facing latency is unaffected
        production_response = await production_task
        # Compare and log in the background without blocking the response
        # (in real code, keep a reference to this task so it isn't
        # garbage-collected before it finishes)
        asyncio.create_task(
            self._compare_and_log(
                request,
                production_response,
                shadow_task
            )
        )
        return production_response

    async def _compare_and_log(
        self,
        request: Request,
        production: Response,
        shadow_task: asyncio.Task
    ):
        try:
            shadow_response = await asyncio.wait_for(
                shadow_task, timeout=30
            )
            comparison = await self._compare_responses(
                production, shadow_response
            )
            await self.log_comparison(
                request=request,
                production=production,
                shadow=shadow_response,
                comparison=comparison
            )
            if comparison.significant_difference:
                await self.alert_difference(comparison)
        except asyncio.TimeoutError:
            await self.log_shadow_timeout(request)
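The comparison itself is left abstract above. Here's a minimal sketch of _compare_responses using token overlap as a stand-in for a real semantic or rubric-based check; the Comparison dataclass, the .text attribute on responses, and the 0.6 threshold are illustrative assumptions, not part of the interface above.

from dataclasses import dataclass


@dataclass
class Comparison:
    similarity: float
    significant_difference: bool


async def _compare_responses(
    self,
    production: Response,
    shadow: Response,
    threshold: float = 0.6  # Illustrative cutoff; tune per use case
) -> Comparison:
    # Crude token-overlap (Jaccard) similarity as a placeholder for a
    # proper semantic or rubric-based comparison
    prod_tokens = set(production.text.lower().split())
    shadow_tokens = set(shadow.text.lower().split())
    union = prod_tokens | shadow_tokens
    similarity = len(prod_tokens & shadow_tokens) / len(union) if union else 1.0
    return Comparison(
        similarity=similarity,
        significant_difference=similarity < threshold
    )

In practice an embedding similarity or LLM-as-judge comparison gives a much better signal, but the shape of the interface stays the same.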
Canary Deployments
from datetime import datetime


class CanaryDeployer:
    """Gradual rollout with automatic rollback."""

    async def deploy_canary(
        self,
        new_model: Model,
        canary_percentage: float = 5.0
    ) -> DeploymentResult:
        deployment = CanaryDeployment(
            new_model=new_model,
            percentage=canary_percentage,
            started_at=datetime.utcnow()
        )
        # Monitor while the canary ramps up
        while deployment.is_active:
            metrics = await self._collect_metrics(deployment)
            if self._should_rollback(metrics):
                await self._rollback(deployment)
                return DeploymentResult(
                    status="rolled_back",
                    reason=metrics.failure_reason
                )
            if self._should_promote(metrics, deployment):
                deployment.percentage = min(
                    deployment.percentage * 2,
                    100
                )
                if deployment.percentage >= 100:
                    break  # Fully rolled out
            await asyncio.sleep(60)  # Check every minute
        return DeploymentResult(status="promoted")

    def _should_rollback(self, metrics: CanaryMetrics) -> bool:
        return (
            metrics.error_rate > 0.05 or        # More than 5% errors
            metrics.latency_p99 > 30000 or      # p99 latency above 30s (ms)
            metrics.quality_score < 0.8
        )
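_should_promote is the other half of that loop. A minimal sketch, assuming CanaryMetrics exposes a sample_size field and that a fixed sample count and bake time are enough to advance; both numbers are placeholders to tune.

from datetime import datetime, timedelta


def _should_promote(
    self,
    metrics: CanaryMetrics,
    deployment: CanaryDeployment
) -> bool:
    # Advance only when the canary has seen enough traffic, has been
    # running long enough, and is nowhere near a rollback threshold
    enough_samples = metrics.sample_size >= 500  # Placeholder minimum
    baked = datetime.utcnow() - deployment.started_at > timedelta(minutes=30)
    healthy = not self._should_rollback(metrics)
    return enough_samples and baked and healthy

Measuring bake time per promotion step, rather than from deployment start, is stricter; this sketch keeps it simple.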
Feature Flags
class AIFeatureFlags:
    """Control AI features with flags."""

    async def should_use_new_model(
        self,
        user_id: str,
        feature: str
    ) -> bool:
        flag = await self.flags.get(f"ai_model_{feature}")
        if flag.status == "off":
            return False
        if flag.status == "percentage":
            return self._in_percentage(user_id, flag.percentage)
        if flag.status == "allowlist":
            return user_id in flag.allowed_users
        return True

    async def generate_with_flag(
        self,
        request: Request,
        feature: str
    ) -> Response:
        use_new = await self.should_use_new_model(
            request.user_id, feature
        )
        model = self.new_model if use_new else self.current_model
        response = await model.generate(request)
        response.metadata["model_version"] = model.version
        return response
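Percentage rollouts only behave well if each user is bucketed consistently. Here's one way _in_percentage might work, sketched with a stable hash so the same user stays on the same side of the flag as the rollout ramps up.

import hashlib


def _in_percentage(self, user_id: str, percentage: float) -> bool:
    # Hash user_id into a stable 0-99 bucket; users below the rollout
    # percentage get the new model, and the set only grows as it ramps up
    digest = hashlib.sha256(user_id.encode("utf-8")).hexdigest()
    bucket = int(digest[:8], 16) % 100
    return bucket < percentage

Salting the hash with the feature name would keep different flags from always selecting the same users; this sketch keys on user_id alone to match the signature above.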
A/B Testing
class AIABTest:
    """A/B test AI variants."""

    async def run_test(
        self,
        test_name: str,
        request: Request
    ) -> Response:
        # Assign the user to a variant
        variant = self._assign_variant(
            test_name,
            request.user_id
        )
        # Generate with the assigned variant
        model = self.variants[variant]
        response = await model.generate(request)
        # Track the exposure for later analysis
        await self.track_event(
            test_name=test_name,
            variant=variant,
            user_id=request.user_id,
            response_id=response.id
        )
        return response

    async def analyze_test(self, test_name: str) -> ABTestResult:
        data = await self.get_test_data(test_name)
        metrics_by_variant = {}
        for variant in data.variants:
            variant_data = data.filter(variant=variant)
            metrics_by_variant[variant] = {
                "quality_score": variant_data.mean("quality_score"),
                "user_satisfaction": variant_data.mean("satisfaction"),
                "task_completion": variant_data.mean("completed"),
                "sample_size": len(variant_data)
            }
        winner = self._statistical_analysis(metrics_by_variant)
        return ABTestResult(
            metrics=metrics_by_variant,
            winner=winner,
            confidence=winner.confidence if winner else None
        )
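Variant assignment should be deterministic, so a returning user sees the same variant without any stored state. A sketch of _assign_variant that hashes the test name together with the user id, giving each test an independent, even split.

import hashlib


def _assign_variant(self, test_name: str, user_id: str) -> str:
    # Sorted for a stable order; hashing test_name alongside user_id
    # decorrelates assignments across concurrent tests
    variants = sorted(self.variants.keys())
    digest = hashlib.sha256(f"{test_name}:{user_id}".encode()).hexdigest()
    return variants[int(digest[:8], 16) % len(variants)]

Weighted splits work the same way: map the bucket onto cumulative variant weights instead of taking a plain modulo.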
Safety Mechanisms
production_testing_safety:
  guardrails:
    - Maximum canary percentage
    - Automatic rollback triggers
    - Human approval for promotion
    - Rate limiting on new code paths
  monitoring:
    - Real-time quality metrics
    - Error rate dashboards
    - User feedback collection
    - Cost tracking
  rollback:
    - One-click rollback
    - Automatic on threshold breach
    - Clear ownership
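These guardrails are easier to enforce when they live in one place instead of being scattered across deployment code. A sketch of a rollback policy object with thresholds mirroring the canary example above; every number here is illustrative.

from dataclasses import dataclass


@dataclass
class RollbackPolicy:
    # Illustrative thresholds; tune them to your traffic and SLOs
    max_error_rate: float = 0.05
    max_latency_p99_ms: float = 30_000
    min_quality_score: float = 0.8
    max_canary_percentage: float = 50.0  # Beyond this, require human approval


def breaches_policy(metrics: CanaryMetrics, policy: RollbackPolicy) -> bool:
    # Any single breached threshold is enough to trigger automatic rollback
    return (
        metrics.error_rate > policy.max_error_rate
        or metrics.latency_p99 > policy.max_latency_p99_ms
        or metrics.quality_score < policy.min_quality_score
    )

Wiring _should_rollback to a policy object like this keeps the thresholds reviewable and versioned alongside the model.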
Key Takeaways
- Pre-production testing is necessary but insufficient
- Shadow mode tests without risk
- Canary deployments catch issues early
- Feature flags enable granular control
- A/B testing validates improvements
- Always have automatic rollback
- Monitor quality, not just errors
- Production testing is continuous
- Start small, expand carefully
Test in production safely. It’s the only way to know for sure.