Testing AI in Production

April 14, 2025

AI systems behave differently in production than in testing. Real user inputs are messier, edge cases are weirder, and the distribution shifts over time. Safe production testing is essential for maintaining quality.

Here’s how to test AI systems in production.

Why Test in Production

The Testing Gap

testing_gap:
  pre_production:
    - Clean test data
    - Known scenarios
    - Limited volume
    - Controlled environment

  production:
    - Messy real inputs
    - Unknown edge cases
    - Scale and load
    - Changing context

  implications:
    - Pre-prod passing doesn't guarantee prod quality
    - Issues emerge only at scale
    - User behavior differs from test scenarios

Production Testing Strategies

Shadow Mode

import asyncio


class ShadowTester:
    """Run a new model in shadow mode alongside the production model."""

    async def handle_request(
        self,
        request: Request
    ) -> Response:
        # Production model serves user
        production_task = asyncio.create_task(
            self.production_model.generate(request)
        )

        # Shadow model runs but doesn't serve
        shadow_task = asyncio.create_task(
            self.shadow_model.generate(request)
        )

        # Wait for production (user-facing)
        production_response = await production_task

        # Compare and log in the background so the user isn't blocked.
        # Keep a reference (self._pending_logs is a set) so the task
        # isn't garbage-collected before it finishes.
        log_task = asyncio.create_task(
            self._compare_and_log(
                request,
                production_response,
                shadow_task
            )
        )
        self._pending_logs.add(log_task)
        log_task.add_done_callback(self._pending_logs.discard)

        return production_response

    async def _compare_and_log(
        self,
        request: Request,
        production: Response,
        shadow_task: asyncio.Task
    ):
        try:
            shadow_response = await asyncio.wait_for(
                shadow_task, timeout=30
            )

            comparison = await self._compare_responses(
                production, shadow_response
            )

            await self.log_comparison(
                request=request,
                production=production,
                shadow=shadow_response,
                comparison=comparison
            )

            if comparison.significant_difference:
                await self.alert_difference(comparison)

        except asyncio.TimeoutError:
            await self.log_shadow_timeout(request)

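The value of shadow mode comes from the comparison step. Below is a minimal sketch of that step, assuming responses expose text and latency_ms fields; the ComparisonResult dataclass and the similarity and latency thresholds are illustrative choices, not part of the class above. A lexical diff like this is cheap, and an embedding distance or LLM-as-judge pass can be layered on top when semantic drift matters.

from dataclasses import dataclass
from difflib import SequenceMatcher


@dataclass
class ComparisonResult:
    latency_delta_ms: float
    text_similarity: float
    significant_difference: bool


def compare_responses(production, shadow) -> ComparisonResult:
    """Compare a shadow response against the production response."""
    # Positive delta means the shadow model was slower
    latency_delta = shadow.latency_ms - production.latency_ms

    # Cheap lexical similarity in [0, 1]; misses semantic-only differences
    similarity = SequenceMatcher(
        None, production.text, shadow.text
    ).ratio()

    return ComparisonResult(
        latency_delta_ms=latency_delta,
        text_similarity=similarity,
        # Thresholds are illustrative; tune them per use case
        significant_difference=similarity < 0.7 or latency_delta > 5_000,
    )
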
Canary Deployments

import asyncio
from datetime import datetime, timezone


class CanaryDeployer:
    """Gradual rollout with automatic rollback."""

    async def deploy_canary(
        self,
        new_model: Model,
        canary_percentage: float = 5.0
    ) -> DeploymentResult:
        deployment = CanaryDeployment(
            new_model=new_model,
            percentage=canary_percentage,
            started_at=datetime.now(timezone.utc)
        )

        # Monitor during canary
        while deployment.is_active:
            metrics = await self._collect_metrics(deployment)

            if self._should_rollback(metrics):
                await self._rollback(deployment)
                return DeploymentResult(
                    status="rolled_back",
                    reason=metrics.failure_reason
                )

            if self._should_promote(metrics, deployment):
                # Double the canary traffic, capped at full rollout
                deployment.percentage = min(
                    deployment.percentage * 2,
                    100
                )
                if deployment.percentage >= 100:
                    # Fully ramped: end the canary phase
                    deployment.is_active = False

            await asyncio.sleep(60)  # Check every minute

        return DeploymentResult(status="promoted")

    def _should_rollback(self, metrics: CanaryMetrics) -> bool:
        # Any single breach triggers an automatic rollback
        return (
            metrics.error_rate > 0.05 or       # more than 5% errors
            metrics.latency_p99 > 30_000 or    # p99 latency above 30 seconds (ms)
            metrics.quality_score < 0.8
        )

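The deployer above decides how far to ramp, but not how requests are actually split. A common approach is to hash a stable identifier so each user consistently hits the same model during the rollout. The router below is a sketch under that assumption; it reuses the deployment object's percentage and is_active fields, and everything else is illustrative.

import hashlib


class CanaryRouter:
    """Route a stable slice of traffic to the canary model."""

    def __init__(self, production_model, canary_model, deployment):
        self.production_model = production_model
        self.canary_model = canary_model
        self.deployment = deployment  # exposes .percentage and .is_active

    def _bucket(self, user_id: str) -> float:
        # Hash the user ID into a stable bucket in [0, 100)
        digest = hashlib.sha256(user_id.encode()).hexdigest()
        return int(digest[:8], 16) % 10_000 / 100

    async def generate(self, request):
        use_canary = (
            self.deployment.is_active
            and self._bucket(request.user_id) < self.deployment.percentage
        )
        model = self.canary_model if use_canary else self.production_model

        response = await model.generate(request)
        # Tag the response so canary metrics can be attributed later
        response.metadata["canary"] = use_canary
        return response
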
Feature Flags

class AIFeatureFlags:
    """Control AI features with flags."""

    async def should_use_new_model(
        self,
        user_id: str,
        feature: str
    ) -> bool:
        flag = await self.flags.get(f"ai_model_{feature}")

        if flag.status == "off":
            return False

        if flag.status == "percentage":
            return self._in_percentage(user_id, flag.percentage)

        if flag.status == "allowlist":
            return user_id in flag.allowed_users

        return True

    async def generate_with_flag(
        self,
        request: Request,
        feature: str
    ) -> Response:
        use_new = await self.should_use_new_model(
            request.user_id, feature
        )

        model = self.new_model if use_new else self.current_model

        response = await model.generate(request)
        response.metadata["model_version"] = model.version

        return response

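Percentage flags only behave well if a given user gets the same answer on every request. Here is a sketch of the bucketing behind _in_percentage, written as a standalone helper: hash the user ID into a fixed bucket instead of sampling randomly. The hashing scheme is one reasonable choice, not the only one; salting the input with the feature name would also keep the same users from being first into every rollout.

import hashlib


def in_percentage(user_id: str, percentage: float) -> bool:
    """Deterministically place user_id inside a rollout percentage."""
    # Hash so the same user always lands in the same bucket in [0, 100)
    digest = hashlib.sha256(user_id.encode()).hexdigest()
    bucket = int(digest[:8], 16) % 10_000 / 100
    return bucket < percentage


# Stable across calls; roughly 10% of user IDs return True
in_percentage("user-123", 10.0)
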
A/B Testing

class AIABTest:
    """A/B test AI variants."""

    async def run_test(
        self,
        test_name: str,
        request: Request
    ) -> Response:
        # Assign user to variant
        variant = self._assign_variant(
            test_name,
            request.user_id
        )

        # Generate with variant
        model = self.variants[variant]
        response = await model.generate(request)

        # Track for analysis
        await self.track_event(
            test_name=test_name,
            variant=variant,
            user_id=request.user_id,
            response_id=response.id
        )

        return response

    async def analyze_test(self, test_name: str) -> ABTestResult:
        data = await self.get_test_data(test_name)

        metrics_by_variant = {}
        for variant in data.variants:
            variant_data = data.filter(variant=variant)
            metrics_by_variant[variant] = {
                "quality_score": variant_data.mean("quality_score"),
                "user_satisfaction": variant_data.mean("satisfaction"),
                "task_completion": variant_data.mean("completed"),
                "sample_size": len(variant_data)
            }

        winner = self._statistical_analysis(metrics_by_variant)

        return ABTestResult(
            metrics=metrics_by_variant,
            winner=winner,
            confidence=winner.confidence if winner else None
        )

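Declaring a winner takes a significance test, not just a comparison of means. Here is a minimal sketch for a two-variant test on a binary metric such as task completion, using a two-proportion z-test with the normal approximation so it needs nothing beyond the standard library. The 95% cut-off at the end is a common but arbitrary choice, and a real _statistical_analysis would also handle more than two variants and continuous metrics.

import math


def two_proportion_z_test(
    successes_a: int, total_a: int,
    successes_b: int, total_b: int,
) -> tuple[float, float]:
    """Return (z, two-sided p-value) for the difference in rates."""
    p_a = successes_a / total_a
    p_b = successes_b / total_b

    # Pooled rate under the null hypothesis of no difference
    pooled = (successes_a + successes_b) / (total_a + total_b)
    se = math.sqrt(pooled * (1 - pooled) * (1 / total_a + 1 / total_b))
    if se == 0:
        return 0.0, 1.0

    z = (p_b - p_a) / se
    # Two-sided p-value from the standard normal distribution
    p_value = math.erfc(abs(z) / math.sqrt(2))
    return z, p_value


# Example: variant B completes 540/1000 tasks vs. 500/1000 for A
z, p = two_proportion_z_test(500, 1000, 540, 1000)
declare_winner = p < 0.05  # only call it at 95% confidence
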
Safety Mechanisms

production_testing_safety:
  guardrails:
    - Maximum canary percentage
    - Automatic rollback triggers
    - Human approval for promotion
    - Rate limiting on new code paths

  monitoring:
    - Real-time quality metrics
    - Error rate dashboards
    - User feedback collection
    - Cost tracking

  rollback:
    - One-click rollback
    - Automatic on threshold breach
    - Clear ownership

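Guardrails like these are easiest to keep honest when they are enforced in code rather than in a runbook. Below is a small sketch of a promotion gate that caps the canary percentage and holds the rollout until a human approves crossing a threshold; the GuardrailConfig fields and numbers are illustrative.

from dataclasses import dataclass


@dataclass
class GuardrailConfig:
    max_canary_percentage: float = 50.0   # never auto-ramp beyond this
    require_approval_above: float = 25.0  # human sign-off past this point


def next_canary_percentage(
    current: float,
    approved_by_human: bool,
    config: GuardrailConfig,
) -> float:
    """Compute the next rollout step without exceeding the guardrails."""
    proposed = min(current * 2, config.max_canary_percentage)

    # Hold the rollout until a human approves crossing the gate
    if proposed > config.require_approval_above and not approved_by_human:
        return current

    return proposed
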
Key Takeaways

Test in production, but do it safely: shadow deployments, canaries, feature flags, and A/B tests, backed by real-time monitoring and fast rollback. It’s the only way to learn how your AI system actually behaves with real users at real scale.