Infrastructure Testing: From Unit to Production

February 17, 2020

Infrastructure as code changed how we provision systems. But code without tests is just hope. Infrastructure testing verifies that your IaC behaves as intended before it reaches production.

Here’s how to test infrastructure at every level.

Testing Pyramid for Infrastructure

              ┌───────────────┐
              │  Production   │
              │  Validation   │  (smoke tests, synthetic monitoring)
            ┌─┴───────────────┴─┐
            │    Integration    │
            │       Tests       │  (real resources, sandbox)
          ┌─┴───────────────────┴─┐
          │    Contract Tests     │
          │  (policy, compliance) │
        ┌─┴───────────────────────┴─┐
        │        Unit Tests         │
        │  (syntax, logic, mocking) │
        └───────────────────────────┘

More tests at the bottom (fast, cheap), fewer at the top (slow, expensive).
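
In a Go test suite, one lightweight way to keep that shape is to run the cheap checks on every invocation and gate the slower tiers behind Go's standard -short flag. A minimal sketch (the test bodies are placeholders):

package test

import "testing"

// Fast tier: syntax and plan-level checks, runs on every `go test ./...`.
func TestModuleValidates(t *testing.T) {
    // e.g. terraform validate / plan-only assertions against a fixture
}

// Slow tier: creates real resources, so skip it in -short mode.
func TestModuleIntegration(t *testing.T) {
    if testing.Short() {
        t.Skip("skipping integration tier in -short mode")
    }
    // e.g. terraform apply plus assertions against live infrastructure
}

CI can then run go test -short ./... on every commit and reserve the full suite for merges or a schedule.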

Unit Testing

Terraform Validation

# Built-in validation
terraform validate

# Format check
terraform fmt -check

# Custom validation rules
variable "environment" {
  type = string
  validation {
    condition     = contains(["dev", "staging", "prod"], var.environment)
    error_message = "Environment must be dev, staging, or prod."
  }
}

Terraform with Terratest

// test/vpc_test.go
package test

import (
    "testing"
    "github.com/gruntwork-io/terratest/modules/terraform"
    "github.com/stretchr/testify/assert"
)

func TestVPCCreation(t *testing.T) {
    terraformOptions := &terraform.Options{
        TerraformDir: "../modules/vpc",
        Vars: map[string]interface{}{
            "cidr_block": "10.0.0.0/16",
            "name":       "test-vpc",
        },
    }

    defer terraform.Destroy(t, terraformOptions)

    terraform.InitAndApply(t, terraformOptions)

    vpcId := terraform.Output(t, terraformOptions, "vpc_id")
    assert.NotEmpty(t, vpcId)

    cidr := terraform.Output(t, terraformOptions, "cidr_block")
    assert.Equal(t, "10.0.0.0/16", cidr)
}
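
The apply above creates a real VPC, so it sits closer to the integration tier. To stay at the unit level, a plan-only variant can check that the validation rule shown earlier rejects bad input. A sketch, assuming a module (the path is hypothetical) that declares the environment variable:

func TestEnvironmentValidationRejectsUnknownValue(t *testing.T) {
    terraformOptions := &terraform.Options{
        TerraformDir: "../modules/app", // hypothetical module using var.environment
        Vars: map[string]interface{}{
            "environment": "qa", // not in ["dev", "staging", "prod"]
        },
    }

    // Plan only: nothing is created, and the validation block should fail the plan.
    _, err := terraform.InitAndPlanE(t, terraformOptions)
    assert.Error(t, err)
}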

Kubernetes Manifest Validation

# kubectl validation
kubectl apply --dry-run=client -f manifests/

# kubeval for schema validation
kubeval --strict manifests/*.yaml

# kubeconform (faster, more maintained)
kubeconform -strict manifests/

Helm Chart Testing

# templates/tests/test-connection.yaml
apiVersion: v1
kind: Pod
metadata:
  name: "{{ .Release.Name }}-test"
  annotations:
    "helm.sh/hook": test
spec:
  containers:
    - name: test
      image: busybox
      command: ['wget']
      args: ['{{ .Release.Name }}-service:80']
  restartPolicy: Never

# Run the chart's test hooks against a deployed release
helm test my-release

Policy and Compliance Testing

Open Policy Agent (OPA)

# policy/terraform.rego
package terraform

deny[msg] {
    resource := input.resource_changes[_]
    resource.type == "aws_s3_bucket"
    not resource.change.after.versioning[0].enabled
    msg := sprintf("S3 bucket %s must have versioning enabled", [resource.name])
}

deny[msg] {
    resource := input.resource_changes[_]
    resource.type == "aws_security_group_rule"
    resource.change.after.cidr_blocks[_] == "0.0.0.0/0"
    resource.change.after.from_port == 22
    msg := "SSH must not be open to the world"
}

# Test against Terraform plan
terraform plan -out=tfplan
terraform show -json tfplan > tfplan.json
opa eval --data policy/ --input tfplan.json "data.terraform.deny"

Conftest

# Using conftest for easier OPA testing
conftest test tfplan.json --policy policy/

# For Kubernetes
conftest test deployment.yaml --policy k8s-policy/

# k8s-policy/deployment.rego
package main

deny[msg] {
    input.kind == "Deployment"
    not input.spec.template.spec.securityContext.runAsNonRoot
    msg := "Containers must run as non-root"
}

deny[msg] {
    input.kind == "Deployment"
    container := input.spec.template.spec.containers[_]
    not container.resources.limits.memory
    msg := sprintf("Container %s must have memory limits", [container.name])
}
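
These policy gates do not have to live only in CI shell steps. If you already run Terratest, the same check can run from Go via its shell module; a minimal sketch, assuming conftest is on the PATH and tfplan.json was generated as shown above:

package test

import (
    "testing"

    "github.com/gruntwork-io/terratest/modules/shell"
)

// Fails the Go test if any policy in policy/ denies the plan.
func TestTerraformPlanPassesPolicies(t *testing.T) {
    shell.RunCommand(t, shell.Command{
        Command: "conftest",
        Args:    []string{"test", "tfplan.json", "--policy", "policy/"},
    })
}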

Checkov

# Scan Terraform
checkov -d terraform/

# Scan Kubernetes
checkov -d k8s-manifests/

# Custom checks
checkov --external-checks-dir custom_checks/ -d terraform/

Integration Testing

Terratest for Real Resources

func TestEKSCluster(t *testing.T) {
    t.Parallel()

    terraformOptions := &terraform.Options{
        TerraformDir: "../modules/eks",
        Vars: map[string]interface{}{
            "cluster_name":    fmt.Sprintf("test-%s", random.UniqueId()),
            "node_count":      2,
            "instance_type":   "t3.small",
        },
    }

    defer terraform.Destroy(t, terraformOptions)
    terraform.InitAndApply(t, terraformOptions)

    kubeconfig := terraform.Output(t, terraformOptions, "kubeconfig")

    // Test cluster connectivity
    options := k8s.NewKubectlOptions("", kubeconfig, "default")

    // Verify the nodes registered and are ready
    k8s.WaitUntilAllNodesReady(t, options, 30, 10*time.Second)
    assert.Len(t, k8s.GetNodes(t, options), 2)

    // Deploy test workload
    k8s.KubectlApply(t, options, "test-deployment.yaml")
    k8s.WaitUntilDeploymentAvailable(t, options, "test-app", 60, 5*time.Second)
}

Kitchen-Terraform

# .kitchen.yml
driver:
  name: terraform
  root_module_directory: test/fixtures/default

provisioner:
  name: terraform

verifier:
  name: terraform
  systems:
    - name: default
      backend: ssh
      controls:
        - instance_checks

platforms:
  - name: aws

suites:
  - name: default

# test/integration/default/controls/instance_checks.rb
control 'instance_checks' do
  describe aws_ec2_instance(name: 'test-instance') do
    it { should exist }
    it { should be_running }
    its('instance_type') { should eq 't3.micro' }
  end
end

Kubernetes Integration Tests

func TestDeploymentIntegration(t *testing.T) {
    // Use kind for local testing. kind.CreateCluster is a project helper
    // wrapping `kind create cluster`; substitute your own bootstrap code.
    cluster := kind.CreateCluster("test-cluster")
    defer cluster.Delete()

    // Build a client-go clientset from the cluster's kubeconfig path
    kubeconfig := cluster.Kubeconfig()
    restConfig, err := clientcmd.BuildConfigFromFlags("", kubeconfig)
    require.NoError(t, err)
    clientset, err := kubernetes.NewForConfig(restConfig)
    require.NoError(t, err)

    // Apply manifests (helper wrapping `kubectl apply -f`)
    applyManifests(t, clientset, "manifests/")

    // Wait for the deployment to become available
    waitForDeployment(t, clientset, "default", "my-app", 2*time.Minute)

    // Test service connectivity
    ctx := context.Background()
    svc, err := clientset.CoreV1().Services("default").Get(ctx, "my-app", metav1.GetOptions{})
    require.NoError(t, err)
    assert.NotEmpty(t, svc.Spec.ClusterIP)

    // Port forward to the service (helper) and hit the health endpoint
    localPort := portForwardService(t, clientset, "default", "my-app", 80)
    resp := httpGet(fmt.Sprintf("http://localhost:%d/health", localPort))
    assert.Equal(t, 200, resp.StatusCode)
}

Production Validation

Smoke Tests

# smoke_test.py
import os
import sys

import requests

# Target environment, e.g. exported by the deploy pipeline
BASE_URL = os.environ.get("BASE_URL", "https://api.example.com")

def test_health_endpoint():
    response = requests.get(f"{BASE_URL}/health", timeout=5)
    assert response.status_code == 200
    assert response.json()["status"] == "healthy"

def test_database_connectivity():
    response = requests.get(f"{BASE_URL}/health/db", timeout=5)
    assert response.status_code == 200

def test_critical_endpoint():
    response = requests.get(f"{BASE_URL}/api/v1/status", timeout=5)
    assert response.status_code == 200

if __name__ == "__main__":
    try:
        test_health_endpoint()
        test_database_connectivity()
        test_critical_endpoint()
        print("All smoke tests passed")
        sys.exit(0)
    except (AssertionError, requests.RequestException) as e:
        print(f"Smoke test failed: {e}")
        sys.exit(1)

Synthetic Monitoring

# Datadog synthetic test
apiVersion: datadog/v1
kind: SyntheticTest
metadata:
  name: api-health
spec:
  type: api
  request:
    url: https://api.example.com/health
    method: GET
  assertions:
    - type: statusCode
      operator: is
      target: 200
    - type: responseTime
      operator: lessThan
      target: 500
  locations:
    - aws:us-east-1
    - aws:eu-west-1
  options:
    tick_every: 60  # seconds

Chaos Testing

# LitmusChaos experiment
apiVersion: litmuschaos.io/v1alpha1
kind: ChaosEngine
metadata:
  name: pod-delete
spec:
  engineState: 'active'
  appinfo:
    appns: 'default'
    applabel: 'app=my-app'
  chaosServiceAccount: litmus-admin
  experiments:
    - name: pod-delete
      spec:
        components:
          env:
            - name: TOTAL_CHAOS_DURATION
              value: '30'
            - name: CHAOS_INTERVAL
              value: '10'
            - name: FORCE
              value: 'false'

CI/CD Integration

GitHub Actions

name: Infrastructure Tests

on:
  pull_request:
    paths:
      - 'terraform/**'
      - 'k8s/**'

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2

      - name: Terraform Format
        run: terraform fmt -check -recursive terraform/

      - name: Validate Kubernetes
        run: kubeconform -strict k8s/

  policy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2

      - name: Checkov
        uses: bridgecrewio/checkov-action@master
        with:
          directory: terraform/

      - name: Conftest
        run: conftest test k8s/ --policy policies/

  integration:
    runs-on: ubuntu-latest
    if: github.event.pull_request.draft == false
    steps:
      - uses: actions/checkout@v2

      - name: Setup Go
        uses: actions/setup-go@v2

      - name: Run Terratest
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        run: |
          cd test
          go test -v -timeout 30m

Best Practices

Test Environment Isolation

# Use workspaces or unique naming
resource "aws_vpc" "main" {
  cidr_block = var.cidr_block

  tags = {
    Name        = "test-${var.unique_id}"
    Environment = "test"
    ManagedBy   = "terratest"
  }
}
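
On the test side, Terratest's random.UniqueId() is a convenient source for that suffix. A short sketch of wiring it into the options (the unique_id variable matches the tag above):

uniqueID := random.UniqueId() // short random string, e.g. "V2Bq7x"

terraformOptions := &terraform.Options{
    TerraformDir: "../modules/vpc",
    Vars: map[string]interface{}{
        "cidr_block": "10.0.0.0/16",
        "unique_id":  uniqueID,
    },
}

Combined with t.Parallel(), this lets several test runs share one account without name collisions.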

Cleanup Automation

// Always defer destroy
defer terraform.Destroy(t, terraformOptions)

// Or use cleanup functions
t.Cleanup(func() {
    terraform.Destroy(t, terraformOptions)
})

Cost Management

// Skip expensive tests locally
func TestExpensiveResource(t *testing.T) {
    if os.Getenv("RUN_EXPENSIVE_TESTS") != "true" {
        t.Skip("Skipping expensive test")
    }
    // ...
}

Key Takeaways

Infrastructure as code deserves testing as code. The investment prevents costly production incidents.