Infrastructure as code changed how we provision systems. But code without tests is just hope. Infrastructure testing ensures your IaC works before it hits production.
Here’s how to test infrastructure at every level.
Testing Pyramid for Infrastructure
        ┌───────────────────┐
        │    Production     │
        │    Validation     │  (smoke tests, synthetic monitoring)
      ┌─┴───────────────────┴─┐
      │  Integration Tests    │  (real resources, sandbox)
    ┌─┴───────────────────────┴─┐
    │      Contract Tests       │  (policy, compliance)
  ┌─┴───────────────────────────┴─┐
  │          Unit Tests           │  (syntax, logic, mocking)
  └───────────────────────────────┘
More tests at the bottom (fast, cheap), fewer at the top (slow, expensive).
Unit Testing
Terraform Validation
# Built-in validation
terraform validate
# Format check
terraform fmt -check
# Custom validation rules
variable "environment" {
type = string
validation {
condition = contains(["dev", "staging", "prod"], var.environment)
error_message = "Environment must be dev, staging, or prod."
}
}
Terraform with Terratest
// test/vpc_test.go
package test

import (
    "testing"

    "github.com/gruntwork-io/terratest/modules/terraform"
    "github.com/stretchr/testify/assert"
)

func TestVPCCreation(t *testing.T) {
    terraformOptions := &terraform.Options{
        TerraformDir: "../modules/vpc",
        Vars: map[string]interface{}{
            "cidr_block": "10.0.0.0/16",
            "name":       "test-vpc",
        },
    }

    defer terraform.Destroy(t, terraformOptions)
    terraform.InitAndApply(t, terraformOptions)

    vpcId := terraform.Output(t, terraformOptions, "vpc_id")
    assert.NotEmpty(t, vpcId)

    cidr := terraform.Output(t, terraformOptions, "cidr_block")
    assert.Equal(t, "10.0.0.0/16", cidr)
}
Kubernetes Manifest Validation
# kubectl validation
kubectl apply --dry-run=client -f manifests/
# kubeval for schema validation
kubeval --strict manifests/*.yaml
# kubeconform (faster, actively maintained)
kubeconform -strict manifests/
Helm Chart Testing
# templates/tests/test-connection.yaml
apiVersion: v1
kind: Pod
metadata:
  name: "{{ .Release.Name }}-test"
  annotations:
    "helm.sh/hook": test
spec:
  containers:
    - name: test
      image: busybox
      command: ['wget']
      args: ['{{ .Release.Name }}-service:80']
  restartPolicy: Never
helm test my-release
Policy and Compliance Testing
Open Policy Agent (OPA)
# policy/terraform.rego
package terraform
deny[msg] {
  resource := input.resource_changes[_]
  resource.type == "aws_s3_bucket"
  not resource.change.after.versioning[0].enabled
  msg := sprintf("S3 bucket %s must have versioning enabled", [resource.name])
}

deny[msg] {
  resource := input.resource_changes[_]
  resource.type == "aws_security_group_rule"
  resource.change.after.cidr_blocks[_] == "0.0.0.0/0"
  resource.change.after.from_port == 22
  msg := "SSH must not be open to the world"
}
# Test against Terraform plan
terraform plan -out=tfplan
terraform show -json tfplan > tfplan.json
opa eval --data policy/ --input tfplan.json "data.terraform.deny"
Conftest
# Using conftest for easier OPA testing
conftest test tfplan.json --policy policy/
# For Kubernetes
conftest test deployment.yaml --policy k8s-policy/
# k8s-policy/deployment.rego
package main
deny[msg] {
  input.kind == "Deployment"
  not input.spec.template.spec.securityContext.runAsNonRoot
  msg := "Containers must run as non-root"
}

deny[msg] {
  input.kind == "Deployment"
  container := input.spec.template.spec.containers[_]
  not container.resources.limits.memory
  msg := sprintf("Container %s must have memory limits", [container.name])
}
Checkov
# Scan Terraform
checkov -d terraform/
# Scan Kubernetes
checkov -d k8s-manifests/
# Custom checks
checkov --external-checks-dir custom_checks/ -d terraform/
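Custom checks are small Python classes that Checkov discovers in the external checks directory. Here is a minimal sketch that enforces a tagging convention; the file path, check ID, and the 'team' tag are illustrative, not part of any standard check set.

# custom_checks/s3_team_tag.py -- hypothetical custom check enforcing a 'team' tag
from checkov.common.models.enums import CheckCategories, CheckResult
from checkov.terraform.checks.resource.base_resource_check import BaseResourceCheck

class S3BucketHasTeamTag(BaseResourceCheck):
    def __init__(self):
        super().__init__(
            name="Ensure S3 buckets carry a 'team' tag",
            id="CKV_CUSTOM_1",  # illustrative ID
            categories=[CheckCategories.CONVENTION],
            supported_resources=["aws_s3_bucket"],
        )

    def scan_resource_conf(self, conf):
        # conf holds the parsed resource attributes; Checkov wraps values in lists
        tags = (conf.get("tags") or [{}])[0] or {}
        return CheckResult.PASSED if "team" in tags else CheckResult.FAILED

# Instantiating the check registers it with Checkov
check = S3BucketHasTeamTag()

Point Checkov at the directory containing this file with the --external-checks-dir flag shown above.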
Integration Testing
Terratest for Real Resources
func TestEKSCluster(t *testing.T) {
    t.Parallel()

    terraformOptions := &terraform.Options{
        TerraformDir: "../modules/eks",
        Vars: map[string]interface{}{
            "cluster_name":  fmt.Sprintf("test-%s", random.UniqueId()),
            "node_count":    2,
            "instance_type": "t3.small",
        },
    }

    defer terraform.Destroy(t, terraformOptions)
    terraform.InitAndApply(t, terraformOptions)

    kubeconfig := terraform.Output(t, terraformOptions, "kubeconfig")

    // Test cluster connectivity
    options := k8s.NewKubectlOptions("", kubeconfig, "default")

    // Verify nodes are ready
    k8s.WaitUntilNumNodesReady(t, options, 2, 30, 10*time.Second)

    // Deploy test workload
    k8s.KubectlApply(t, options, "test-deployment.yaml")
    k8s.WaitUntilDeploymentAvailable(t, options, "test-app", 60, 5*time.Second)
}
Kitchen-Terraform
# .kitchen.yml
driver:
  name: terraform
  root_module_directory: test/fixtures/default

provisioner:
  name: terraform

verifier:
  name: terraform
  systems:
    - name: default
      backend: ssh
      controls:
        - instance_checks

platforms:
  - name: aws

suites:
  - name: default
# test/integration/default/controls/instance_checks.rb
control 'instance_checks' do
  describe aws_ec2_instance(name: 'test-instance') do
    it { should exist }
    it { should be_running }
    its('instance_type') { should eq 't3.micro' }
  end
end
Kubernetes Integration Tests
func TestDeploymentIntegration(t *testing.T) {
    // Use kind for local testing. kind.CreateCluster, applyManifests,
    // waitForDeployment, and httpGet are project-local test helpers.
    cluster := kind.CreateCluster("test-cluster")
    defer cluster.Delete()

    kubeconfig := cluster.Kubeconfig()
    clientset, err := kubernetes.NewForConfig(kubeconfig)
    if err != nil {
        t.Fatal(err)
    }

    // Apply manifests
    applyManifests(t, clientset, "manifests/")

    // Wait for the deployment to become available
    waitForDeployment(t, clientset, "default", "my-app", 2*time.Minute)

    // Test service connectivity
    ctx := context.Background()
    svc, err := clientset.CoreV1().Services("default").Get(ctx, "my-app", metav1.GetOptions{})
    if err != nil {
        t.Fatal(err)
    }
    assert.NotEmpty(t, svc.Spec.ClusterIP)

    // Port forward and test
    resp := httpGet(fmt.Sprintf("http://localhost:%d/health", localPort))
    assert.Equal(t, 200, resp.StatusCode)
}
Production Validation
Smoke Tests
# smoke_test.py
import os
import sys

import requests

# Base URL of the freshly deployed environment, supplied by the pipeline
BASE_URL = os.environ["BASE_URL"]

def test_health_endpoint():
    response = requests.get(f"{BASE_URL}/health", timeout=5)
    assert response.status_code == 200
    assert response.json()["status"] == "healthy"

def test_database_connectivity():
    response = requests.get(f"{BASE_URL}/health/db", timeout=5)
    assert response.status_code == 200

def test_critical_endpoint():
    response = requests.get(f"{BASE_URL}/api/v1/status", timeout=5)
    assert response.status_code == 200

if __name__ == "__main__":
    try:
        test_health_endpoint()
        test_database_connectivity()
        test_critical_endpoint()
        print("All smoke tests passed")
        sys.exit(0)
    except AssertionError as e:
        print(f"Smoke test failed: {e}")
        sys.exit(1)
Synthetic Monitoring
# Datadog synthetic test
apiVersion: datadog/v1
kind: SyntheticTest
metadata:
  name: api-health
spec:
  type: api
  request:
    url: https://api.example.com/health
    method: GET
  assertions:
    - type: statusCode
      operator: is
      target: 200
    - type: responseTime
      operator: lessThan
      target: 500
  locations:
    - aws:us-east-1
    - aws:eu-west-1
  options:
    tick_every: 60  # seconds
Chaos Testing
# LitmusChaos experiment
apiVersion: litmuschaos.io/v1alpha1
kind: ChaosEngine
metadata:
  name: pod-delete
spec:
  engineState: 'active'
  appinfo:
    appns: 'default'
    applabel: 'app=my-app'
  chaosServiceAccount: litmus-admin
  experiments:
    - name: pod-delete
      spec:
        components:
          env:
            - name: TOTAL_CHAOS_DURATION
              value: '30'
            - name: CHAOS_INTERVAL
              value: '10'
            - name: FORCE
              value: 'false'
CI/CD Integration
GitHub Actions
name: Infrastructure Tests

on:
  pull_request:
    paths:
      - 'terraform/**'
      - 'k8s/**'

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Terraform Format
        run: terraform fmt -check -recursive terraform/
      - name: Validate Kubernetes
        run: kubeconform -strict k8s/

  policy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Checkov
        uses: bridgecrewio/checkov-action@master
        with:
          directory: terraform/
      - name: Conftest
        run: conftest test k8s/ --policy policies/

  integration:
    runs-on: ubuntu-latest
    if: github.event.pull_request.draft == false
    steps:
      - uses: actions/checkout@v2
      - name: Setup Go
        uses: actions/setup-go@v2
      - name: Run Terratest
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        run: |
          cd test
          go test -v -timeout 30m
Best Practices
Test Environment Isolation
# Use workspaces or unique naming
resource "aws_vpc" "main" {
cidr_block = var.cidr_block
tags = {
Name = "test-${var.unique_id}"
Environment = "test"
ManagedBy = "terratest"
}
}
Cleanup Automation
// Always defer destroy
defer terraform.Destroy(t, terraformOptions)
// Or use cleanup functions
t.Cleanup(func() {
    terraform.Destroy(t, terraformOptions)
})
Cost Management
// Skip expensive tests locally
func TestExpensiveResource(t *testing.T) {
    if os.Getenv("RUN_EXPENSIVE_TESTS") != "true" {
        t.Skip("Skipping expensive test")
    }
    // ...
}
Key Takeaways
- Infrastructure testing follows a pyramid: unit → policy → integration → production
- Unit tests (validation, formatting) should run on every commit
- Policy tests (OPA, Checkov) catch security and compliance issues early
- Integration tests with real resources are essential but expensive
- Use tools like Terratest for Terraform, kubeconform for Kubernetes
- Smoke tests after deployment catch environmental issues
- Chaos testing validates resilience assumptions
- Always automate cleanup to avoid resource leaks
- Run expensive tests in CI, not on every local change
- Treat infrastructure test code with the same rigor as application tests
Infrastructure as code deserves testing as code. The investment prevents costly production incidents.