Observability-Driven Development

June 14, 2021

Observability is often added after systems are built—and it shows. Logs are inconsistent, metrics are missing, and debugging production issues becomes archaeology. Observability-driven development flips this: build observability in from the start.

Here’s how to make observability a core development practice.

The Problem with Bolted-On Observability

What Goes Wrong

common_issues:
  inconsistent_logging:
    - Different formats per service
    - Missing context (request ID, user ID)
    - Inconsistent log levels
    - Important events not logged

  missing_metrics:
    - No business metrics
    - Only infrastructure metrics
    - Missing latency percentiles
    - No error breakdowns

  poor_tracing:
    - Traces don't cross service boundaries
    - Missing span context
    - No correlation with logs/metrics
    - Critical paths not instrumented

  result:
    - Hours to debug production issues
    - Can't answer business questions
    - Reactive instead of proactive

Observability First

Design for Observability

Before writing code, define what you need to observe:

observability_design:
  service: order-service

  key_operations:
    - name: create_order
      success_metrics:
        - orders_created_total
        - order_value_histogram
      failure_metrics:
        - order_creation_failures_total
      latency_targets:
        p50: 100ms
        p99: 500ms
      required_traces:
        - payment_processing
        - inventory_check
        - notification_send

    - name: get_order
      success_metrics:
        - orders_retrieved_total
      latency_targets:
        p50: 20ms
        p99: 100ms

  business_questions:
    - How many orders per hour?
    - What's the average order value?
    - Which payment methods are failing?
    - Where is latency coming from?

The Three Pillars

┌─────────────────────────────────────────────────────────────────┐
│                      Observability                               │
├───────────────────┬───────────────────┬─────────────────────────┤
│       Logs        │      Metrics      │        Traces           │
│                   │                   │                         │
│  What happened    │  How much/many    │  Request flow           │
│  Event details    │  Aggregated data  │  Across services        │
│  Debugging        │  Alerting         │  Latency breakdown      │
│                   │                   │                         │
│  Structured JSON  │  Prometheus       │  OpenTelemetry          │
│  Request context  │  Counters/Gauges  │  Spans and traces       │
│  Correlation IDs  │  Histograms       │  Context propagation    │
└───────────────────┴───────────────────┴─────────────────────────┘

Structured Logging

Log Structure

// Structured logging with context
type Logger struct {
    logger *zap.Logger
}

func (l *Logger) WithContext(ctx context.Context) *zap.Logger {
    fields := []zap.Field{}

    if requestID := ctx.Value("request_id"); requestID != nil {
        fields = append(fields, zap.String("request_id", requestID.(string)))
    }
    if userID := ctx.Value("user_id"); userID != nil {
        fields = append(fields, zap.String("user_id", userID.(string)))
    }
    if traceID := ctx.Value("trace_id"); traceID != nil {
        fields = append(fields, zap.String("trace_id", traceID.(string)))
    }

    return l.logger.With(fields...)
}

// Usage
func (s *OrderService) CreateOrder(ctx context.Context, order Order) error {
    log := s.logger.WithContext(ctx)

    log.Info("creating order",
        zap.String("customer_id", order.CustomerID),
        zap.Float64("total", order.Total),
    )

    if err := s.validate(order); err != nil {
        log.Warn("order validation failed",
            zap.Error(err),
            zap.Any("validation_errors", err.Details()),
        )
        return err
    }

    // Process order...

    log.Info("order created",
        zap.String("order_id", order.ID),
        zap.Duration("processing_time", time.Since(start)),
    )

    return nil
}

Log Levels

log_levels:
  debug:
    - Detailed debugging info
    - Not enabled in production
    - Example: "checking inventory for SKU-123"

  info:
    - Normal operations
    - Key business events
    - Example: "order created", "user logged in"

  warn:
    - Unusual but handled situations
    - Potential problems
    - Example: "retry succeeded after 2 attempts"

  error:
    - Failures that need attention
    - Include error details and context
    - Example: "payment processing failed"

  fatal:
    - Application cannot continue
    - Triggers shutdown
    - Example: "database connection failed"

Metrics

Metric Types

import (
    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promauto"
)

var (
    // Counter - monotonically increasing
    ordersCreated = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "orders_created_total",
            Help: "Total number of orders created",
        },
        []string{"status", "payment_method"},
    )

    // Histogram - distribution of values
    orderLatency = promauto.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "order_creation_duration_seconds",
            Help:    "Order creation latency",
            Buckets: []float64{.01, .05, .1, .25, .5, 1, 2.5, 5},
        },
        []string{"status"},
    )

    // Gauge - current value
    ordersInProgress = promauto.NewGauge(
        prometheus.GaugeOpts{
            Name: "orders_in_progress",
            Help: "Number of orders currently being processed",
        },
    )

    // Summary - percentiles (calculated client-side)
    orderValue = promauto.NewSummaryVec(
        prometheus.SummaryOpts{
            Name:       "order_value_dollars",
            Help:       "Order values in dollars",
            Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
        },
        []string{"customer_tier"},
    )
)

func (s *OrderService) CreateOrder(ctx context.Context, order Order) error {
    timer := prometheus.NewTimer(orderLatency.WithLabelValues("pending"))
    ordersInProgress.Inc()
    defer ordersInProgress.Dec()

    err := s.processOrder(ctx, order)

    if err != nil {
        ordersCreated.WithLabelValues("failed", order.PaymentMethod).Inc()
        timer.ObserveDuration()
        return err
    }

    ordersCreated.WithLabelValues("success", order.PaymentMethod).Inc()
    orderValue.WithLabelValues(order.CustomerTier).Observe(order.Total)
    timer.ObserveDuration()

    return nil
}

RED and USE Methods

# RED Method - for services
red_metrics:
  rate: requests per second
  errors: failed requests per second
  duration: distribution of latency

# USE Method - for resources
use_metrics:
  utilization: percentage of time busy
  saturation: queue depth / backlog
  errors: error events

# Combined example
service_metrics:
  - http_requests_total (rate)
  - http_request_duration_seconds (duration)
  - http_requests_failed_total (errors)

resource_metrics:
  - cpu_usage_percent (utilization)
  - request_queue_length (saturation)
  - connection_errors_total (errors)

Distributed Tracing

OpenTelemetry Integration

import (
    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/trace"
)

var tracer = otel.Tracer("order-service")

func (s *OrderService) CreateOrder(ctx context.Context, order Order) error {
    ctx, span := tracer.Start(ctx, "CreateOrder",
        trace.WithAttributes(
            attribute.String("customer_id", order.CustomerID),
            attribute.Float64("order_total", order.Total),
        ),
    )
    defer span.End()

    // Validate order
    if err := s.validate(ctx, order); err != nil {
        span.RecordError(err)
        span.SetStatus(codes.Error, "validation failed")
        return err
    }

    // Process payment (creates child span)
    if err := s.paymentService.Process(ctx, order); err != nil {
        span.RecordError(err)
        span.SetStatus(codes.Error, "payment failed")
        return err
    }

    // Reserve inventory (creates child span)
    if err := s.inventoryService.Reserve(ctx, order.Items); err != nil {
        span.RecordError(err)
        span.SetStatus(codes.Error, "inventory reservation failed")
        return err
    }

    span.SetAttributes(attribute.String("order_id", order.ID))
    span.SetStatus(codes.Ok, "order created")

    return nil
}

func (s *PaymentService) Process(ctx context.Context, order Order) error {
    ctx, span := tracer.Start(ctx, "ProcessPayment")
    defer span.End()

    // Payment processing...

    return nil
}

Context Propagation

// HTTP client with trace context
func (c *Client) DoRequest(ctx context.Context, req *http.Request) (*http.Response, error) {
    // Inject trace context into headers
    otel.GetTextMapPropagator().Inject(ctx, propagation.HeaderCarrier(req.Header))

    return c.httpClient.Do(req)
}

// HTTP server extracting trace context
func TracingMiddleware(next http.Handler) http.Handler {
    return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        ctx := otel.GetTextMapPropagator().Extract(r.Context(),
            propagation.HeaderCarrier(r.Header))

        ctx, span := tracer.Start(ctx, r.URL.Path)
        defer span.End()

        next.ServeHTTP(w, r.WithContext(ctx))
    })
}

Correlation

Connecting Logs, Metrics, and Traces

// Middleware that establishes correlation
func ObservabilityMiddleware(next http.Handler) http.Handler {
    return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        // Get or create request ID
        requestID := r.Header.Get("X-Request-ID")
        if requestID == "" {
            requestID = uuid.New().String()
        }

        // Extract trace context
        ctx := otel.GetTextMapPropagator().Extract(r.Context(),
            propagation.HeaderCarrier(r.Header))

        // Start span
        ctx, span := tracer.Start(ctx, r.URL.Path)
        defer span.End()

        // Add to context
        ctx = context.WithValue(ctx, "request_id", requestID)
        ctx = context.WithValue(ctx, "trace_id", span.SpanContext().TraceID().String())

        // Wrapped response writer for status code
        wrapped := &responseWriter{ResponseWriter: w, statusCode: 200}

        start := time.Now()
        next.ServeHTTP(wrapped, r.WithContext(ctx))
        duration := time.Since(start)

        // Log with correlation
        logger.Info("request completed",
            zap.String("request_id", requestID),
            zap.String("trace_id", span.SpanContext().TraceID().String()),
            zap.String("method", r.Method),
            zap.String("path", r.URL.Path),
            zap.Int("status", wrapped.statusCode),
            zap.Duration("duration", duration),
        )

        // Metrics with exemplar (trace link)
        requestDuration.WithLabelValues(r.Method, r.URL.Path).
            (Observe(duration.Seconds()))
    })
}

Development Workflow

Observability in CI/CD

# GitHub Actions
name: CI

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Run tests
        run: go test ./...

      - name: Check observability coverage
        run: |
          # Verify all handlers have metrics
          ./scripts/check-metrics-coverage.sh

          # Verify structured logging
          ./scripts/check-logging-standards.sh

          # Verify tracing instrumentation
          ./scripts/check-tracing-coverage.sh

Local Development

# docker-compose for local observability stack
version: '3'
services:
  app:
    build: .
    environment:
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
      - LOG_FORMAT=json

  otel-collector:
    image: otel/opentelemetry-collector:latest
    volumes:
      - ./otel-config.yaml:/etc/otel/config.yaml
    ports:
      - "4317:4317"

  jaeger:
    image: jaegertracing/all-in-one:latest
    ports:
      - "16686:16686"

  prometheus:
    image: prom/prometheus:latest
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
    ports:
      - "9090:9090"

  grafana:
    image: grafana/grafana:latest
    ports:
      - "3000:3000"

Key Takeaways

Observability isn’t a feature—it’s a capability that enables everything else. Build it in from the start.