Distributed Systems Patterns That Actually Work

May 30, 2022

Distributed systems introduce complexity that doesn’t exist in monoliths: network failures, partial failures, ordering issues, and consistency challenges.

Here are the distributed systems patterns that are battle-tested and actually hold up in production.

Handling Failure

Circuit Breaker

circuit_breaker:
  purpose: Prevent cascade failures
  states:
    closed: Normal operation, requests pass through
    open: Failures exceeded threshold, requests fail fast
    half_open: Test if service recovered

  benefits:
    - Fail fast instead of waiting
    - Give failing service time to recover
    - Prevent resource exhaustion

type State int

const (
    Closed State = iota
    Open
    HalfOpen
)

var ErrCircuitOpen = errors.New("circuit breaker open")

type CircuitBreaker struct {
    failures    int           // consecutive failures
    threshold   int           // failures before tripping open
    state       State
    lastFailure time.Time
    timeout     time.Duration // how long to stay open before probing
    mu          sync.Mutex
}

func (cb *CircuitBreaker) Execute(fn func() error) error {
    cb.mu.Lock()
    switch cb.state {
    case Open:
        if time.Since(cb.lastFailure) > cb.timeout {
            cb.state = HalfOpen
        } else {
            cb.mu.Unlock()
            return ErrCircuitOpen // fail fast
        }
    case HalfOpen:
        // Allow the probe request through. (A production breaker
        // would also cap concurrent half-open probes.)
    }
    cb.mu.Unlock()

    // Run fn without holding the lock; otherwise the breaker
    // would serialize every call through it.
    err := fn()

    cb.mu.Lock()
    defer cb.mu.Unlock()

    if err != nil {
        cb.failures++
        cb.lastFailure = time.Now()
        if cb.failures >= cb.threshold {
            cb.state = Open
        }
        return err
    }

    // Success resets the breaker
    cb.failures = 0
    cb.state = Closed
    return nil
}
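
A usage sketch; the threshold and timeout values, and callPaymentService, are illustrative stand-ins, not from the original:

// Usage: wrap calls to a flaky downstream service
var paymentCB = &CircuitBreaker{
    threshold: 5,
    timeout:   30 * time.Second,
}

func chargeCustomer(ctx context.Context, amount int64) error {
    err := paymentCB.Execute(func() error {
        return callPaymentService(ctx, amount) // hypothetical remote call
    })
    if errors.Is(err, ErrCircuitOpen) {
        // Fail fast here: serve a fallback or queue the work for later
    }
    return err
}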

Retry with Exponential Backoff

func RetryWithBackoff(fn func() error, maxRetries int) error {
    var err error
    for i := 0; i < maxRetries; i++ {
        err = fn()
        if err == nil {
            return nil
        }

        if !isRetryable(err) {
            return err
        }

        if i == maxRetries-1 {
            break // don't sleep after the final attempt
        }

        // Exponential backoff with jitter to avoid thundering herds
        backoff := time.Duration(math.Pow(2, float64(i))) * 100 * time.Millisecond
        jitter := time.Duration(rand.Int63n(int64(backoff / 2)))
        time.Sleep(backoff + jitter)
    }
    return fmt.Errorf("max retries exceeded: %w", err)
}

func isRetryable(err error) bool {
    // Retry on transient errors
    return errors.Is(err, ErrTimeout) ||
           errors.Is(err, ErrServiceUnavailable)
    // Don't retry on: validation errors, auth errors
}
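
A usage sketch (fetchInventory is a hypothetical idempotent read). Only retry operations that are safe to repeat, which is exactly what the Idempotency section below is about:

// Usage: retry an idempotent read up to 5 times
func loadInventory(ctx context.Context, sku string) error {
    return RetryWithBackoff(func() error {
        return fetchInventory(ctx, sku) // hypothetical helper
    }, 5)
}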

Bulkhead

// Isolate failures by limiting concurrent operations
var ErrBulkheadFull = errors.New("bulkhead at capacity")

type Bulkhead struct {
    sem chan struct{}
}

func NewBulkhead(maxConcurrent int) *Bulkhead {
    return &Bulkhead{
        sem: make(chan struct{}, maxConcurrent),
    }
}

func (b *Bulkhead) Execute(fn func() error) error {
    select {
    case b.sem <- struct{}{}: // acquire a slot
        defer func() { <-b.sem }()
        return fn()
    default:
        // All slots busy: reject immediately rather than queue
        return ErrBulkheadFull
    }
}

// Usage: Different bulkheads for different services
var (
    paymentBulkhead   = NewBulkhead(10)
    inventoryBulkhead = NewBulkhead(20)
)

Consistency Patterns

Saga Pattern

saga_pattern:
  purpose: Distributed transactions across services
  approach: Sequence of local transactions with compensations

  example:
    create_order:
      - Reserve inventory (compensate: release)
      - Charge payment (compensate: refund)
      - Create shipment (compensate: cancel)
      - Confirm order

  types:
    choreography: Events trigger next step (sketch after the orchestration code below)
    orchestration: Central coordinator manages flow

// Orchestration-based saga
type OrderSaga struct {
    inventoryService InventoryService
    paymentService   PaymentService
    shippingService  ShippingService
}

func (s *OrderSaga) Execute(order Order) error {
    // Step 1: Reserve inventory
    reservationID, err := s.inventoryService.Reserve(order.Items)
    if err != nil {
        return fmt.Errorf("inventory reservation failed: %w", err)
    }

    // Step 2: Charge payment
    paymentID, err := s.paymentService.Charge(order.CustomerID, order.Total)
    if err != nil {
        // Compensate step 1. Compensation failures must be retried
        // or queued, never silently dropped.
        s.inventoryService.Release(reservationID)
        return fmt.Errorf("payment failed: %w", err)
    }

    // Step 3: Create shipment
    if _, err := s.shippingService.Create(order); err != nil {
        // Compensate steps 2 and 1, in reverse order
        s.paymentService.Refund(paymentID)
        s.inventoryService.Release(reservationID)
        return fmt.Errorf("shipment creation failed: %w", err)
    }

    // Step 4: Confirm order
    return nil
}
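
The code above is orchestration. A choreography sketch, where each service reacts to the previous event and no coordinator exists; the EventBus API, OrderPlaced type, and event names here are illustrative, not from the original:

// Choreography: each service subscribes to the previous step's
// event and emits its own. Payment would listen for
// "inventory.reserved", and inventory would release its
// reservation on "payments.failed".
func wireInventoryStep(bus EventBus, inv InventoryService) {
    bus.Subscribe("orders.created", func(e OrderPlaced) {
        if err := inv.Reserve(e.Items); err != nil {
            bus.Publish("inventory.reservation_failed", e.OrderID)
            return
        }
        bus.Publish("inventory.reserved", e.OrderID)
    })
}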

Outbox Pattern

outbox_pattern:
  purpose: Reliable event publishing with database transactions
  problem: |
    Updating DB and publishing event must both succeed.
    If event publish fails after DB commit, data is inconsistent.

  solution:
    - Write event to outbox table in same transaction
    - Background process publishes rows, then marks them published (or deletes them)
    - Guaranteed at-least-once delivery

-- Outbox table
CREATE TABLE outbox (
    id SERIAL PRIMARY KEY,
    aggregate_type VARCHAR(255),
    aggregate_id VARCHAR(255),
    event_type VARCHAR(255),
    payload JSONB,
    created_at TIMESTAMP DEFAULT NOW(),
    published_at TIMESTAMP NULL
);

-- In transaction
BEGIN;
UPDATE orders SET status = 'confirmed' WHERE id = $1;
INSERT INTO outbox (aggregate_type, aggregate_id, event_type, payload)
VALUES ('Order', $1, 'OrderConfirmed', $2);
COMMIT;

// Outbox processor
func (p *OutboxProcessor) Process(ctx context.Context) {
    ticker := time.NewTicker(100 * time.Millisecond)
    defer ticker.Stop()

    for {
        select {
        case <-ticker.C:
            events, err := p.repo.GetUnpublished(ctx, 100)
            if err != nil {
                continue // transient DB error; retry next tick
            }

            for _, event := range events {
                if err := p.publisher.Publish(ctx, event); err != nil {
                    continue // still unpublished; retried next tick
                }
                // If MarkPublished fails, the event is published
                // again next tick: at-least-once, not exactly-once
                p.repo.MarkPublished(ctx, event.ID)
            }

        case <-ctx.Done():
            return
        }
    }
}

Idempotency

Idempotent Operations

idempotency:
  why: Messages may be delivered multiple times
  principle: Same request produces same result

  techniques:
    idempotency_keys: Client provides unique key
    natural_idempotency: Operation is inherently idempotent
    deduplication: Track processed requests

// Idempotency with client-provided key
func (s *PaymentService) ProcessPayment(ctx context.Context, req PaymentRequest) (*PaymentResult, error) {
    // Check if already processed
    existing, err := s.repo.GetByIdempotencyKey(ctx, req.IdempotencyKey)
    if err == nil && existing != nil {
        return existing, nil // return the cached result
    }

    // Process payment. Note: this check-then-act is racy under
    // concurrency; real systems also enforce a unique constraint
    // on the idempotency key at the database level.
    result, err := s.processNewPayment(ctx, req)
    if err != nil {
        return nil, err
    }

    // Store result with idempotency key
    result.IdempotencyKey = req.IdempotencyKey
    if err := s.repo.Save(ctx, result); err != nil {
        return nil, err
    }

    return result, nil
}
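
For contrast, a sketch of the natural-idempotency technique from the list above: writes that set absolute state survive redelivery, while delta updates do not. The repository, db handle, and SQL here are illustrative:

// Naturally idempotent: processing the same message twice
// leaves the same final state.
func (r *Repo) SetOrderStatus(ctx context.Context, orderID, status string) error {
    _, err := r.db.ExecContext(ctx,
        "UPDATE orders SET status = $1 WHERE id = $2", status, orderID)
    return err
}

// NOT idempotent: a redelivered message double-applies the delta,
// so this one needs an idempotency key or deduplication.
func (r *Repo) AddToBalance(ctx context.Context, accountID string, delta int64) error {
    _, err := r.db.ExecContext(ctx,
        "UPDATE accounts SET balance = balance + $1 WHERE id = $2", delta, accountID)
    return err
}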

Communication Patterns

Event-Driven

event_driven:
  benefits:
    - Loose coupling
    - Temporal decoupling
    - Easy to add consumers

  patterns:
    event_notification: Minimal data, consumer fetches more
    event_carried_state: Full data in event
    event_sourcing: Events are source of truth

// Event publisher
type OrderCreatedEvent struct {
    OrderID    string    `json:"orderId"`
    CustomerID string    `json:"customerId"`
    Total      float64   `json:"total"` // float for brevity; prefer integer cents for money
    CreatedAt  time.Time `json:"createdAt"`
}

func (s *OrderService) CreateOrder(ctx context.Context, order Order) error {
    // Save order
    if err := s.repo.Save(ctx, &order); err != nil {
        return err
    }

    // Publish event. Note: save and publish are not atomic here;
    // if the publish fails, the event is lost. The outbox pattern
    // above closes exactly that gap.
    event := OrderCreatedEvent{
        OrderID:    order.ID,
        CustomerID: order.CustomerID,
        Total:      order.Total,
        CreatedAt:  time.Now(),
    }

    return s.eventBus.Publish(ctx, "orders.created", event)
}
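
The event above carries full state (event-carried state transfer). A notification-style variant from the patterns list would ship only an ID, and consumers call back for the details they need (illustrative):

// Event notification: minimal payload; consumers fetch the rest
type OrderCreatedNotification struct {
    OrderID string `json:"orderId"`
}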

Request-Reply

// Synchronous request-reply over messaging (NATS)
func (c *Client) RequestReply(ctx context.Context, request []byte) ([]byte, error) {
    replyTo := fmt.Sprintf("replies.%s", uuid.New().String())

    // Subscribe to a unique reply subject
    replies := make(chan []byte, 1)
    sub, err := c.nats.Subscribe(replyTo, func(msg *nats.Msg) {
        replies <- msg.Data
    })
    if err != nil {
        return nil, err
    }
    defer sub.Unsubscribe()

    // Send request with the reply-to subject
    if err := c.nats.PublishRequest("service.request", replyTo, request); err != nil {
        return nil, err
    }

    // Wait for reply or timeout
    select {
    case reply := <-replies:
        return reply, nil
    case <-ctx.Done():
        return nil, ctx.Err()
    }
}
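
The caller bounds the wait through the context; a usage sketch (GetQuote and the deadline are illustrative):

// Usage: put a deadline on the round trip
func (c *Client) GetQuote(parent context.Context, payload []byte) ([]byte, error) {
    ctx, cancel := context.WithTimeout(parent, 2*time.Second)
    defer cancel()
    return c.RequestReply(ctx, payload)
}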

Scaling Patterns

Sharding

// Consistent hashing for sharding
type ConsistentHash struct {
    ring   map[uint32]string // hash point -> node
    keys   []uint32          // sorted hash points
    nodes  map[string]bool
    vnodes int // virtual nodes per physical node
}

func (c *ConsistentHash) hash(key string) uint32 {
    return crc32.ChecksumIEEE([]byte(key))
}

func (c *ConsistentHash) GetNode(key string) string {
    if len(c.keys) == 0 {
        return ""
    }
    // Find the first ring position clockwise from the key's hash
    hash := c.hash(key)
    idx := sort.Search(len(c.keys), func(i int) bool {
        return c.keys[i] >= hash
    })
    if idx == len(c.keys) {
        idx = 0 // wrap around the ring
    }
    return c.ring[c.keys[idx]]
}

// Usage
func (s *Service) GetShard(userID string) *Shard {
    node := s.consistentHash.GetNode(userID)
    return s.shards[node]
}
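
The vnodes field implies virtual nodes: each physical node is hashed onto the ring many times, which evens out key distribution and means only about 1/N of keys move when a node joins or leaves. A sketch of registration (AddNode is an assumed helper, not shown in the original):

// Place vnodes points on the ring for each physical node
func (c *ConsistentHash) AddNode(node string) {
    c.nodes[node] = true
    for i := 0; i < c.vnodes; i++ {
        h := c.hash(fmt.Sprintf("%s#%d", node, i))
        c.ring[h] = node
        c.keys = append(c.keys, h)
    }
    // Keep positions sorted so GetNode's binary search works
    sort.Slice(c.keys, func(i, j int) bool { return c.keys[i] < c.keys[j] })
}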

Leader Election

// Leader election with etcd
func (s *Service) ElectLeader(ctx context.Context) error {
    session, err := concurrency.NewSession(s.etcdClient)
    if err != nil {
        return err
    }
    defer session.Close()

    election := concurrency.NewElection(session, "/service/leader")

    // Campaign blocks until we become leader (or ctx is cancelled)
    if err := election.Campaign(ctx, s.nodeID); err != nil {
        return err
    }

    // We are now leader. (If isLeader is read from other goroutines,
    // guard it with a mutex or atomic.)
    s.isLeader = true
    defer func() {
        s.isLeader = false
        // Use a fresh context: if ctx is already cancelled, Resign
        // fails and the key only expires with the session TTL.
        election.Resign(context.Background())
    }()

    // Do leader work until context cancelled
    return s.runAsLeader(ctx)
}
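
Campaign blocks until leadership is acquired, so followers simply wait in line. A usage sketch that re-campaigns after losing leadership (the retry loop and backoff are illustrative):

// Run forever: wait as follower, lead when elected, retry on error
func (s *Service) Run(ctx context.Context) {
    for ctx.Err() == nil {
        if err := s.ElectLeader(ctx); err != nil && ctx.Err() == nil {
            time.Sleep(time.Second) // brief backoff before re-campaigning
        }
    }
}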

Key Takeaways

These patterns exist because distributed systems fail in unexpected ways. Use them.