Distributed systems introduce complexity that doesn’t exist in monoliths: network failures, partial failures, ordering issues, and consistency challenges. The patterns below are battle-tested in production; they are the ones that matter.
Handling Failure
Circuit Breaker
circuit_breaker:
  purpose: Prevent cascade failures
  states:
    closed: Normal operation, requests pass through
    open: Failures exceeded threshold, requests fail fast
    half_open: Test if service recovered
  benefits:
    - Fail fast instead of waiting
    - Give failing service time to recover
    - Prevent resource exhaustion
import (
	"errors"
	"sync"
	"time"
)

type State int

const (
	Closed State = iota
	Open
	HalfOpen
)

var ErrCircuitOpen = errors.New("circuit breaker open")

type CircuitBreaker struct {
	failures    int
	threshold   int
	state       State
	lastFailure time.Time
	timeout     time.Duration
	mu          sync.Mutex
}

func (cb *CircuitBreaker) Execute(fn func() error) error {
	cb.mu.Lock()
	switch cb.state {
	case Open:
		if time.Since(cb.lastFailure) > cb.timeout {
			cb.state = HalfOpen // timeout elapsed: let a probe request through
		} else {
			cb.mu.Unlock()
			return ErrCircuitOpen
		}
	case HalfOpen:
		// Allow the probe through; its outcome decides the next state.
	}
	// Release the lock while fn runs so a slow remote call doesn't block
	// every other caller. (This simple version may admit several probes
	// concurrently while half-open.)
	cb.mu.Unlock()

	err := fn()

	cb.mu.Lock()
	defer cb.mu.Unlock()
	if err != nil {
		cb.failures++
		cb.lastFailure = time.Now()
		if cb.failures >= cb.threshold {
			cb.state = Open
		}
		return err
	}
	// Success closes the circuit and resets the failure count.
	cb.failures = 0
	cb.state = Closed
	return nil
}
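A usage sketch: wrap each remote call and fall back when the breaker is open. The inventory client and its method are hypothetical.

cb := &CircuitBreaker{threshold: 5, timeout: 30 * time.Second}

err := cb.Execute(func() error {
	return inventoryClient.CheckStock(ctx, sku) // hypothetical remote call
})
if errors.Is(err, ErrCircuitOpen) {
	// Fail fast: serve a cached value instead of waiting on a dead service.
}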
Retry with Exponential Backoff
import (
	"errors"
	"fmt"
	"math"
	"math/rand"
	"time"
)

var (
	ErrTimeout            = errors.New("timeout")
	ErrServiceUnavailable = errors.New("service unavailable")
)

func RetryWithBackoff(fn func() error, maxRetries int) error {
	var err error
	for i := 0; i < maxRetries; i++ {
		err = fn()
		if err == nil {
			return nil
		}
		if !isRetryable(err) {
			return err
		}
		// Exponential backoff with jitter: 100ms base, doubling each attempt,
		// plus up to 50% random padding so synchronized clients spread out.
		backoff := time.Duration(math.Pow(2, float64(i))) * 100 * time.Millisecond
		jitter := time.Duration(rand.Int63n(int64(backoff / 2)))
		time.Sleep(backoff + jitter)
	}
	return fmt.Errorf("max retries exceeded: %w", err)
}

func isRetryable(err error) bool {
	// Retry only transient errors; never retry validation or auth failures.
	return errors.Is(err, ErrTimeout) ||
		errors.Is(err, ErrServiceUnavailable)
}
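A usage sketch (the client call is hypothetical). With maxRetries = 5 the base delays are 100, 200, 400, 800, and 1600 ms, each padded with up to 50% jitter; in production you would also cap the backoff so late attempts don't sleep for minutes.

err := RetryWithBackoff(func() error {
	return warehouseClient.Sync(ctx) // hypothetical call that may time out
}, 5)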
Bulkhead
// Isolate failures by limiting concurrent operations
var ErrBulkheadFull = errors.New("bulkhead full")

type Bulkhead struct {
	sem chan struct{}
}

func NewBulkhead(maxConcurrent int) *Bulkhead {
	return &Bulkhead{
		sem: make(chan struct{}, maxConcurrent),
	}
}

func (b *Bulkhead) Execute(fn func() error) error {
	select {
	case b.sem <- struct{}{}:
		defer func() { <-b.sem }()
		return fn()
	default:
		// All slots taken: reject immediately instead of queueing.
		return ErrBulkheadFull
	}
}

// Usage: Different bulkheads for different services
var (
	paymentBulkhead   = NewBulkhead(10)
	inventoryBulkhead = NewBulkhead(20)
)
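Calling through a bulkhead looks like the circuit breaker above; a slow payment provider can now tie up at most 10 goroutines, leaving inventory traffic untouched. The payment client call is hypothetical.

err := paymentBulkhead.Execute(func() error {
	return paymentClient.Charge(ctx, order.CustomerID, order.Total) // hypothetical
})
if errors.Is(err, ErrBulkheadFull) {
	// Shed load: tell the caller to retry later rather than queue up.
}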
Consistency Patterns
Saga Pattern
saga_pattern:
  purpose: Distributed transactions across services
  approach: Sequence of local transactions with compensations
  example:
    create_order:
      - Reserve inventory (compensate: release)
      - Charge payment (compensate: refund)
      - Create shipment (compensate: cancel)
      - Confirm order
  types:
    choreography: Events trigger next step (sketched after the orchestrator below)
    orchestration: Central coordinator manages flow
// Orchestration-based saga
type OrderSaga struct {
	inventoryService InventoryService
	paymentService   PaymentService
	shippingService  ShippingService
}

func (s *OrderSaga) Execute(order Order) error {
	// Step 1: Reserve inventory
	reservationID, err := s.inventoryService.Reserve(order.Items)
	if err != nil {
		return fmt.Errorf("inventory reservation failed: %w", err)
	}
	// Step 2: Charge payment
	paymentID, err := s.paymentService.Charge(order.CustomerID, order.Total)
	if err != nil {
		// Compensate step 1. Compensations must themselves be durable and
		// retried in production; if Release fails here, the reservation leaks.
		s.inventoryService.Release(reservationID)
		return fmt.Errorf("payment failed: %w", err)
	}
	// Step 3: Create shipment
	if _, err := s.shippingService.Create(order); err != nil {
		// Compensate steps 2 and 1, in reverse order.
		s.paymentService.Refund(paymentID)
		s.inventoryService.Release(reservationID)
		return fmt.Errorf("shipment creation failed: %w", err)
	}
	// Step 4 (confirm order) would follow here.
	return nil
}
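For contrast, a minimal sketch of the choreography variant: there is no coordinator, each service reacts to the previous step's event. The Bus interface and wiring function are illustrative assumptions, not a specific library.

type Bus interface {
	Publish(topic string, event Order)
	Subscribe(topic string, handler func(Order))
}

// Each service wires itself to the events it cares about.
func WireInventory(bus Bus, inv InventoryService) {
	bus.Subscribe("order.created", func(order Order) {
		if _, err := inv.Reserve(order.Items); err != nil {
			// Failure is also an event; upstream services compensate on it.
			bus.Publish("order.failed", order)
			return
		}
		bus.Publish("inventory.reserved", order) // the payment service listens next
	})
}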
Outbox Pattern
outbox_pattern:
  purpose: Reliable event publishing with database transactions
  problem: |
    Updating DB and publishing event must both succeed.
    If event publish fails after DB commit, data is inconsistent.
  solution:
    - Write event to outbox table in same transaction
    - Background process publishes each row, then marks it published (or deletes it)
    - Guaranteed at-least-once delivery
-- Outbox table
CREATE TABLE outbox (
    id             SERIAL PRIMARY KEY,
    aggregate_type VARCHAR(255),
    aggregate_id   VARCHAR(255),
    event_type     VARCHAR(255),
    payload        JSONB,
    created_at     TIMESTAMP DEFAULT NOW(),
    published_at   TIMESTAMP NULL
);

-- In transaction
BEGIN;
UPDATE orders SET status = 'confirmed' WHERE id = $1;
INSERT INTO outbox (aggregate_type, aggregate_id, event_type, payload)
VALUES ('Order', $1, 'OrderConfirmed', $2);
COMMIT;
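The same transaction from application code; a minimal sketch using database/sql, assuming an OrderRepo that wraps a *sql.DB (both names are illustrative).

func (r *OrderRepo) ConfirmOrder(ctx context.Context, orderID string, payload []byte) error {
	tx, err := r.db.BeginTx(ctx, nil)
	if err != nil {
		return err
	}
	defer tx.Rollback() // no-op once Commit succeeds
	if _, err := tx.ExecContext(ctx,
		`UPDATE orders SET status = 'confirmed' WHERE id = $1`, orderID); err != nil {
		return err
	}
	// The event rides in the same transaction: both commit or neither does.
	if _, err := tx.ExecContext(ctx,
		`INSERT INTO outbox (aggregate_type, aggregate_id, event_type, payload)
		 VALUES ('Order', $1, 'OrderConfirmed', $2)`, orderID, payload); err != nil {
		return err
	}
	return tx.Commit()
}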
// Outbox processor: polls for unpublished rows and pushes them to the broker.
// Delivery is at-least-once; consumers must deduplicate (see Idempotency below).
func (p *OutboxProcessor) Process(ctx context.Context) {
	ticker := time.NewTicker(100 * time.Millisecond)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			events, err := p.repo.GetUnpublished(ctx, 100)
			if err != nil {
				continue // transient read failure; retry on the next tick
			}
			for _, event := range events {
				if err := p.publisher.Publish(ctx, event); err != nil {
					continue // stays unpublished; picked up again next tick
				}
				p.repo.MarkPublished(ctx, event.ID)
			}
		case <-ctx.Done():
			return
		}
	}
}
Idempotency
Idempotent Operations
idempotency:
  why: Messages may be delivered multiple times
  principle: Same request produces same result
  techniques:
    idempotency_keys: Client provides unique key
    natural_idempotency: Operation is inherently idempotent (example below)
    deduplication: Track processed requests
// Idempotency with client-provided key
func (s *PaymentService) ProcessPayment(ctx context.Context, req PaymentRequest) (*PaymentResult, error) {
	// Check if already processed.
	existing, err := s.repo.GetByIdempotencyKey(ctx, req.IdempotencyKey)
	if err != nil {
		return nil, err // don't risk double-charging on a lookup failure
	}
	if existing != nil {
		return existing, nil // duplicate request: return the cached result
	}
	// Process payment.
	result, err := s.processNewPayment(ctx, req)
	if err != nil {
		return nil, err
	}
	// Store the result under the idempotency key. A unique constraint on the
	// key closes the race between two concurrent requests with the same key.
	result.IdempotencyKey = req.IdempotencyKey
	if err := s.repo.Save(ctx, result); err != nil {
		return nil, err
	}
	return result, nil
}
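The natural_idempotency technique needs no bookkeeping at all: prefer absolute writes over relative ones. The repository methods below are hypothetical.

// Idempotent: an absolute write yields the same state however often it's replayed.
repo.SetQuantity(ctx, itemID, 5)

// Not idempotent: a relative update double-counts on redelivery.
repo.AddQuantity(ctx, itemID, 5)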
Communication Patterns
Event-Driven
event_driven:
  benefits:
    - Loose coupling
    - Temporal decoupling
    - Easy to add consumers
  patterns:
    event_notification: Minimal data, consumer fetches more (contrasted below)
    event_carried_state: Full data in event
    event_sourcing: Events are source of truth
// Event publisher (event_carried_state: the event carries the full payload)
type OrderCreatedEvent struct {
	OrderID    string    `json:"orderId"`
	CustomerID string    `json:"customerId"`
	Total      float64   `json:"total"`
	CreatedAt  time.Time `json:"createdAt"`
}

func (s *OrderService) CreateOrder(ctx context.Context, order Order) error {
	// Save order.
	if err := s.repo.Save(ctx, &order); err != nil {
		return err
	}
	// Publish event. Note the dual write: if Publish fails after Save commits,
	// the event is lost. That is exactly the gap the outbox pattern closes.
	event := OrderCreatedEvent{
		OrderID:    order.ID,
		CustomerID: order.CustomerID,
		Total:      order.Total,
		CreatedAt:  time.Now(),
	}
	return s.eventBus.Publish(ctx, "orders.created", event)
}
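For contrast, the event_notification variant carries only an identifier and lets each consumer fetch what it needs; a sketch assuming the consumer holds a client for the order service (all names are illustrative).

type OrderCreatedNotification struct {
	OrderID string `json:"orderId"`
}

func (c *EmailConsumer) OnOrderCreated(ctx context.Context, n OrderCreatedNotification) error {
	// Thin event: fetch current state instead of trusting a possibly stale payload.
	order, err := c.orders.GetOrder(ctx, n.OrderID)
	if err != nil {
		return err
	}
	return c.mailer.SendConfirmation(ctx, order.CustomerID, order.ID)
}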
Request-Reply
// Synchronous request-reply over messaging (NATS)
func (c *Client) RequestReply(ctx context.Context, request []byte) ([]byte, error) {
	replyTo := fmt.Sprintf("replies.%s", uuid.New().String())
	// Subscribe to a unique reply subject before sending the request.
	replies := make(chan []byte, 1)
	sub, err := c.nats.Subscribe(replyTo, func(msg *nats.Msg) {
		replies <- msg.Data
	})
	if err != nil {
		return nil, err
	}
	defer sub.Unsubscribe()
	// Send the request with the reply-to subject.
	if err := c.nats.PublishRequest("service.request", replyTo, request); err != nil {
		return nil, err
	}
	// Wait for the reply, or time out via the context.
	select {
	case reply := <-replies:
		return reply, nil
	case <-ctx.Done():
		return nil, ctx.Err()
	}
}
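Worth noting: the NATS client ships this flow as a built-in helper (nats.Conn.Request, taking a subject, payload, and timeout), so hand-rolling the reply subject is mainly useful when you need custom routing or a broker without that primitive.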
Scaling Patterns
Sharding
// Consistent hashing for sharding: each key maps to the first node clockwise
// on the hash ring, so membership changes move only ~1/N of the keys.
type ConsistentHash struct {
	ring   map[uint32]string // ring position -> node
	keys   []uint32          // sorted ring positions
	nodes  map[string]bool
	vnodes int // virtual nodes per physical node, for an even spread
}

// hash maps a key to a ring position; CRC-32 (hash/crc32) is one simple choice.
func (c *ConsistentHash) hash(key string) uint32 {
	return crc32.ChecksumIEEE([]byte(key))
}

func (c *ConsistentHash) GetNode(key string) string {
	hash := c.hash(key)
	// Binary search for the first ring position at or after the key's hash.
	idx := sort.Search(len(c.keys), func(i int) bool {
		return c.keys[i] >= hash
	})
	if idx == len(c.keys) {
		idx = 0 // wrap around the ring
	}
	return c.ring[c.keys[idx]]
}

// Usage
func (s *Service) GetShard(userID string) *Shard {
	node := s.consistentHash.GetNode(userID)
	return s.shards[node]
}
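A sketch of node registration, showing where vnodes comes in; this method is an illustrative assumption to round out the type above.

func (c *ConsistentHash) AddNode(node string) {
	c.nodes[node] = true
	// Place vnodes points per node on the ring; more points, smoother spread.
	for i := 0; i < c.vnodes; i++ {
		h := c.hash(fmt.Sprintf("%s#%d", node, i))
		c.ring[h] = node
		c.keys = append(c.keys, h)
	}
	sort.Slice(c.keys, func(i, j int) bool { return c.keys[i] < c.keys[j] })
}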
Leader Election
// Leader election with etcd
func (s *Service) ElectLeader(ctx context.Context) error {
	// The session is a lease: if this node dies, the lease expires and
	// leadership passes to another campaigner automatically.
	session, err := concurrency.NewSession(s.etcdClient)
	if err != nil {
		return err
	}
	defer session.Close()
	election := concurrency.NewElection(session, "/service/leader")
	// Campaign blocks until we become leader (or ctx is cancelled).
	if err := election.Campaign(ctx, s.nodeID); err != nil {
		return err
	}
	// We are now leader.
	s.isLeader = true
	defer func() {
		s.isLeader = false
		election.Resign(ctx)
	}()
	// Do leader work until the context is cancelled.
	return s.runAsLeader(ctx)
}
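A usage sketch: run the election in a loop so the node campaigns again after losing leadership, for example when a network partition expires its etcd lease.

for ctx.Err() == nil {
	if err := s.ElectLeader(ctx); err != nil {
		time.Sleep(time.Second) // brief pause before campaigning again
	}
}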
Key Takeaways
- Circuit breakers prevent cascade failures
- Retry with exponential backoff and jitter prevents thundering herds
- Bulkheads isolate failures between components
- Sagas handle distributed transactions with compensations
- Outbox pattern ensures reliable event publishing
- Always design for idempotency; messages will be duplicated
- Event-driven architecture enables loose coupling
- Consistent hashing distributes load evenly
- Leader election coordinates singleton responsibilities
- Assume failure at every network boundary
These patterns exist because distributed systems fail in unexpected ways. Use them.