OpenTelemetry Adoption: Unified Observability

November 15, 2021

Observability instrumentation has been fragmented: different libraries for tracing (Jaeger, Zipkin), metrics (Prometheus, StatsD), and logging. OpenTelemetry unifies this with a single standard for collecting telemetry data. Adoption is accelerating.

Here’s how to adopt OpenTelemetry effectively.

Why OpenTelemetry?

The Fragmentation Problem

before_opentelemetry:
  tracing:
    - OpenTracing (deprecated)
    - OpenCensus (merged into OTel)
    - Jaeger client
    - Zipkin client
    - Vendor-specific (Datadog, New Relic)

  metrics:
    - Prometheus client
    - StatsD
    - Micrometer
    - Vendor-specific

  logging:
    - Language-specific (log4j, logrus, zap)
    - No correlation with traces

  problems:
    - Vendor lock-in
    - Different instrumentation per backend
    - Difficult to switch vendors
    - No unified context

OpenTelemetry Solution

opentelemetry_approach:
  unified_api:
    - Single API for traces, metrics, logs
    - Language-specific SDKs
    - Consistent across languages

  vendor_neutral:
    - Export to any backend
    - Switch backends without code changes
    - Multi-backend support

  semantic_conventions:
    - Standardized attribute names
    - Consistent across services
    - Better correlation

  context_propagation:
    - Automatic context passing
    - Trace ID in logs
    - Correlated telemetry

Core Components

Architecture

┌─────────────────────────────────────────────────────────────────┐
│                        Application                               │
├─────────────────────────────────────────────────────────────────┤
│                    OpenTelemetry SDK                             │
│  ┌─────────────┐  ┌─────────────┐  ┌─────────────┐              │
│  │   Tracer    │  │   Meter     │  │   Logger    │              │
│  └─────────────┘  └─────────────┘  └─────────────┘              │
├─────────────────────────────────────────────────────────────────┤
│                    Exporters                                     │
│  ┌─────────────┐  ┌─────────────┐  ┌─────────────┐              │
│  │    OTLP     │  │  Prometheus │  │   Jaeger    │              │
│  └─────────────┘  └─────────────┘  └─────────────┘              │
└───────────────────────────┬─────────────────────────────────────┘
                            │
                            ▼
┌─────────────────────────────────────────────────────────────────┐
│                  OpenTelemetry Collector                         │
│  ┌─────────────┐  ┌─────────────┐  ┌─────────────┐              │
│  │  Receivers  │──│  Processors │──│  Exporters  │              │
│  └─────────────┘  └─────────────┘  └─────────────┘              │
└───────────────────────────┬─────────────────────────────────────┘
                            │
        ┌───────────────────┼───────────────────┐
        ▼                   ▼                   ▼
   ┌─────────┐        ┌──────────┐       ┌─────────┐
   │ Jaeger  │        │Prometheus│       │ Vendor  │
   └─────────┘        └──────────┘       └─────────┘

Instrumentation

// Go instrumentation example
import (
    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/trace"
    "go.opentelemetry.io/otel/metric"
)

var tracer = otel.Tracer("order-service")
var meter = otel.Meter("order-service")

// Metrics
var (
    ordersCounter, _ = meter.Int64Counter(
        "orders.created",
        metric.WithDescription("Number of orders created"),
    )

    orderDuration, _ = meter.Float64Histogram(
        "orders.duration",
        metric.WithDescription("Order processing duration"),
        metric.WithUnit("ms"),
    )
)

// CreateOrder validates and processes an order, emitting one span plus
// counter and histogram metrics. Duration is recorded for every outcome
// so latency percentiles include failed requests, not just successes.
func CreateOrder(ctx context.Context, order Order) error {
    // Start span; attribute names follow OTel semantic-convention style
    // (lowercase, dot-separated).
    ctx, span := tracer.Start(ctx, "CreateOrder",
        trace.WithAttributes(
            attribute.String("customer.id", order.CustomerID),
            attribute.Float64("order.total", order.Total),
        ),
    )
    defer span.End()

    start := time.Now()
    // Record duration on all return paths — the original success-only
    // recording would hide failed-request latency from the histogram.
    defer func() {
        orderDuration.Record(ctx, float64(time.Since(start).Milliseconds()))
    }()

    // Business logic
    if err := validateOrder(ctx, order); err != nil {
        // Attach the error to the span and mark it failed so backends
        // surface it in error-rate views.
        span.RecordError(err)
        span.SetStatus(codes.Error, "validation failed")
        return err
    }

    // ... more processing

    // Record metrics
    ordersCounter.Add(ctx, 1,
        metric.WithAttributes(
            attribute.String("status", "success"),
        ),
    )

    span.SetStatus(codes.Ok, "")
    return nil
}

SDK Setup

// Initialize OpenTelemetry
// initTelemetry configures the global tracer and meter providers to
// export OTLP over gRPC to the local collector, and installs W3C
// context propagation. It returns a cleanup function that flushes and
// shuts down both providers.
func initTelemetry() (func(), error) {
    ctx := context.Background()

    // Resource describes the service; these attributes are attached to
    // every span and metric this process exports.
    res, err := resource.New(ctx,
        resource.WithAttributes(
            semconv.ServiceName("order-service"),
            semconv.ServiceVersion("1.0.0"),
            semconv.DeploymentEnvironment("production"),
        ),
    )
    if err != nil {
        return nil, err
    }

    // Trace exporter: plaintext gRPC to the in-cluster collector.
    traceExporter, err := otlptracegrpc.New(ctx,
        otlptracegrpc.WithEndpoint("otel-collector:4317"),
        otlptracegrpc.WithInsecure(),
    )
    if err != nil {
        return nil, err
    }

    // Trace provider: batch spans for export; sample 10% of new traces
    // but always follow the parent's sampling decision for consistency.
    tp := trace.NewTracerProvider(
        trace.WithBatcher(traceExporter),
        trace.WithResource(res),
        trace.WithSampler(trace.ParentBased(trace.TraceIDRatioBased(0.1))),
    )
    otel.SetTracerProvider(tp)

    // Metric exporter
    metricExporter, err := otlpmetricgrpc.New(ctx,
        otlpmetricgrpc.WithEndpoint("otel-collector:4317"),
        otlpmetricgrpc.WithInsecure(),
    )
    if err != nil {
        return nil, err
    }

    // Meter provider: periodic reader pushes metrics on an interval.
    mp := metric.NewMeterProvider(
        metric.WithReader(metric.NewPeriodicReader(metricExporter)),
        metric.WithResource(res),
    )
    otel.SetMeterProvider(mp)

    // Propagator for distributed tracing: W3C Trace Context + Baggage.
    otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
        propagation.TraceContext{},
        propagation.Baggage{},
    ))

    // Cleanup flushes buffered telemetry. Use a bounded context so a
    // hung collector cannot stall process shutdown, and surface any
    // shutdown errors through the global OTel error handler instead of
    // dropping them silently.
    return func() {
        shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
        defer cancel()
        if err := tp.Shutdown(shutdownCtx); err != nil {
            otel.Handle(err)
        }
        if err := mp.Shutdown(shutdownCtx); err != nil {
            otel.Handle(err)
        }
    }, nil
}

Collector

Configuration

# otel-collector-config.yaml
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

  # Scrape Prometheus-format metrics from annotated pods so legacy
  # instrumentation flows through the same pipeline.
  prometheus:
    config:
      scrape_configs:
        - job_name: 'kubernetes-pods'
          kubernetes_sd_configs:
            - role: pod

processors:
  # Group telemetry into batches before export (reduces outbound calls).
  batch:
    timeout: 1s
    send_batch_size: 1024

  # Refuse data before the collector OOMs; must run first in every pipeline.
  memory_limiter:
    check_interval: 1s
    limit_mib: 1000
    spike_limit_mib: 200

  # Stamp the deployment environment onto all telemetry.
  resource:
    attributes:
      - key: environment
        value: production
        action: upsert

  # Drop noisy health-check spans entirely.
  filter:
    spans:
      exclude:
        match_type: strict
        attributes:
          - key: http.target
            value: /health

exporters:
  jaeger:
    endpoint: jaeger:14250
    tls:
      insecure: true

  prometheus:
    endpoint: 0.0.0.0:8889

  otlp:
    endpoint: vendor-endpoint:443

service:
  pipelines:
    # Processor order matters: memory_limiter first, then drop/enrich,
    # and batch last so only data that will actually be exported is batched.
    traces:
      receivers: [otlp]
      processors: [memory_limiter, filter, resource, batch]
      exporters: [jaeger, otlp]

    metrics:
      receivers: [otlp, prometheus]
      processors: [memory_limiter, resource, batch]
      exporters: [prometheus, otlp]

Deployment

# Kubernetes deployment
apiVersion: apps/v1
kind: Deployment
metadata:
  name: otel-collector
spec:
  replicas: 2
  # selector is REQUIRED for apps/v1 Deployments and must match the
  # pod template labels — without it the API server rejects the object.
  selector:
    matchLabels:
      app: otel-collector
  template:
    metadata:
      labels:
        app: otel-collector
    spec:
      containers:
        - name: collector
          # Pin the image version — ":latest" makes rollbacks and
          # reproducible deployments impossible.
          image: otel/opentelemetry-collector-contrib:0.38.0
          args:
            - --config=/etc/otel/config.yaml
          ports:
            - containerPort: 4317  # OTLP gRPC
            - containerPort: 4318  # OTLP HTTP
            - containerPort: 8889  # Prometheus metrics
          volumeMounts:
            - name: config
              mountPath: /etc/otel
          resources:
            requests:
              cpu: 200m
              memory: 256Mi
            limits:
              cpu: 1000m
              memory: 1Gi
      volumes:
        - name: config
          configMap:
            name: otel-collector-config

Auto-Instrumentation

Java Agent

# Download agent (pin an explicit release version for reproducible builds)
curl -L -o opentelemetry-javaagent.jar \
  https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/download/v1.17.0/opentelemetry-javaagent.jar

# Run with agent — no code changes required; the agent instruments
# supported frameworks at class-load time and exports OTLP to the collector.
java -javaagent:opentelemetry-javaagent.jar \
  -Dotel.service.name=my-service \
  -Dotel.exporter.otlp.endpoint=http://collector:4317 \
  -jar my-app.jar

Python

# pip install opentelemetry-distro opentelemetry-exporter-otlp
# opentelemetry-bootstrap -a install

from flask import Flask

from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.flask import FlaskInstrumentor
from opentelemetry.instrumentation.requests import RequestsInstrumentor

# Setup: export spans to the collector in batches.
provider = TracerProvider()
processor = BatchSpanProcessor(OTLPSpanExporter(endpoint="http://collector:4317"))
provider.add_span_processor(processor)
trace.set_tracer_provider(provider)

# The app must exist before it is instrumented — the original snippet
# referenced `app` without ever creating it.
app = Flask(__name__)

# Auto-instrument: incoming Flask requests become server spans, outbound
# `requests` calls become client spans with propagated trace context.
FlaskInstrumentor().instrument_app(app)
RequestsInstrumentor().instrument()

# Manual instrumentation where needed
tracer = trace.get_tracer(__name__)

@app.route('/process')
def process():
    with tracer.start_as_current_span("process_request") as span:
        span.set_attribute("custom.key", "value")
        # ... business logic
        # A Flask view must return a response; returning None raises
        # a TypeError at request time.
        return "ok"

Migration Strategy

Phased Approach

migration_phases:
  phase_1_collector:
    duration: 2-4 weeks
    actions:
      - Deploy OTel Collector
      - Receive existing formats (Jaeger, Prometheus)
      - Export to existing backends
    benefit: Centralized telemetry pipeline

  phase_2_new_services:
    duration: Ongoing
    actions:
      - New services use OTel SDK
      - Auto-instrumentation where possible
      - Standard semantic conventions
    benefit: Future-proof instrumentation

  phase_3_migrate_existing:
    duration: 3-6 months
    actions:
      - Prioritize critical services
      - Replace vendor SDKs with OTel
      - Remove old instrumentation
    benefit: Unified instrumentation

  phase_4_optimize:
    duration: Ongoing
    actions:
      - Tune sampling
      - Optimize collector pipeline
      - Add custom instrumentation
    benefit: Production-ready observability

Key Takeaways

OpenTelemetry is rapidly becoming the industry standard for telemetry collection. The time to adopt is now: start with the Collector as a central pipeline, instrument new services with the OTel SDK, and migrate existing services incrementally.