Caching is often the difference between a system that scales and one that falls over. But caching is also a source of subtle bugs: stale data, cache stampedes, and inconsistency. Good caching requires understanding the patterns and their trade-offs.
Here’s how to cache effectively.
Cache Patterns
Cache-Aside (Lazy Loading)
cache_aside:
  pattern: Application manages the cache
  flow:
    read:
      1. Check cache
      2. If miss, read from database
      3. Store in cache
      4. Return data
    write:
      1. Update database
      2. Invalidate or update cache
import json

# `redis` and `db` are assumed to be preconfigured client objects
# (e.g. redis.Redis(decode_responses=True) and a thin database wrapper).

def get_user(user_id):
    # Check cache first
    cache_key = f"user:{user_id}"
    cached = redis.get(cache_key)
    if cached:
        return json.loads(cached)

    # Cache miss - load from database
    user = db.query("SELECT * FROM users WHERE id = %s", user_id)
    if user:
        redis.setex(cache_key, 3600, json.dumps(user))  # 1 hour TTL
    return user

def update_user(user_id, data):
    # Update database first
    db.execute("UPDATE users SET ... WHERE id = %s", data, user_id)
    # Then invalidate cache
    redis.delete(f"user:{user_id}")
Write-Through
write_through:
  pattern: Cache updated synchronously with database
  flow:
    write:
      1. Write to cache
      2. Cache writes to database
      3. Return to client
  pros:
    - Cache always consistent
    - Simplified read path
  cons:
    - Increased write latency
    - Cache may hold rarely-read data
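In a strict write-through setup the cache layer itself owns the database write; as a rough application-level sketch (the function name and column list are hypothetical, reusing the redis and db clients from the cache-aside example):

def write_through_update_user(user_id, user):
    # 1. Write to the cache first
    redis.setex(f"user:{user_id}", 3600, json.dumps(user))
    # 2. Synchronously persist to the database before acknowledging
    db.execute("UPDATE users SET name = %s, email = %s WHERE id = %s",
               user['name'], user['email'], user_id)
    # 3. Only now return to the caller - cache and database agree
    return user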
Write-Behind (Write-Back)
write_behind:
  pattern: Cache writes async to database
  flow:
    write:
      1. Write to cache
      2. Return to client immediately
      3. Async: Cache writes to database
  pros:
    - Fast writes
    - Batching possible
  cons:
    - Data loss risk
    - Complexity
    - Consistency challenges
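A write-behind sketch, illustrative rather than production-ready: writes land in the cache and an in-process queue, and a background worker drains the queue to the database in batches. The queue, flush interval, and column list are assumptions; redis, db, and json are as in the earlier examples.

import queue
import threading

write_queue = queue.Queue()

def write_behind_update_user(user_id, user):
    redis.setex(f"user:{user_id}", 3600, json.dumps(user))  # 1. Write to cache
    write_queue.put((user_id, user))                        # 2. Enqueue the write
    return user                                             #    and return immediately

def flush_worker(flush_interval=1.0):
    while True:
        try:
            # Block briefly for the first pending write
            batch = [write_queue.get(timeout=flush_interval)]
        except queue.Empty:
            continue
        # Drain anything else already queued (single consumer, so this is safe)
        while not write_queue.empty():
            batch.append(write_queue.get_nowait())
        for user_id, user in batch:
            # 3. Async: persist the batch to the database
            db.execute("UPDATE users SET name = %s WHERE id = %s",
                       user['name'], user_id)

# One background flusher per process; anything still queued at crash time is lost,
# which is exactly the data-loss risk listed above.
threading.Thread(target=flush_worker, daemon=True).start()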
Invalidation Strategies
TTL-Based
# Simple TTL expiration
redis.setex("user:123", 3600, user_data)  # Expires in 1 hour

# Variable TTL based on data characteristics
def get_cache_ttl(data_type):
    ttl_config = {
        'user_profile': 3600,       # 1 hour
        'product_catalog': 86400,   # 24 hours
        'session': 900,             # 15 minutes
        'search_results': 300,      # 5 minutes
    }
    return ttl_config.get(data_type, 1800)  # default: 30 minutes
Event-Based Invalidation
# Invalidate on related events
def handle_user_update(event):
    user_id = event['user_id']

    # Invalidate directly related caches
    redis.delete(f"user:{user_id}")
    redis.delete(f"user_profile:{user_id}")

    # Invalidate dependent caches
    order_ids = get_user_orders(user_id)
    for order_id in order_ids:
        redis.delete(f"order:{order_id}")

# Pattern: Publish invalidation events
def update_user(user_id, data):
    db.execute("UPDATE users SET ...", data, user_id)
    event_bus.publish("user.updated", {"user_id": user_id})
Versioned Keys
# Version-based invalidation
# (Assumes a redis client created with decode_responses=True, so the
#  stored counter comes back as a string rather than bytes.)
def get_cache_key(user_id):
    version = redis.get(f"user_version:{user_id}") or 0
    return f"user:{user_id}:v{version}"

def invalidate_user_cache(user_id):
    redis.incr(f"user_version:{user_id}")
    # Old versioned keys become orphaned and expire with their TTL
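For completeness, a read path built on the helpers above (a sketch; the function name and the 3600-second TTL are assumptions):

def get_user_versioned(user_id):
    key = get_cache_key(user_id)  # the key embeds the current version
    cached = redis.get(key)
    if cached:
        return json.loads(cached)
    user = db.query("SELECT * FROM users WHERE id = %s", user_id)
    if user:
        redis.setex(key, 3600, json.dumps(user))  # TTL cleans up superseded versions
    return user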
Common Problems
Cache Stampede
cache_stampede:
  problem: Many requests miss the cache simultaneously and all hit the database
  scenario:
    - Popular item's cache expires
    - 1000 concurrent requests arrive
    - All hit the database
    - Database overwhelmed
  solutions:
    - Locking
    - Early refresh
    - Probabilistic expiration
import time

# Solution 1: Locking (single flight)
def get_with_lock(key, fetch_func, ttl):
    cached = redis.get(key)
    if cached:
        return json.loads(cached)

    lock_key = f"lock:{key}"
    # NX + 5s expiry: only one caller rebuilds, and the lock self-expires if that caller dies
    acquired = redis.set(lock_key, "1", nx=True, ex=5)
    if acquired:
        try:
            data = fetch_func()
            redis.setex(key, ttl, json.dumps(data))
            return data
        finally:
            redis.delete(lock_key)
    else:
        # Another caller holds the lock: wait briefly and retry
        time.sleep(0.1)
        return get_with_lock(key, fetch_func, ttl)
import random

# Solution 2: Probabilistic early expiration
def get_with_early_refresh(key, fetch_func, ttl):
    cached = redis.get(key)
    ttl_remaining = redis.ttl(key)  # seconds left; negative if the key is missing or has no TTL
    if cached:
        # Occasionally refresh before expiry so one caller rebuilds while the rest keep serving
        if ttl_remaining < ttl * 0.1:   # less than 10% of the TTL remains
            if random.random() < 0.1:   # ~10% of callers trigger the refresh
                refresh_async(key, fetch_func, ttl)
        return json.loads(cached)
    return fetch_and_cache(key, fetch_func, ttl)
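refresh_async and fetch_and_cache are assumed helpers; one minimal way to fill them in (the background-thread refresh is an assumption, not prescribed by the pattern):

import threading

def fetch_and_cache(key, fetch_func, ttl):
    data = fetch_func()
    redis.setex(key, ttl, json.dumps(data))
    return data

def refresh_async(key, fetch_func, ttl):
    # Fire-and-forget refresh so the caller can return the still-valid cached value
    threading.Thread(target=fetch_and_cache, args=(key, fetch_func, ttl),
                     daemon=True).start()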
Cache Penetration
cache_penetration:
  problem: Requests for non-existent data always hit the database
  scenario:
    - Attacker requests invalid IDs
    - Cache always misses
    - Every request hits the database
  solutions:
    - Cache null results
    - Bloom filter
    - Rate limiting
# Solution 1: Cache null results
# (With a bytes-returning redis client, compare against b"NULL" instead.)
def get_user(user_id):
    cache_key = f"user:{user_id}"
    cached = redis.get(cache_key)
    if cached == "NULL":
        return None  # Cached negative result
    if cached:
        return json.loads(cached)

    user = db.query("SELECT * FROM users WHERE id = %s", user_id)
    if user:
        redis.setex(cache_key, 3600, json.dumps(user))
    else:
        redis.setex(cache_key, 300, "NULL")  # Cache the miss with a shorter TTL
    return user
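The Bloom filter approach can be sketched as below. The hand-rolled SimpleBloomFilter and the startup population query are assumptions (any Bloom filter library works the same way): IDs that were never added are rejected before they ever reach the cache or the database.

import hashlib

class SimpleBloomFilter:
    # Minimal Bloom filter for illustration: m bits, k hash functions
    def __init__(self, m=8_000_000, k=7):
        self.m, self.k = m, k
        self.bits = bytearray(m // 8 + 1)

    def _positions(self, item):
        for i in range(self.k):
            digest = hashlib.sha256(f"{i}:{item}".encode()).digest()
            yield int.from_bytes(digest[:8], "big") % self.m

    def add(self, item):
        for pos in self._positions(item):
            self.bits[pos // 8] |= 1 << (pos % 8)

    def __contains__(self, item):
        return all(self.bits[pos // 8] & (1 << (pos % 8))
                   for pos in self._positions(item))

# Populate with all valid IDs at startup
# (assumes db.query returns an iterable of rows for this query)
user_id_filter = SimpleBloomFilter()
for row in db.query("SELECT id FROM users"):
    user_id_filter.add(row['id'])

def get_user_checked(user_id):
    if user_id not in user_id_filter:
        return None  # Definitely not in the database; skip cache and DB entirely
    return get_user(user_id)

False positives let a few invalid IDs through to the null-result path above; false negatives cannot happen, so valid IDs are never rejected.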
Cache Inconsistency
cache_inconsistency:
  problem: Cache and database have different data
  causes:
    - Failed invalidation
    - Race conditions
    - Replication lag
  mitigations:
    - Short TTL as a safety net
    - Read-your-writes consistency
    - Accepting eventual consistency
# Read-your-writes pattern
# (Assumes updated_at is a numeric timestamp, e.g. epoch seconds.)
def update_user(user_id, data, session):
    db.execute("UPDATE users SET ...", data, user_id)

    # Update cache immediately with the fresh row
    user = db.query("SELECT * FROM users WHERE id = %s", user_id)
    redis.setex(f"user:{user_id}", 3600, json.dumps(user))

    # Record the version this session has seen
    session['user_version'] = user['updated_at']

def get_user(user_id, session):
    cached = redis.get(f"user:{user_id}")
    if cached:
        user = json.loads(cached)
        # Serve from cache only if it is at least as new as what the session last wrote
        if user['updated_at'] >= session.get('user_version', 0):
            return user
    # Cache missing or older than the session expects - bypass it
    return db.query("SELECT * FROM users WHERE id = %s", user_id)
Multi-Level Caching
cache_hierarchy:
  L1_local:
    - In-process memory
    - Fastest, no network hop
    - Limited size, per-instance
  L2_distributed:
    - Redis/Memcached
    - Shared across instances
    - Network latency
    - Larger capacity
  L3_cdn:
    - Edge caching
    - Geographic distribution
    - Public content only
# Multi-level cache implementation
class MultiLevelCache:
    def __init__(self):
        # Plain dict for illustration; use an LRU with per-entry TTL in production
        self.local_cache = {}
        self.redis = redis.Redis()

    def get(self, key):
        # L1: local memory
        if key in self.local_cache:
            return self.local_cache[key]
        # L2: Redis
        value = self.redis.get(key)
        if value is not None:
            self.local_cache[key] = value  # Populate L1 on the way back
            return value
        return None

    def set(self, key, value, ttl):
        self.local_cache[key] = value
        self.redis.setex(key, ttl, value)

    def invalidate(self, key):
        self.local_cache.pop(key, None)
        self.redis.delete(key)
        # Note: other instances' L1 caches still hold stale data.
        # Use pub/sub for distributed invalidation (see the sketch below).
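A minimal sketch of that pub/sub invalidation using Redis pub/sub (the channel name, subclass, and wiring are assumptions):

INVALIDATION_CHANNEL = "cache:invalidate"  # hypothetical channel name

class DistributedMultiLevelCache(MultiLevelCache):
    def invalidate(self, key):
        super().invalidate(key)
        # Tell every other instance to drop its L1 copy
        self.redis.publish(INVALIDATION_CHANNEL, key)

    def listen_for_invalidations(self):
        # Run this in a background thread on each instance
        pubsub = self.redis.pubsub()
        pubsub.subscribe(INVALIDATION_CHANNEL)
        for message in pubsub.listen():
            if message["type"] != "message":
                continue
            key = message["data"]
            if isinstance(key, bytes):  # redis-py returns bytes unless decode_responses=True
                key = key.decode()
            self.local_cache.pop(key, None)

The broadcast is best-effort: if an instance misses a message, its L1 entry survives until it is evicted or expires, which is why a bounded, TTL-backed local cache matters.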
Key Takeaways
- Cache-aside is the most common pattern; application controls cache
- TTL provides a safety net against stale data
- Cache stampede: use locking or probabilistic early refresh
- Cache penetration: cache null results with short TTL
- Accept eventual consistency or implement read-your-writes
- Multi-level caching: local → distributed → CDN
- Invalidation is the hardest part of caching
- Short TTLs reduce complexity at the cost of hit rate
- Monitor cache hit rates, latency, and memory usage
- When in doubt, prefer shorter TTLs and simpler invalidation
There are only two hard things: cache invalidation and naming things.