Context windows have grown dramatically—128K tokens for GPT-4 Turbo, 200K for Claude 3. But even large windows have limits. Effective context management is the difference between applications that work and applications that fail at scale.
Here’s how to manage context windows in production.
Context Window Landscape
Current Limits
context_windows_2024:
  openai:
    gpt4_turbo: 128K tokens
    gpt4o: 128K tokens
  anthropic:
    claude_3_opus: 200K tokens
    claude_3_sonnet: 200K tokens
    claude_35_sonnet: 200K tokens

practical_limits:
  note: "Max tokens != optimal tokens"
  issues:
    - Cost increases linearly with tokens
    - Latency increases with context size
    - Quality can degrade with irrelevant context
    - '"Lost in the middle" problem'
Token Economics
token_costs:
  example: "100K token context"
  gpt4_turbo:
    input_cost: "$1.00"
    per_request: "Adds up quickly"
  claude_35_sonnet:
    input_cost: "$0.30"
    better_but: "Still significant at scale"

optimization_impact:
  naive: "100K tokens per request"
  optimized: "10-20K tokens per request"
  savings: "80-90%"
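A back-of-the-envelope calculation makes the optimization impact concrete. The per-million-token input prices below are assumptions chosen to match the figures above; always check current pricing:

# Assumed USD prices per 1M input tokens (illustrative, not authoritative)
PRICE_PER_MILLION = {"gpt4_turbo": 10.00, "claude_35_sonnet": 3.00}

def input_cost(tokens: int, model: str) -> float:
    """Input-side cost of a single request."""
    return tokens / 1_000_000 * PRICE_PER_MILLION[model]

for model in PRICE_PER_MILLION:
    naive = input_cost(100_000, model)     # 100K-token context per request
    optimized = input_cost(15_000, model)  # 10-20K tokens after optimization
    print(f"{model}: ${naive:.2f} naive vs. ${optimized:.2f} optimized per request")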
Context Management Strategies
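The sketches in this section assume a minimal Message type and a rough token counter. Hypothetical definitions are shown here so the examples are self-contained; in production you would use your client library's message type and a real tokenizer:

from dataclasses import dataclass

@dataclass
class Message:
    role: str      # "system", "user", or "assistant"
    content: str

def estimate_tokens(text: str) -> int:
    """Very rough estimate (~4 characters per token); replace with a real tokenizer."""
    return max(1, len(text) // 4)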
Hierarchical Summarization
class HierarchicalContext:
    """Maintain context at multiple granularities."""

    def __init__(self, llm_client, max_tokens: int = 50000):
        self.llm = llm_client
        self.max_tokens = max_tokens

        # Context levels
        self.immediate_context = []   # Last few messages
        self.session_summary = ""     # Summary of current session
        self.long_term_memory = []    # Key facts from history

    async def add_message(self, message: Message):
        self.immediate_context.append(message)

        # Summarize when immediate context gets large
        if self._count_tokens(self.immediate_context) > self.max_tokens * 0.4:
            await self._summarize_and_compress()

    async def _summarize_and_compress(self):
        # Keep last few messages verbatim
        recent = self.immediate_context[-4:]
        older = self.immediate_context[:-4]

        if older:
            # Summarize older messages
            summary = await self._summarize(older)
            self.session_summary = self._merge_summaries(
                self.session_summary,
                summary
            )

            # Extract key facts for long-term memory
            facts = await self._extract_facts(older)
            self.long_term_memory.extend(facts)
            self.long_term_memory = self._deduplicate(self.long_term_memory)

        self.immediate_context = recent

    def build_context(self) -> list[Message]:
        """Build context for LLM call."""
        context = []

        # Add long-term memory as system context
        if self.long_term_memory:
            context.append(Message(
                role="system",
                content=f"Key facts: {self._format_facts(self.long_term_memory)}"
            ))

        # Add session summary
        if self.session_summary:
            context.append(Message(
                role="system",
                content=f"Earlier in conversation: {self.session_summary}"
            ))

        # Add immediate context
        context.extend(self.immediate_context)

        return context
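A usage sketch; llm_client and the summarization helpers (_summarize, _extract_facts, and so on) are assumed to be implemented elsewhere:

async def run_turn(ctx: HierarchicalContext, user_text: str) -> list[Message]:
    await ctx.add_message(Message(role="user", content=user_text))
    # The model sees key facts + session summary + recent messages,
    # not the full transcript
    return ctx.build_context()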
Sliding Window with Anchors
class AnchoredSlidingWindow:
    """Keep important messages while sliding recent context."""

    def __init__(self, window_size: int = 20, anchor_slots: int = 5):
        self.window_size = window_size
        self.anchor_slots = anchor_slots

        self.anchored_messages = []   # Important messages to always include
        self.sliding_window = []      # Recent messages

    def add_message(self, message: Message, is_anchor: bool = False):
        if is_anchor:
            self._anchor(message)

        self.sliding_window.append(message)

        # Slide window
        while len(self.sliding_window) > self.window_size:
            removed = self.sliding_window.pop(0)

            # Auto-anchor important messages before they are dropped
            if self._should_auto_anchor(removed):
                self._anchor(removed)

    def _anchor(self, message: Message):
        if message in self.anchored_messages:
            return
        if len(self.anchored_messages) >= self.anchor_slots:
            # Replace the least important anchor
            self._replace_anchor(message)
        else:
            self.anchored_messages.append(message)

    def _should_auto_anchor(self, message: Message) -> bool:
        """Detect important messages to preserve."""
        importance_signals = [
            "important",
            "remember",
            "key point",
            "decision:",
            "agreed:"
        ]
        return any(
            signal in message.content.lower()
            for signal in importance_signals
        )

    def get_context(self) -> list[Message]:
        # Anchors first, then recent messages that are not already anchored
        recent = [m for m in self.sliding_window if m not in self.anchored_messages]
        return self.anchored_messages + recent
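In use, explicitly anchored messages and auto-detected ones both survive after the window slides past them:

window = AnchoredSlidingWindow(window_size=20, anchor_slots=5)
window.add_message(
    Message(role="user", content="Decision: we ship the beta on March 3."),
    is_anchor=True
)
window.add_message(Message(role="assistant", content="Understood, beta ships March 3."))
# Later, even after dozens of turns, the decision is still in context
context = window.get_context()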
RAG Integration
class RAGContextManager:
    """Combine RAG with conversation context."""

    def __init__(self, vector_store, max_context_tokens: int = 100000):
        self.vector_store = vector_store
        self.max_tokens = max_context_tokens

        self.token_budget = {
            "system": 0.1,        # 10% for system prompt
            "retrieved": 0.4,     # 40% for RAG results
            "conversation": 0.4,  # 40% for conversation
            "buffer": 0.1         # 10% buffer
        }

    async def build_context(
        self,
        query: str,
        conversation: list[Message],
        system_prompt: str
    ) -> list[Message]:
        # Calculate token budgets
        retrieved_budget = int(self.max_tokens * self.token_budget["retrieved"])
        conversation_budget = int(self.max_tokens * self.token_budget["conversation"])

        # Retrieve relevant documents
        docs = await self.vector_store.search(
            query=query,
            max_tokens=retrieved_budget
        )

        # Build retrieval context
        retrieval_context = self._format_documents(docs)

        # Trim conversation to budget
        trimmed_conversation = self._trim_to_budget(
            conversation,
            conversation_budget
        )

        # Assemble final context
        return [
            Message(role="system", content=system_prompt),
            Message(role="system", content=f"Relevant information:\n{retrieval_context}"),
            *trimmed_conversation
        ]

    def _trim_to_budget(
        self,
        messages: list[Message],
        budget: int
    ) -> list[Message]:
        """Keep most recent messages within budget."""
        result = []
        used_tokens = 0

        for message in reversed(messages):
            msg_tokens = self._count_tokens(message)
            if used_tokens + msg_tokens > budget:
                break
            result.insert(0, message)
            used_tokens += msg_tokens

        return result
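A usage sketch; vector_store is assumed to expose the async search(query=..., max_tokens=...) method used above, and llm.chat is an assumed client call:

async def answer(manager: RAGContextManager, llm, conversation: list[Message]) -> str:
    query = conversation[-1].content
    messages = await manager.build_context(
        query=query,
        conversation=conversation,
        system_prompt="You are a concise, accurate assistant."
    )
    # Retrieved documents and trimmed history each stay within their budgets
    return await llm.chat(messages=messages)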
Advanced Patterns
Context Compression
async def compress_context(
    self,
    messages: list[Message],
    target_tokens: int
) -> list[Message]:
    """Intelligently compress context while preserving meaning."""
    current_tokens = self._count_tokens(messages)
    if current_tokens <= target_tokens:
        return messages

    compression_ratio = target_tokens / current_tokens

    # Strategy 1: Summarize older messages
    compressed = await self.llm.chat(
        messages=[
            {
                "role": "system",
                "content": f"Compress this conversation to {compression_ratio:.0%} "
                           f"of its length while preserving all key information."
            },
            {
                "role": "user",
                "content": self._format_messages(messages[:-4])
            }
        ]
    )

    return [
        Message(role="system", content=f"Previous conversation summary: {compressed}"),
        *messages[-4:]  # Keep last 4 messages verbatim
    ]
Key Takeaways
- Large context windows don’t mean you should use all tokens
- Hierarchical summarization preserves information efficiently
- Anchor important messages to prevent loss
- RAG reduces context requirements dramatically
- Token costs add up—optimize aggressively
- “Lost in the middle” is real—structure matters
- Summarize older context, keep recent verbatim
- Allocate token budgets for different context types
- Compress when approaching limits
- Test with realistic conversation lengths
Context management is engineering. Design it deliberately.