Context Window Strategies for Large Applications

July 22, 2024

Context windows have grown dramatically—128K tokens for GPT-4 Turbo, 200K for Claude 3. But even large windows have limits. Effective context management is the difference between applications that work and applications that fail at scale.

Here’s how to manage context windows in production.

Context Window Landscape

Current Limits

context_windows_2024:
  openai:
    gpt4_turbo: 128K tokens
    gpt4o: 128K tokens

  anthropic:
    claude_3_opus: 200K tokens
    claude_3_sonnet: 200K tokens
    claude_35_sonnet: 200K tokens

  practical_limits:
    note: "Max tokens != optimal tokens"
    issues:
      - Cost increases linearly with tokens
      - Latency increases with context size
      - Quality can degrade with irrelevant context
      - "Lost in the middle" problem

Token Economics

token_costs:
  example: "100K token context"

  gpt4_turbo:
    rate: "$10 per 1M input tokens"
    input_cost: "$1.00"
    per_request: "Adds up quickly"

  claude_35_sonnet:
    rate: "$3 per 1M input tokens"
    input_cost: "$0.30"
    better_but: "Still significant at scale"

  optimization_impact:
    naive: "100K tokens per request"
    optimized: "10-20K tokens per request"
    savings: "80-90%"
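
To sanity-check these numbers, a back-of-the-envelope calculator helps. The per-token rates below are the mid-2024 list prices implied by the figures above; treat them as assumptions and substitute current pricing.

# Input-side cost estimator. Rates are USD per 1M input tokens,
# pinned to mid-2024 pricing; verify against current provider rates.
INPUT_PRICE_PER_MTOK = {
    "gpt-4-turbo": 10.00,
    "claude-3-5-sonnet": 3.00,
}

def input_cost(model: str, context_tokens: int) -> float:
    """Cost of the input tokens for a single request."""
    return context_tokens / 1_000_000 * INPUT_PRICE_PER_MTOK[model]

# Naive 100K-token context vs. a trimmed 15K-token context:
print(input_cost("gpt-4-turbo", 100_000))  # 1.0
print(input_cost("gpt-4-turbo", 15_000))   # 0.15, an 85% saving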

Context Management Strategies

Hierarchical Summarization

from dataclasses import dataclass


@dataclass
class Message:
    """Simple chat message container used throughout this post."""
    role: str
    content: str


class HierarchicalContext:
    """Maintain context at multiple granularities."""

    def __init__(self, llm_client, max_tokens: int = 50000):
        self.llm = llm_client
        self.max_tokens = max_tokens

        # Context levels
        self.immediate_context = []  # Last few messages
        self.session_summary = ""    # Summary of current session
        self.long_term_memory = []   # Key facts from history

    async def add_message(self, message: Message):
        self.immediate_context.append(message)

        # Summarize when immediate context gets large
        if self._count_tokens(self.immediate_context) > self.max_tokens * 0.4:
            await self._summarize_and_compress()

    async def _summarize_and_compress(self):
        # Keep last few messages verbatim
        recent = self.immediate_context[-4:]
        older = self.immediate_context[:-4]

        if older:
            # Summarize older messages
            summary = await self._summarize(older)
            self.session_summary = self._merge_summaries(
                self.session_summary,
                summary
            )

            # Extract key facts for long-term memory
            facts = await self._extract_facts(older)
            self.long_term_memory.extend(facts)
            self.long_term_memory = self._deduplicate(self.long_term_memory)

        self.immediate_context = recent

    def build_context(self) -> list[Message]:
        """Build context for LLM call."""
        context = []

        # Add long-term memory as system context
        if self.long_term_memory:
            context.append(Message(
                role="system",
                content=f"Key facts: {self._format_facts(self.long_term_memory)}"
            ))

        # Add session summary
        if self.session_summary:
            context.append(Message(
                role="system",
                content=f"Earlier in conversation: {self.session_summary}"
            ))

        # Add immediate context
        context.extend(self.immediate_context)

        return context
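
The class above leans on helpers elided for space: `_summarize`, `_extract_facts`, and `_merge_summaries` are LLM calls; `_deduplicate` and `_format_facts` are plain list and string handling. The one helper every strategy in this post needs is token counting. A minimal sketch using tiktoken, assuming an OpenAI-style tokenizer is close enough for budgeting (wire it in as `_count_tokens`):

import tiktoken

_ENCODING = tiktoken.get_encoding("cl100k_base")  # GPT-4-family encoding

def count_tokens(messages: list[Message]) -> int:
    """Approximate token count for a list of messages.

    Exact counts vary by model and message framing, but this is
    accurate enough to drive summarization and trimming decisions.
    """
    return sum(len(_ENCODING.encode(m.content)) for m in messages)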

Sliding Window with Anchors

class AnchoredSlidingWindow:
    """Keep important messages while sliding recent context."""

    def __init__(self, window_size: int = 20, anchor_slots: int = 5):
        self.window_size = window_size
        self.anchor_slots = anchor_slots
        self.anchored_messages = []  # Important messages to always include
        self.sliding_window = []     # Recent messages

    def add_message(self, message: Message, is_anchor: bool = False):
        if is_anchor:
            # Anchored messages live outside the sliding window so they
            # are never dropped (and never duplicated in get_context)
            self._anchor(message)
            return

        self.sliding_window.append(message)

        # Slide window, promoting important messages before they fall off
        while len(self.sliding_window) > self.window_size:
            removed = self.sliding_window.pop(0)
            if self._should_auto_anchor(removed):
                self._anchor(removed)

    def _anchor(self, message: Message):
        if message in self.anchored_messages:
            return
        if len(self.anchored_messages) >= self.anchor_slots:
            # Evict the oldest anchor; a smarter policy could score
            # anchors by importance and evict the weakest instead
            self.anchored_messages.pop(0)
        self.anchored_messages.append(message)

    def _should_auto_anchor(self, message: Message) -> bool:
        """Detect important messages to preserve."""
        importance_signals = [
            "important",
            "remember",
            "key point",
            "decision:",
            "agreed:"
        ]
        return any(
            signal in message.content.lower()
            for signal in importance_signals
        )

    def get_context(self) -> list[Message]:
        return self.anchored_messages + self.sliding_window
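
In use, anchors come from two places: callers pin messages explicitly, and the window promotes messages matching the importance heuristics as they scroll out. A short sketch; the message contents and `conversation_history` are illustrative:

window = AnchoredSlidingWindow(window_size=20, anchor_slots=5)

# Explicit anchor: pin a decision so it survives any amount of sliding
window.add_message(
    Message(role="user", content="Decision: we ship the v2 API in Q3"),
    is_anchor=True,
)

# Normal traffic just slides through the window
for msg in conversation_history:
    window.add_message(msg)

context = window.get_context()  # anchors first, then recent messages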

RAG Integration

class RAGContextManager:
    """Combine RAG with conversation context."""

    def __init__(self, vector_store, max_context_tokens: int = 100000):
        self.vector_store = vector_store
        self.max_tokens = max_context_tokens
        self.token_budget = {
            "system": 0.1,        # 10% for system prompt
            "retrieved": 0.4,     # 40% for RAG results
            "conversation": 0.4,  # 40% for conversation
            "buffer": 0.1,        # 10% buffer
        }

    async def build_context(
        self,
        query: str,
        conversation: list[Message],
        system_prompt: str
    ) -> list[Message]:
        # Calculate token budgets
        retrieved_budget = int(self.max_tokens * self.token_budget["retrieved"])
        conversation_budget = int(self.max_tokens * self.token_budget["conversation"])

        # Retrieve relevant documents
        docs = await self.vector_store.search(
            query=query,
            max_tokens=retrieved_budget
        )

        # Build retrieval context
        retrieval_context = self._format_documents(docs)

        # Trim conversation to budget
        trimmed_conversation = self._trim_to_budget(
            conversation,
            conversation_budget
        )

        # Assemble final context
        return [
            Message(role="system", content=system_prompt),
            Message(role="system", content=f"Relevant information:\n{retrieval_context}"),
            *trimmed_conversation
        ]

    def _trim_to_budget(
        self,
        messages: list[Message],
        budget: int
    ) -> list[Message]:
        """Keep most recent messages within budget."""
        result = []
        used_tokens = 0

        for message in reversed(messages):
            msg_tokens = self._count_tokens(message)
            if used_tokens + msg_tokens > budget:
                break
            result.insert(0, message)
            used_tokens += msg_tokens

        return result
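
Wiring this together looks roughly like the sketch below. The names `store`, `history`, `user_message`, and `llm` are stand-ins; any vector store whose async search respects a token cap fits the interface assumed here.

manager = RAGContextManager(vector_store=store, max_context_tokens=100_000)

context = await manager.build_context(
    query=user_message.content,
    conversation=history,
    system_prompt="You are a helpful support assistant.",
)
response = await llm.chat(messages=context)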

Advanced Patterns

Context Compression

# Method on a context manager that exposes an `llm` chat client plus the
# `_count_tokens` and `_format_messages` helpers used above.
async def compress_context(
    self,
    messages: list[Message],
    target_tokens: int
) -> list[Message]:
    """Intelligently compress context while preserving meaning."""

    current_tokens = self._count_tokens(messages)

    # Nothing to do if we already fit, or if there is no older tail to compress
    if current_tokens <= target_tokens or len(messages) <= 4:
        return messages

    compression_ratio = target_tokens / current_tokens

    # Summarize everything except the most recent messages
    compressed = await self.llm.chat(
        messages=[
            {
                "role": "system",
                "content": f"Compress this conversation to {compression_ratio:.0%} "
                           f"of its length while preserving all key information."
            },
            {
                "role": "user",
                "content": self._format_messages(messages[:-4])
            }
        ]
    )

    return [
        Message(role="system", content=f"Previous conversation summary: {compressed}"),
        *messages[-4:]  # Keep last 4 messages verbatim
    ]
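
Compression is not free: it costs an extra LLM call and can lose nuance, so trigger it from a threshold rather than on every request. A minimal sketch, assuming one of the manager classes above:

# Compress lazily, only once the conversation clearly outgrows the budget
TARGET_TOKENS = 20_000

if manager._count_tokens(history) > 2 * TARGET_TOKENS:
    history = await manager.compress_context(history, target_tokens=TARGET_TOKENS)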

Key Takeaways

Context management is engineering. Budget tokens explicitly, summarize what you cannot afford to keep verbatim, anchor what must never be dropped, and give retrieval its own slice of the window. Design it deliberately.