AI Cost Optimization: Keeping LLM Bills Under Control

July 24, 2023

LLM costs can surprise you. A few dollars in testing becomes thousands in production. With token-based pricing and growing usage, AI costs require the same attention as cloud infrastructure costs. The good news: there are many optimization levers.

Here’s how to keep AI costs under control.

Understanding AI Costs

Cost Drivers

cost_factors:
  tokens:
    input: Prompt tokens (cheaper)
    output: Generated tokens (more expensive)
    formula: (input_tokens * input_price) + (output_tokens * output_price)

  model_selection:
    gpt4: ~$0.03-0.06/1K tokens
    gpt35: ~$0.0005-0.0015/1K tokens
    difference: ~40-60x between tiers

  volume:
    requests_per_day: Number of API calls
    tokens_per_request: Prompt + response size
    scaling: Linear with usage

  hidden_costs:
    retries: Failed requests still cost
    experiments: Development and testing
    inefficient_prompts: Verbose prompts

Cost Calculation

def estimate_monthly_cost(
    requests_per_day: int,
    avg_input_tokens: int,
    avg_output_tokens: int,
    model: str
) -> dict:
    pricing = {
        'gpt-4': {'input': 0.03, 'output': 0.06},
        'gpt-4-turbo': {'input': 0.01, 'output': 0.03},
        'gpt-3.5-turbo': {'input': 0.0005, 'output': 0.0015},
    }

    prices = pricing[model]

    daily_input_cost = (requests_per_day * avg_input_tokens / 1000) * prices['input']
    daily_output_cost = (requests_per_day * avg_output_tokens / 1000) * prices['output']
    daily_total = daily_input_cost + daily_output_cost

    return {
        'daily': daily_total,
        'monthly': daily_total * 30,
        'yearly': daily_total * 365,
        'input_pct': daily_input_cost / daily_total * 100,
        'output_pct': daily_output_cost / daily_total * 100
    }

# Example: Customer support chatbot
cost = estimate_monthly_cost(
    requests_per_day=10000,
    avg_input_tokens=1000,  # Context + user message
    avg_output_tokens=300,  # Response
    model='gpt-4'
)
# Monthly: ~$14,400

cost_optimized = estimate_monthly_cost(
    requests_per_day=10000,
    avg_input_tokens=500,   # Optimized prompts
    avg_output_tokens=200,  # Concise responses
    model='gpt-3.5-turbo'
)
# Monthly: ~$165 (~98.9% reduction!)

Optimization Strategies

1. Model Selection

model_tiering:
  use_gpt4_for:
    - Complex reasoning
    - High-stakes decisions
    - Code generation requiring accuracy
    - Where quality directly impacts business

  use_gpt35_for:
    - Simple classification
    - Routine chat responses
    - Summarization
    - Data extraction

  use_smaller_models:
    - Fine-tuned specialists
    - Embeddings (ada)
    - Simple NLP tasks

class ModelRouter:
    def __init__(self):
        # ComplexityClassifier is a placeholder for your own difficulty-scoring model
        self.complexity_model = ComplexityClassifier()

    def select_model(self, request):
        complexity = self.complexity_model.assess(request)

        if complexity > 0.8 or request.high_stakes:
            return 'gpt-4'
        elif complexity > 0.4:
            return 'gpt-3.5-turbo'
        else:
            # Same tier for now; a natural slot for a local or fine-tuned model
            return 'gpt-3.5-turbo'

    def route(self, request):
        model = self.select_model(request)
        return call_llm(request, model=model)
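
The ComplexityClassifier above is a placeholder; anything that correlates with difficulty works. A minimal heuristic sketch (the scoring rules here are illustrative assumptions, not a tested model):

class ComplexityClassifier:
    """Heuristic stand-in: scores requests 0-1 on length and reasoning cues."""

    REASONING_HINTS = ('why', 'explain', 'compare', 'design', 'debug', 'prove')

    def assess(self, request) -> float:
        text = request.text.lower()  # assumes requests expose a .text field
        # Longer inputs tend to need more capable models
        score = min(len(text.split()) / 500, 0.5)
        # Reasoning-flavored keywords push toward the expensive tier
        if any(hint in text for hint in self.REASONING_HINTS):
            score += 0.4
        return min(score, 1.0)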

2. Prompt Optimization

prompt_optimization:
  reduce_tokens:
    - Remove verbose instructions
    - Use abbreviations where clear
    - Trim unnecessary context

  efficient_examples:
    - Few-shot: 2-3 examples, not 5-10
    - Choose diverse, representative examples
    - Keep examples concise

  structured_output:
    - Request JSON (often shorter than prose)
    - Specify maximum lengths
    - Request bullet points over paragraphs

# Before: 850 tokens
prompt_verbose = """
You are a helpful assistant that analyzes customer feedback.
Your job is to carefully read the customer feedback provided below
and determine whether the sentiment is positive, negative, or neutral.
Please consider all aspects of the feedback including the tone,
specific words used, and the overall message being conveyed.

Here are some examples:
[5 detailed examples...]

Please analyze the following feedback and provide your assessment:
{feedback}

Please respond with your analysis including reasoning and final sentiment.
"""

# After: 180 tokens
prompt_optimized = """
Classify sentiment as: positive, negative, or neutral.
Examples:
- "Great product!" → positive
- "Broken on arrival" → negative
- "It works" → neutral

Feedback: {feedback}

Sentiment:"""
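
Token counts like the 850 → 180 figures above are easy to verify with OpenAI's tiktoken library before shipping a prompt change:

import tiktoken

def count_tokens(text: str, model: str = "gpt-3.5-turbo") -> int:
    """Return the number of tokens a string consumes for the given model."""
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))

print(count_tokens(prompt_verbose))    # before
print(count_tokens(prompt_optimized))  # after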

3. Caching

import hashlib
import redis

class LLMCache:
    def __init__(self, redis_client, ttl=3600):
        self.redis = redis_client
        self.ttl = ttl

    def _cache_key(self, prompt, model, temperature):
        # Only cache deterministic requests
        if temperature > 0:
            return None
        content = f"{model}:{prompt}"
        return f"llm:{hashlib.sha256(content.encode()).hexdigest()}"

    def get_or_generate(self, prompt, model, temperature=0, generator=None):
        key = self._cache_key(prompt, model, temperature)

        if key:
            cached = self.redis.get(key)
            if cached:
                return {'response': cached.decode(), 'cached': True, 'cost': 0}

        response = generator(prompt, model, temperature)

        if key:
            self.redis.setex(key, self.ttl, response['content'])

        return {'response': response['content'], 'cached': False, 'cost': response['cost']}

# Semantic caching for similar queries
from sklearn.metrics.pairwise import cosine_similarity

class SemanticCache:
    def __init__(self, embedding_model, threshold=0.95):
        self.embedding_model = embedding_model
        self.threshold = threshold
        self.cache = {}  # In production, use a vector DB

    def add(self, prompt, response):
        embedding = self.embedding_model.encode(prompt)
        self.cache[prompt] = (embedding, response)

    def find_similar(self, prompt):
        prompt_embedding = self.embedding_model.encode(prompt)

        for cached_prompt, (cached_embedding, response) in self.cache.items():
            similarity = cosine_similarity([prompt_embedding], [cached_embedding])[0][0]
            if similarity >= self.threshold:
                return response

        return None
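
Wiring the exact-match cache into a call site looks like this (the Redis connection details and the call_llm_api generator are illustrative, not a fixed API):

cache = LLMCache(redis.Redis(host='localhost', port=6379), ttl=3600)

result = cache.get_or_generate(
    prompt="Classify sentiment: 'Great product!'",
    model='gpt-3.5-turbo',
    temperature=0,
    generator=call_llm_api,  # hypothetical: returns {'content': ..., 'cost': ...}
)
# Repeat calls with the same prompt hit Redis and cost nothing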

4. Batching

async def batch_process(items, batch_size=20):
    """Process items in batches to reduce overhead."""
    results = []

    for i in range(0, len(items), batch_size):
        batch = items[i:i + batch_size]

        # Combine into single prompt
        combined_prompt = "Process these items:\n" + "\n".join(
            f"{j+1}. {item}" for j, item in enumerate(batch)
        ) + "\n\nRespond with numbered results."

        response = await llm.generate(combined_prompt)  # your async LLM client
        batch_results = parse_numbered_results(response)
        results.extend(batch_results)

    return results

# Batch 100 items into 5 calls instead of 100
items = ["item1", "item2", ..., "item100"]
results = await batch_process(items, batch_size=20)
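
The parse_numbered_results helper isn't defined above; a minimal regex-based sketch (assumes the model actually returns "1. ..." lines, which is worth validating):

import re

def parse_numbered_results(response: str) -> list[str]:
    """Split a numbered-list response back into per-item results."""
    matches = re.findall(r'^\s*(\d+)\.\s*(.+)$', response, flags=re.MULTILINE)
    # Sort by the model's own numbering in case items come back out of order
    return [text for _, text in sorted(matches, key=lambda m: int(m[0]))]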

5. Output Control

# Limit output tokens
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": prompt}],
    max_tokens=150  # Cap response length
)

# Request concise format
prompt = """
Summarize in exactly 2 sentences:
{document}
"""

# Request structured output
prompt = """
Extract as JSON with fields: name, email, company (max 50 chars each)
{text}
"""

Monitoring and Alerting

Cost Tracking

class CostTracker:
    def __init__(self):
        self.pricing = {
            'gpt-4': {'input': 0.03, 'output': 0.06},
            'gpt-3.5-turbo': {'input': 0.0005, 'output': 0.0015},
        }

    def track_request(self, model, input_tokens, output_tokens, metadata=None):
        prices = self.pricing[model]
        cost = (input_tokens * prices['input'] + output_tokens * prices['output']) / 1000

        # Log to metrics system
        metrics.increment('llm.requests', tags={'model': model})
        metrics.increment('llm.tokens.input', input_tokens, tags={'model': model})
        metrics.increment('llm.tokens.output', output_tokens, tags={'model': model})
        metrics.increment('llm.cost', cost, tags={'model': model})

        # Log detailed info (structured, structlog-style logger assumed)
        logger.info('llm_request', {
            'model': model,
            'input_tokens': input_tokens,
            'output_tokens': output_tokens,
            'cost': cost,
            **(metadata or {})  # guard against the None default
        })

        return cost
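
Usage is one line per request; the metadata tags here are examples:

tracker = CostTracker()
cost = tracker.track_request(
    model='gpt-4',
    input_tokens=1000,
    output_tokens=300,
    metadata={'endpoint': 'support_chat'},
)
# (1000 * 0.03 + 300 * 0.06) / 1000 = $0.048 for this request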

Budget Alerts

cost_alerts:
  daily_budget:
    threshold: $100
    action: Alert team

  hourly_spike:
    threshold: 200% of normal
    action: Alert on-call

  model_usage:
    gpt4_percentage: > 20%
    action: Review usage patterns
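
A scheduled job can enforce the daily threshold; a minimal sketch, assuming you can query today's spend from your metrics store (get_spend_today and alert_team are hypothetical names):

DAILY_BUDGET = 100.00  # dollars, matching the config above

def check_daily_budget():
    spend = get_spend_today('llm.cost')  # hypothetical: query your metrics store
    if spend > DAILY_BUDGET:
        alert_team(f"LLM spend ${spend:.2f} is over the ${DAILY_BUDGET:.0f} daily budget")
    elif spend > 0.8 * DAILY_BUDGET:
        alert_team(f"LLM spend at {spend / DAILY_BUDGET:.0%} of daily budget")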

Key Takeaways

AI cost optimization is not optional; build it in from the start. Route each request to the cheapest model that clears your quality bar, trim prompts, cache exact and near-duplicate requests, batch where latency allows, cap output length, and track cost per request so spikes show up in your dashboards, not on your invoice.