Prompt Caching
Save up to 90% on costs and deliver instant responses with intelligent caching
How Caching Works
ParrotRouter automatically caches AI responses based on prompt similarity, saving costs and delivering instant responses for repeated queries. Our intelligent caching system handles exact matches, semantic similarity, and parameter variations.
Instant Responses
Serve cached results in under 5ms
90% Cost Reduction
Avoid redundant API calls to providers
Smart Invalidation
Automatic cache updates and expiration
Basic Caching
Enable caching with a simple header or parameter:
from openai import OpenAI

client = OpenAI(
    base_url="https://api.parrotrouter.com/v1",
    api_key="your-api-key"
)

# Enable caching for this request. Use with_raw_response so the
# X-Cache-* response headers are accessible alongside the completion.
raw = client.chat.completions.with_raw_response.create(
    model="gpt-4-turbo-preview",
    messages=[{
        "role": "user",
        "content": "What is the capital of France?"
    }],
    extra_headers={
        "X-Cache-Enabled": "true",
        "X-Cache-TTL": "3600"  # Cache for 1 hour
    }
)
response = raw.parse()  # the usual ChatCompletion object

# Check whether the response was served from cache
print(f"Cache hit: {raw.headers.get('X-Cache-Hit', 'false')}")
print(f"Cache age: {raw.headers.get('X-Cache-Age', '0')}s")

# A subsequent identical request is served from the cache almost instantly
raw2 = client.chat.completions.with_raw_response.create(
    model="gpt-4-turbo-preview",
    messages=[{
        "role": "user",
        "content": "What is the capital of France?"
    }],
    extra_headers={
        "X-Cache-Enabled": "true"
    }
)
response2 = raw2.parse()
print(f"Cache hit: {raw2.headers.get('X-Cache-Hit', 'false')}")  # "true"
Caching Strategies
Exact Match Caching
Cache responses for identical prompts and parameters.
# Exact match caching - fastest, most reliable
response = client.chat.completions.create(
model="gpt-4-turbo-preview",
messages=[{"role": "user", "content": "Translate 'Hello' to Spanish"}],
temperature=0, # Deterministic for consistent caching
extra_headers={
"X-Cache-Strategy": "exact",
"X-Cache-TTL": "86400" # 24 hours
}
)
# These will hit the cache:
# - Same prompt, same model, same parameters
# - Case-sensitive exact match
# These will NOT hit the cache:
# - "translate 'Hello' to Spanish" (lowercase t)
# - "Translate 'Hello' to Spanish." (extra period)
# - Different temperature or other parameters
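Because exact-match keys are case- and punctuation-sensitive, it often helps to normalize prompts on the client before sending them. A minimal sketch, assuming a hypothetical normalize_prompt helper (illustrative only, not part of the SDK or the caching API):

def normalize_prompt(text: str) -> str:
    """Illustrative helper: collapse whitespace and strip trailing
    punctuation so equivalent prompts map to the same exact-match key."""
    return " ".join(text.split()).rstrip(".!?")

prompt = normalize_prompt("Translate 'Hello' to Spanish.  ")

response = client.chat.completions.create(
    model="gpt-4-turbo-preview",
    messages=[{"role": "user", "content": prompt}],
    temperature=0,
    extra_headers={"X-Cache-Strategy": "exact", "X-Cache-TTL": "86400"}
)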
Semantic Caching
Cache based on meaning, not exact text match.
# Semantic caching - flexible but slightly slower
response = client.chat.completions.create(
model="gpt-4-turbo-preview",
messages=[{"role": "user", "content": "What's the weather in NYC?"}],
extra_headers={
"X-Cache-Strategy": "semantic",
"X-Cache-Similarity-Threshold": "0.95", # 95% similarity required
"X-Cache-TTL": "300" # 5 minutes for weather data
}
)
# These WILL hit the cache (semantically similar):
# - "What is the weather in New York City?"
# - "Tell me the weather in NYC"
# - "How's the weather in New York?"
# These will NOT hit the cache (different meaning):
# - "What's the weather in Boston?"
# - "What was the weather in NYC yesterday?"
Parameter Normalization
Cache across parameter variations for similar results.
# Normalize parameters for better cache hits
response = client.chat.completions.create(
model="gpt-4-turbo-preview",
messages=[{"role": "user", "content": "Write a poem"}],
temperature=0.7, # Will be normalized
extra_headers={
"X-Cache-Strategy": "normalized",
"X-Cache-Normalize-Temperature": "0.1", # Round to nearest 0.1
"X-Cache-Ignore-Parameters": "max_tokens,top_p", # Ignore these
"X-Cache-TTL": "3600"
}
)
# These will hit the same cache:
# - temperature=0.65 → normalized to 0.7
# - temperature=0.74 → normalized to 0.7
# - Different max_tokens or top_p values (ignored)
Cache Configuration
Fine-tune caching behavior for your use case:
import json

# Configure cache behavior
cache_config = {
"strategy": "hybrid", # Combines exact and semantic
"ttl": 3600, # Default TTL in seconds
"max_size": "100MB", # Maximum cache size per key prefix
"eviction_policy": "lru", # Least recently used
"compression": "gzip", # Compress cached responses
"encryption": "aes-256", # Encrypt sensitive data
"invalidation_rules": [
{
"pattern": "weather/*",
"ttl": 300, # 5 minutes for weather
"refresh_on_hit": True # Extend TTL on cache hit
},
{
"pattern": "translation/*",
"ttl": 86400, # 24 hours for translations
"immutable": True # Never refresh
}
]
}
# Apply configuration
response = client.chat.completions.create(
model="gpt-4-turbo-preview",
messages=[{"role": "user", "content": "Get weather for Paris"}],
extra_headers={
"X-Cache-Config": json.dumps(cache_config),
"X-Cache-Key-Prefix": "weather/paris" # Organize cache keys
}
)
# Batch cache warming
prompts = [
"Common question 1",
"Common question 2",
"Common question 3"
]
# Pre-populate cache
for prompt in prompts:
client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": prompt}],
extra_headers={
"X-Cache-Enabled": "true",
"X-Cache-Warm": "true", # Only populate cache, don't wait
"X-Cache-TTL": "86400"
}
)
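For larger prompt sets, sequential warming can be slow. A short sketch that overlaps the warm-up requests with a standard-library thread pool, reusing the client, prompts list, and headers shown above:

from concurrent.futures import ThreadPoolExecutor

def warm(prompt):
    # Identical warm-up call to the loop above, issued from a worker thread
    client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        extra_headers={
            "X-Cache-Enabled": "true",
            "X-Cache-Warm": "true",
            "X-Cache-TTL": "86400"
        }
    )

with ThreadPoolExecutor(max_workers=5) as pool:
    list(pool.map(warm, prompts))  # consume the iterator to surface any errors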
Cache Management
Cache Invalidation
import requests
# Clear specific cache entry
response = requests.delete(
"https://api.parrotrouter.com/v1/cache",
headers={"Authorization": "Bearer your-api-key"},
json={
"key": "exact:gpt-4:hash123",
"pattern": "weather/*" # Or use pattern matching
}
)
# Clear all cache for a prefix
response = requests.delete(
"https://api.parrotrouter.com/v1/cache",
headers={"Authorization": "Bearer your-api-key"},
json={
"prefix": "translation/",
"older_than": "2024-01-01T00:00:00Z"
}
)
# Refresh cache (re-generate and cache)
response = client.chat.completions.create(
model="gpt-4-turbo-preview",
messages=[{"role": "user", "content": "Cached prompt"}],
extra_headers={
"X-Cache-Enabled": "true",
"X-Cache-Refresh": "true" # Force regenerate
}
)
Cache Analytics
# Get cache statistics
stats = requests.get(
"https://api.parrotrouter.com/v1/cache/stats",
headers={"Authorization": "Bearer your-api-key"},
params={
"start_date": "2024-01-01",
"end_date": "2024-01-31",
"group_by": "day"
}
).json()
print(f"Total cache hits: {stats['total_hits']}")
print(f"Total cache misses: {stats['total_misses']}")
print(f"Hit rate: {stats['hit_rate']}%")
print(f"Cost saved: $" + str(stats['cost_saved']))
print(f"Avg response time (cached): {stats['avg_cached_latency_ms']}ms")
print(f"Avg response time (uncached): {stats['avg_uncached_latency_ms']}ms")
# Most cached prompts
for item in stats['top_cached_prompts']:
    print(f"\nPrompt: {item['prompt_preview']}")
    print(f"Hits: {item['hits']}")
    print(f"Cost saved: ${item['cost_saved']}")
# Cache usage by model
for model in stats['cache_by_model']:
    print(f"\n{model['name']}:")
    print(f"  Hit rate: {model['hit_rate']}%")
    print(f"  Storage: {model['storage_mb']}MB")
Use Cases
FAQ Chatbots
Cache common questions for instant responses
# Pre-cache common FAQs
faqs = [
"What are your business hours?",
"How do I reset my password?",
"What is your return policy?"
]
for question in faqs:
client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": question}],
extra_headers={
"X-Cache-Enabled": "true",
"X-Cache-TTL": "604800", # 1 week
"X-Cache-Key-Prefix": "faq"
}
)
Translation Services
Cache translations for common phrases
# Cache translations with a long TTL and a stable cache key
import hashlib

def translate(text, target_lang):
    return client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{
            "role": "user",
            "content": f"Translate to {target_lang}: {text}"
        }],
        temperature=0,  # Deterministic output caches consistently
        extra_headers={
            "X-Cache-Enabled": "true",
            "X-Cache-TTL": "2592000",  # 30 days
            # Use a content hash for the key; Python's built-in hash() is
            # randomized per process and would defeat the cache across runs
            "X-Cache-Key": f"translate:{target_lang}:{hashlib.sha256(text.encode()).hexdigest()}"
        }
    )
Code Generation
Cache common coding patterns and snippets
# Cache code generation
response = client.chat.completions.create(
model="claude-3-opus",
messages=[{
"role": "user",
"content": "Write a Python function to validate email"
}],
extra_headers={
"X-Cache-Enabled": "true",
"X-Cache-Strategy": "semantic",
"X-Cache-TTL": "86400",
"X-Cache-Tags": "code,python,validation"
}
)
Data Analysis
Cache analysis results for static datasets
# Cache data analysis keyed on a hash of the (static) dataset
import hashlib

data = "..."  # the dataset to analyze, assumed already loaded as a string
dataset_hash = hashlib.md5(data.encode()).hexdigest()
response = client.chat.completions.create(
model="gpt-4-turbo-preview",
messages=[{
"role": "user",
"content": f"Analyze this data: {data}"
}],
extra_headers={
"X-Cache-Enabled": "true",
"X-Cache-Key": f"analysis:{dataset_hash}",
"X-Cache-TTL": "3600"
}
)
Cache Performance
Cached responses are typically returned in under 5ms, versus a full provider round trip for uncached requests; the cache analytics endpoint above reports cached and uncached latencies for your own traffic.
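To measure the difference on your own prompts, time the same request twice and check the X-Cache-Hit header. A short sketch reusing the client from the basic example:

import time

def timed_request(prompt):
    start = time.perf_counter()
    raw = client.chat.completions.with_raw_response.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        extra_headers={"X-Cache-Enabled": "true", "X-Cache-TTL": "3600"}
    )
    elapsed_ms = (time.perf_counter() - start) * 1000
    return raw.headers.get("X-Cache-Hit", "false"), elapsed_ms

# First call misses the cache; the repeat should hit it
for attempt in (1, 2):
    hit, ms = timed_request("What is the capital of France?")
    print(f"Attempt {attempt}: cache hit={hit}, latency={ms:.1f}ms")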
Best Practices
1. Cache Deterministic Prompts: use temperature=0 for consistent caching results.
2. Set Appropriate TTLs: use short TTLs for dynamic content and long TTLs for static content.
3. Monitor Cache Performance: track hit rates and adjust strategies accordingly.
4. Use Cache Tags: tag related cache entries for bulk invalidation (see the sketch below).
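The cache DELETE endpoint shown earlier accepts key, pattern, and prefix selectors. A tags selector for entries labeled with X-Cache-Tags (as in the code-generation example) is sketched below as an assumption modeled on those fields; check the cache API reference for the exact parameter name.

import requests

# Bulk invalidation by tag. The "tags" field is an assumption, by analogy
# with the "key", "pattern", and "prefix" fields documented above.
response = requests.delete(
    "https://api.parrotrouter.com/v1/cache",
    headers={"Authorization": "Bearer your-api-key"},
    json={"tags": ["python", "validation"]}
)
print(response.status_code)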