Latency and Performance
Minimize response times with global edge deployment and intelligent optimization
Performance Overview
ParrotRouter uses multiple strategies to deliver the fastest possible AI responses, including global edge deployment, intelligent caching, and optimized routing.
- 50ms Average TTFT: time to first token across all providers
- Global Edge Network: 25+ edge locations worldwide
- 99.9% Uptime: high availability with automatic failover
Streaming Responses
Get tokens as they're generated for the fastest perceived performance:
from openai import OpenAI

client = OpenAI(
    base_url="https://api.parrotrouter.com/v1",
    api_key="your-api-key"
)

# Stream tokens as they arrive
stream = client.chat.completions.create(
    model="gpt-4-turbo-preview",
    messages=[{"role": "user", "content": "Write a story"}],
    stream=True,
    extra_headers={
        "X-Stream-Buffer-Size": "1",   # Send each token immediately
        "X-Stream-Timeout-Ms": "100"   # Max wait between tokens
    }
)

# Process tokens in real-time
for chunk in stream:
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
# Advanced streaming with metrics
import time

start_time = time.time()
first_token_time = None
tokens = []

# Create a fresh stream (the one above has already been consumed)
stream = client.chat.completions.create(
    model="gpt-4-turbo-preview",
    messages=[{"role": "user", "content": "Write a story"}],
    stream=True
)

for chunk in stream:
    if chunk.choices[0].delta.content:
        if first_token_time is None:
            first_token_time = time.time()
            ttft = (first_token_time - start_time) * 1000
            print(f"\nTime to first token: {ttft:.0f}ms")
        tokens.append(chunk.choices[0].delta.content)
        print(chunk.choices[0].delta.content, end="", flush=True)

total_time = (time.time() - start_time) * 1000
print(f"\nTotal time: {total_time:.0f}ms")
print(f"Tokens: {len(tokens)}")
print(f"Tokens/sec: {len(tokens) / (total_time / 1000):.1f}")
Edge Deployment
Route requests through the nearest edge location for minimal latency:
Global Edge Locations
Americas
- US East (Virginia) - 15ms
- US West (California) - 18ms
- Canada (Toronto) - 20ms
- Brazil (São Paulo) - 25ms
Europe
- EU West (Ireland) - 16ms
- EU Central (Frankfurt) - 14ms
- UK (London) - 15ms
- Nordic (Stockholm) - 18ms
Asia Pacific
- Singapore - 20ms
- Japan (Tokyo) - 22ms
- Australia (Sydney) - 24ms
- India (Mumbai) - 28ms
Other Regions
- Middle East (Dubai) - 30ms
- Africa (Cape Town) - 35ms
- China (Hong Kong) - 25ms
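The figures above are typical round-trip latencies; your actual numbers depend on your network. As a rough check, the sketch below times a few lightweight requests against the /v1/edge/nearest endpoint (shown in the next section) and reports the fastest sample:
# Rough sketch: measure your own round-trip latency to the nearest edge by
# timing a few lightweight requests. Results vary with network conditions.
import time
import requests

def measure_edge_latency(api_key: str, samples: int = 5) -> float:
    url = "https://api.parrotrouter.com/v1/edge/nearest"
    headers = {"Authorization": f"Bearer {api_key}"}
    timings = []
    for _ in range(samples):
        start = time.time()
        requests.get(url, headers=headers, timeout=10)
        timings.append((time.time() - start) * 1000)
    return min(timings)  # fastest sample approximates the network floor

print(f"Measured edge latency: {measure_edge_latency('your-api-key'):.0f}ms")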
Edge Routing Configuration
# Automatic edge routing (recommended)
response = client.chat.completions.create(
    model="gpt-4-turbo-preview",
    messages=[{"role": "user", "content": "Hello"}]
)
# Automatically routes through nearest edge

# Force specific region
response = client.chat.completions.create(
    model="gpt-4-turbo-preview",
    messages=[{"role": "user", "content": "Hello"}],
    extra_headers={
        "X-Edge-Location": "us-west-2",
        "X-Edge-Fallback": "us-east-1,eu-west-1"
    }
)

# Get edge location info
import requests

edge_info = requests.get(
    "https://api.parrotrouter.com/v1/edge/nearest",
    headers={"Authorization": "Bearer your-api-key"}
).json()

print(f"Nearest edge: {edge_info['location']}")
print(f"Latency: {edge_info['latency_ms']}ms")
print(f"Available models: {edge_info['cached_models']}")
Performance Optimization
Parallel Processing
Process multiple requests simultaneously for better throughput.
import asyncio
from openai import AsyncOpenAI

client = AsyncOpenAI(
    base_url="https://api.parrotrouter.com/v1",
    api_key="your-api-key"
)

async def process_batch(prompts):
    # Create tasks for parallel execution
    tasks = []
    for prompt in prompts:
        task = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            extra_headers={
                "X-Batch-Mode": "true",
                "X-Batch-Priority": "throughput"  # or "latency"
            }
        )
        tasks.append(task)
    # Execute all requests in parallel
    responses = await asyncio.gather(*tasks)
    return responses

# Process batch
prompts = ["Question 1", "Question 2", "Question 3"]
results = asyncio.run(process_batch(prompts))

# With connection pooling
async def optimized_batch(prompts):
    async with client:
        # Reuse connections for better performance
        responses = await asyncio.gather(*[
            client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": p}]
            ) for p in prompts
        ])
        return responses
Token Optimization
Reduce tokens for faster responses and lower costs.
import json

# Optimize prompt length
response = client.chat.completions.create(
    model="gpt-4-turbo-preview",
    messages=[{
        "role": "user",
        "content": "Summarize in 50 words: [long text]"
    }],
    max_tokens=100,  # Limit response length
    extra_headers={
        "X-Token-Budget": "500",       # Total token budget
        "X-Compression": "true",       # Enable prompt compression
        "X-Strip-Whitespace": "true"   # Remove extra spaces
    }
)

# Use prompt templates for efficiency
template_response = client.chat.completions.create(
    model="gpt-4-turbo-preview",
    messages=[{"role": "user", "content": "{task}"}],
    extra_headers={
        "X-Template-ID": "summary-template",
        "X-Template-Vars": json.dumps({
            "task": "Summarize this article"
        })
    }
)
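Counting tokens locally before sending a request also helps you stay inside a token budget. A minimal sketch using the tiktoken tokenizer (a separate library, not part of ParrotRouter; the 500-token budget below is illustrative):
# Sketch: count tokens locally with tiktoken before sending a prompt.
# Install with `pip install tiktoken`; the budget value is illustrative.
import tiktoken

def count_tokens(text: str, model: str = "gpt-4-turbo-preview") -> int:
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")  # reasonable fallback
    return len(encoding.encode(text))

prompt = "Summarize in 50 words: [long text]"
if count_tokens(prompt) > 500:
    print("Prompt exceeds the token budget; trim or compress it before sending")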
Connection Optimization
Optimize network connections for better performance.
# HTTP/2 with keep-alive
import httpx

# Create optimized client (HTTP/2 support requires `pip install httpx[http2]`)
http_client = httpx.Client(
    http2=True,
    limits=httpx.Limits(
        max_keepalive_connections=10,
        max_connections=100,
        keepalive_expiry=30
    )
)

client = OpenAI(
    base_url="https://api.parrotrouter.com/v1",
    api_key="your-api-key",
    http_client=http_client
)

# DNS prefetch and connection warming
response = client.chat.completions.create(
    model="gpt-4-turbo-preview",
    messages=[{"role": "user", "content": "Hello"}],
    extra_headers={
        "X-Connection-Warm": "true",
        "X-DNS-Prefetch": "true",
        "X-TCP-Nodelay": "true"  # Disable Nagle's algorithm
    }
)
Performance Monitoring
Track and analyze performance metrics in real-time:
# Get detailed performance metrics
response = client.chat.completions.create(
    model="gpt-4-turbo-preview",
    messages=[{"role": "user", "content": "Analyze this"}],
    extra_headers={
        "X-Include-Metrics": "true"
    }
)

# Access performance data
metrics = response.metrics
print(f"Time to first token: {metrics['ttft_ms']}ms")
print(f"Total latency: {metrics['total_latency_ms']}ms")
print(f"Tokens per second: {metrics['tokens_per_second']}")
print(f"Queue time: {metrics['queue_time_ms']}ms")
print(f"Model inference time: {metrics['inference_time_ms']}ms")
print(f"Edge location: {metrics['edge_location']}")
print(f"Provider: {metrics['provider']}")

# Real-time performance monitoring
import requests

# Get current performance stats
stats = requests.get(
    "https://api.parrotrouter.com/v1/performance/current",
    headers={"Authorization": "Bearer your-api-key"}
).json()

print(f"Current latency: {stats['current_latency_ms']}ms")
print(f"Request queue: {stats['queue_length']}")
print(f"Active models: {stats['active_models']}")

# Historical performance data
history = requests.get(
    "https://api.parrotrouter.com/v1/performance/history",
    headers={"Authorization": "Bearer your-api-key"},
    params={
        "period": "1h",
        "metric": "latency",
        "groupBy": "model"
    }
).json()

for model_stats in history['models']:
    print(f"\n{model_stats['model']}:")
    print(f"  Avg latency: {model_stats['avg_latency_ms']}ms")
    print(f"  P50: {model_stats['p50_latency_ms']}ms")
    print(f"  P95: {model_stats['p95_latency_ms']}ms")
    print(f"  P99: {model_stats['p99_latency_ms']}ms")
Latency Benchmarks
[Benchmark charts: Time to First Token (TTFT) and Tokens Per Second]
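Benchmark figures vary with region, provider load, and prompt length, so measure from your own environment. A minimal sketch that streams one request per model and records TTFT and tokens per second (the model names are examples; it reuses the client defined above):
# Sketch: benchmark TTFT and throughput per model via the streaming API.
# Model names are examples; one request per model gives only a rough signal.
import time

def benchmark(model: str, prompt: str = "Write one paragraph about latency.") -> dict:
    start = time.time()
    first_token = None
    token_count = 0
    stream = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        stream=True
    )
    for chunk in stream:
        if chunk.choices[0].delta.content:
            if first_token is None:
                first_token = time.time()
            token_count += 1
    total = time.time() - start
    return {
        "model": model,
        "ttft_ms": round((first_token - start) * 1000) if first_token else None,
        "tokens_per_sec": round(token_count / total, 1) if total > 0 else 0.0,
    }

for model in ["gpt-4-turbo-preview", "gpt-3.5-turbo"]:
    print(benchmark(model))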
Best Practices
1. Use Streaming for User-Facing Apps: stream responses to improve perceived performance.
2. Implement Client-Side Caching: cache frequent requests to eliminate network latency (see the sketch after this list).
3. Batch Similar Requests: process multiple requests together for better throughput.
4. Monitor Performance Metrics: track latency patterns to identify optimization opportunities.
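A minimal sketch of client-side caching for repeated, deterministic prompts (the hashing scheme and TTL are illustrative assumptions; it reuses the client defined above):
# Sketch: in-memory cache for repeated prompts; keys and TTL are illustrative.
# Only cache deterministic requests (temperature=0) so cached answers stay valid.
import hashlib
import json
import time

_cache = {}  # key -> (timestamp, content)
CACHE_TTL_SECONDS = 300

def cached_completion(model: str, messages: list) -> str:
    key = hashlib.sha256(json.dumps([model, messages], sort_keys=True).encode()).hexdigest()
    now = time.time()
    if key in _cache and now - _cache[key][0] < CACHE_TTL_SECONDS:
        return _cache[key][1]  # cache hit: no network round trip
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0
    )
    content = response.choices[0].message.content
    _cache[key] = (now, content)
    return content

print(cached_completion("gpt-3.5-turbo", [{"role": "user", "content": "What is TTFT?"}]))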