Common Timeout & Connection Errors
Network errors are discussed in detail in this Stack Overflow guide and Airbyte's production deployment guide.
ETIMEDOUT
Connection attempt timed out. Server didn't respond within the timeout period.
Error: connect ETIMEDOUT
ECONNREFUSED
Connection refused by server. Service may be down or firewall blocking.
Error: connect ECONNREFUSED
Socket Hang Up
Connection closed unexpectedly. Often due to server overload or network issues.
Error: socket hang up
Request Timeout
Request took too long to complete. Common with large prompts or responses.
TimeoutError: Request timeout
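How these errors surface depends on the client library. As a rough illustration (a minimal sketch assuming Python's requests; the helper name classify_error is hypothetical), they map onto distinct exception types:

import requests

def classify_error(url: str) -> str:
    """Map common network failures onto the error names above."""
    try:
        requests.get(url, timeout=(5, 30))  # (connect, read) timeouts
        return "ok"
    except requests.exceptions.ConnectTimeout:
        return "ETIMEDOUT: connection attempt timed out"
    except requests.exceptions.ConnectionError as e:
        if "Connection refused" in str(e):
            return "ECONNREFUSED: service down or firewall blocking"
        if "Connection aborted" in str(e):
            return "socket hang up: server closed the connection"
        return f"connection error: {e}"
    except requests.exceptions.ReadTimeout:
        return "request timeout: response took too long"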
Proper Timeout Configuration
Different LLM operations require different timeout settings. Implementation patterns are shown in this unified LLM API adapter and this comprehensive implementation guide.
Python: Comprehensive Timeout Setup
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


class TimeoutHTTPAdapter(HTTPAdapter):
    """Custom adapter with timeout support"""

    def __init__(self, timeout=None, *args, **kwargs):
        self.timeout = timeout
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        kwargs["timeout"] = kwargs.get("timeout") or self.timeout
        return super().send(request, **kwargs)


def create_resilient_session(
    connect_timeout=10,
    read_timeout=60,
    max_retries=3,
):
    """Create a session with proper timeout and retry configuration"""
    session = requests.Session()

    # Configure retry strategy
    retry_strategy = Retry(
        total=max_retries,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "PUT", "DELETE", "OPTIONS", "TRACE", "POST"],
        backoff_factor=1,  # Wait 1s, 2s, 4s between retries
        raise_on_status=False,
    )

    # Set up adapter with timeouts
    adapter = TimeoutHTTPAdapter(
        timeout=(connect_timeout, read_timeout),
        max_retries=retry_strategy,
    )
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session


# Usage for different scenarios
class LLMClient:
    def __init__(self):
        # Quick operations (list models, etc.)
        self.quick_session = create_resilient_session(
            connect_timeout=5,
            read_timeout=10,
            max_retries=2,
        )
        # Standard chat completions
        self.chat_session = create_resilient_session(
            connect_timeout=10,
            read_timeout=60,
            max_retries=3,
        )
        # Long operations (large context, complex tasks)
        self.long_session = create_resilient_session(
            connect_timeout=15,
            read_timeout=300,  # 5 minutes
            max_retries=2,
        )

    def chat_completion(self, messages, model="gpt-4", stream=False):
        """Make chat completion with an appropriate timeout"""
        # Rough token estimate based on input size (~1.3 tokens per word)
        total_tokens = sum(len(m["content"].split()) * 1.3 for m in messages)

        if stream:
            # Streaming needs longer read timeouts; override per request
            return self.chat_session.post(..., stream=True, timeout=(10, 600))
        elif total_tokens > 2000:
            # Use the long session for large inputs
            return self.long_session.post(...)
        else:
            # Standard timeout for normal requests
            return self.chat_session.post(...)


# Example usage
client = LLMClient()
try:
    response = client.chat_completion([
        {"role": "user", "content": "Hello, how are you?"}
    ])
except requests.Timeout:
    print("Request timed out. Try reducing prompt size or using streaming.")
TypeScript/Node.js: Advanced Timeout Handling
import axios, { AxiosInstance, AxiosError } from 'axios';
import { Agent } from 'https';

interface TimeoutConfig {
  connect: number;
  socket: number;
  response: number;
  total: number;
}

class ResilientAPIClient {
  private clients: Map<string, AxiosInstance> = new Map();

  constructor() {
    this.setupClients();
  }

  private setupClients(): void {
    // Quick operations client
    this.clients.set('quick', this.createClient({
      connect: 5000,
      socket: 10000,
      response: 10000,
      total: 15000,
    }));

    // Standard operations client
    this.clients.set('standard', this.createClient({
      connect: 10000,
      socket: 60000,
      response: 60000,
      total: 120000,
    }));

    // Long operations client
    this.clients.set('long', this.createClient({
      connect: 15000,
      socket: 300000,
      response: 300000,
      total: 600000,
    }));
  }

  private createClient(timeouts: TimeoutConfig): AxiosInstance {
    // Create HTTPS agent with keep-alive
    const httpsAgent = new Agent({
      keepAlive: true,
      keepAliveMsecs: 30000,
      timeout: timeouts.socket,
    });

    const client = axios.create({
      timeout: timeouts.total,
      httpsAgent,
      validateStatus: (status) => status < 500,
    });

    // Add request interceptor for timeout configuration
    client.interceptors.request.use((config) => {
      // Fresh abort signal per request; a signal created once at client
      // setup would fire after the first timeout period and abort every
      // subsequent request
      config.signal = AbortSignal.timeout(timeouts.total);
      config.timeout = timeouts.response;
      config.transitional = {
        clarifyTimeoutError: true,
      };
      return config;
    });

    // Add response interceptor for retry logic
    client.interceptors.response.use(
      (response) => response,
      async (error: AxiosError) => {
        if (error.code === 'ECONNABORTED' || error.code === 'ETIMEDOUT') {
          console.log('Timeout occurred:', {
            code: error.code,
            message: error.message,
            url: error.config?.url,
          });

          // Retry once for timeout errors
          const config = error.config as (typeof error.config & { _retry?: boolean });
          if (config && !config._retry) {
            config._retry = true;
            return client.request(config);
          }
        }
        if (error.code === 'ECONNREFUSED') {
          console.error('Connection refused. Check if the API is accessible.');
        }
        throw error;
      }
    );

    return client;
  }

  async makeRequest(
    url: string,
    data: any,
    options: {
      timeout?: 'quick' | 'standard' | 'long',
      streaming?: boolean
    } = {}
  ) {
    const clientType = options.timeout || 'standard';
    const client = this.clients.get(clientType)!;

    if (options.streaming) {
      // Special handling for streaming responses
      return this.makeStreamingRequest(url, data);
    }

    try {
      const response = await client.post(url, data);
      return response.data;
    } catch (error) {
      if (axios.isAxiosError(error) && error.code === 'ECONNABORTED') {
        throw new Error(
          `Request timeout after ${clientType} timeout period. ` +
          `Consider using streaming for large requests.`
        );
      }
      throw error;
    }
  }

  private async makeStreamingRequest(url: string, data: any) {
    const client = this.clients.get('long')!;
    const response = await client.post(url, data, {
      responseType: 'stream',
      timeout: 0, // No overall timeout for streaming
    });

    // Abort if no data arrives for 30 seconds. Throwing inside setInterval
    // would not propagate to the caller, so destroy the stream with an
    // error instead; it surfaces as an 'error' event on the stream.
    let lastActivity = Date.now();
    const inactivityTimeout = 30000; // 30 seconds
    const checkInactivity = setInterval(() => {
      if (Date.now() - lastActivity > inactivityTimeout) {
        clearInterval(checkInactivity);
        response.data.destroy(new Error('Stream timeout due to inactivity'));
      }
    }, 5000);

    response.data.on('data', () => {
      lastActivity = Date.now();
    });
    response.data.on('end', () => clearInterval(checkInactivity));
    response.data.on('error', () => clearInterval(checkInactivity));

    return response.data;
  }
}
// Usage
const client = new ResilientAPIClient();

// Quick operation
const models = await client.makeRequest(
  'https://api.openai.com/v1/models',
  null,
  { timeout: 'quick' }
);

// Long operation with large context
const completion = await client.makeRequest(
  'https://api.openai.com/v1/chat/completions',
  {
    model: 'gpt-4',
    messages: veryLongConversation,
  },
  { timeout: 'long' }
);

// Streaming for real-time responses
const stream = await client.makeRequest(
  'https://api.openai.com/v1/chat/completions',
  {
    model: 'gpt-4',
    messages: messages,
    stream: true,
  },
  { streaming: true }
);
Go: Robust Timeout Implementation
package main

import (
    "context"
    "errors"
    "fmt"
    "io"
    "log"
    "net"
    "net/http"
    "time"
)

type TimeoutConfig struct {
    DialTimeout     time.Duration
    TLSTimeout      time.Duration
    ResponseTimeout time.Duration
    KeepAlive       time.Duration
    IdleConnTimeout time.Duration
}

func NewResilientClient(config TimeoutConfig) *http.Client {
    // Custom transport with timeout configurations
    transport := &http.Transport{
        DialContext: (&net.Dialer{
            Timeout:   config.DialTimeout,
            KeepAlive: config.KeepAlive,
        }).DialContext,
        TLSHandshakeTimeout:   config.TLSTimeout,
        ResponseHeaderTimeout: config.ResponseTimeout,
        IdleConnTimeout:       config.IdleConnTimeout,
        MaxIdleConns:          100,
        MaxIdleConnsPerHost:   10,
    }
    return &http.Client{
        Transport: transport,
        Timeout:   config.ResponseTimeout,
    }
}

// MakeRequestWithTimeout wraps a request with a context timeout.
func MakeRequestWithTimeout(
    client *http.Client,
    req *http.Request,
    timeout time.Duration,
) (*http.Response, error) {
    // Create a context with timeout; client.Do honors it
    ctx, cancel := context.WithTimeout(req.Context(), timeout)

    // Channel for the response
    type result struct {
        resp *http.Response
        err  error
    }
    resultChan := make(chan result, 1)

    // Make the request in a goroutine
    go func() {
        resp, err := client.Do(req.WithContext(ctx))
        resultChan <- result{resp, err}
    }()

    // Wait for the response or the timeout
    select {
    case <-ctx.Done():
        cancel()
        return nil, fmt.Errorf("request timeout after %v: %w", timeout, context.DeadlineExceeded)
    case res := <-resultChan:
        if res.err != nil {
            cancel()
            return nil, res.err
        }
        // cancel is deliberately not called on success: cancelling here
        // would abort the body before the caller reads it; the context's
        // timer frees itself once the deadline passes
        return res.resp, nil
    }
}

// RequestWithRetry retries timeouts and network errors with exponential backoff.
func RequestWithRetry(
    client *http.Client,
    req *http.Request,
    maxRetries int,
) (*http.Response, error) {
    var lastErr error

    for attempt := 0; attempt <= maxRetries; attempt++ {
        // Clone the request for this attempt
        reqCopy := req.Clone(req.Context())
        if req.GetBody != nil {
            // Clone shares the original body reader, which is consumed on
            // the first attempt; rewind it via GetBody for each retry
            bodyCopy, err := req.GetBody()
            if err != nil {
                return nil, err
            }
            reqCopy.Body = bodyCopy
        }

        // Increase the timeout with each attempt
        timeout := time.Duration(30+attempt*30) * time.Second

        resp, err := MakeRequestWithTimeout(client, reqCopy, timeout)
        if err == nil {
            if resp.StatusCode < 500 {
                return resp, nil
            }
            // Close server-error responses before retrying to avoid
            // leaking connections
            resp.Body.Close()
            lastErr = fmt.Errorf("server error: status %d", resp.StatusCode)
        } else {
            lastErr = err
            // Check whether the error is retryable
            if errors.Is(err, context.DeadlineExceeded) {
                fmt.Printf("Timeout on attempt %d, retrying...\n", attempt+1)
            } else if netErr, ok := err.(net.Error); ok && netErr.Timeout() {
                fmt.Printf("Timeout on attempt %d, retrying...\n", attempt+1)
            } else if opErr, ok := err.(*net.OpError); ok {
                fmt.Printf("Network error: %v, retrying...\n", opErr)
            } else {
                // Non-retryable error
                return nil, err
            }
        }

        // Exponential backoff
        if attempt < maxRetries {
            backoff := time.Duration(attempt+1) * time.Second
            time.Sleep(backoff)
        }
    }
    return nil, fmt.Errorf("max retries exceeded: %w", lastErr)
}

// Usage example
func main() {
    // Configure different clients for different use cases

    // Quick operations (list models, etc.)
    quickClient := NewResilientClient(TimeoutConfig{
        DialTimeout:     5 * time.Second,
        TLSTimeout:      5 * time.Second,
        ResponseTimeout: 10 * time.Second,
        KeepAlive:       30 * time.Second,
        IdleConnTimeout: 90 * time.Second,
    })
    _ = quickClient // used for short requests such as listing models

    // Long operations (large prompts)
    longClient := NewResilientClient(TimeoutConfig{
        DialTimeout:     10 * time.Second,
        TLSTimeout:      10 * time.Second,
        ResponseTimeout: 5 * time.Minute,
        KeepAlive:       30 * time.Second,
        IdleConnTimeout: 90 * time.Second,
    })

    // Make a request with retry; body holds the JSON-encoded payload
    var body io.Reader
    req, _ := http.NewRequest("POST", "https://api.openai.com/v1/chat/completions", body)
    resp, err := RequestWithRetry(longClient, req, 3)
    if err != nil {
        log.Fatalf("Request failed: %v", err)
    }
    defer resp.Body.Close()
}
Connection Pooling & Keep-Alive
Reuse connections to avoid handshake overhead:
Python: Connection Pool Configuration
import ssl

import requests
from requests.adapters import HTTPAdapter


class SSLAdapter(HTTPAdapter):
    """Custom adapter with connection pooling and SSL configuration"""

    def init_poolmanager(self, *args, **kwargs):
        # Pool size and blocking behavior come from the adapter's
        # pool_connections/pool_maxsize/pool_block arguments below;
        # setting maxsize or block here as well would pass duplicate
        # keyword arguments to the base implementation
        kwargs["retries"] = 3

        # SSL configuration
        ssl_context = ssl.create_default_context()
        ssl_context.check_hostname = True
        ssl_context.verify_mode = ssl.CERT_REQUIRED
        kwargs["ssl_context"] = ssl_context

        return super().init_poolmanager(*args, **kwargs)


# Global session with connection pooling
session = requests.Session()

# Configure adapters
adapter = SSLAdapter(
    pool_connections=10,  # Number of connection pools
    pool_maxsize=50,      # Connections per pool
    max_retries=3,
)
session.mount("http://", adapter)
session.mount("https://", adapter)

# Keep-alive headers
session.headers.update({
    "Connection": "keep-alive",
    "Keep-Alive": "timeout=30, max=100",
})
Node.js: HTTP Agent Configuration
const https = require('https');
const http = require('http');

// Configure agents with connection pooling
const httpsAgent = new https.Agent({
  keepAlive: true,
  keepAliveMsecs: 30000,
  maxSockets: 50,       // Total sockets
  maxFreeSockets: 10,   // Max idle sockets
  timeout: 60000,       // Socket timeout
  scheduling: 'fifo',   // Fair scheduling
});

const httpAgent = new http.Agent({
  keepAlive: true,
  keepAliveMsecs: 30000,
  maxSockets: 50,
  maxFreeSockets: 10,
  timeout: 60000,
});

// Axios configuration
const axiosConfig = {
  httpAgent: httpAgent,
  httpsAgent: httpsAgent,
  timeout: 30000,
  maxRedirects: 5,
  decompress: true,
};

// Monitor the connection pool
setInterval(() => {
  console.log('HTTPS Pool Status:', {
    sockets: Object.keys(httpsAgent.sockets).length,
    freeSockets: Object.keys(httpsAgent.freeSockets).length,
    requests: Object.keys(httpsAgent.requests).length,
  });
}, 30000);
Handling Specific Network Errors
ETIMEDOUT - Connection Timeout
import errno
import functools
import socket
import time

import requests


def handle_timeout_error(func):
    """Decorator to handle timeout errors gracefully"""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        max_attempts = 3
        base_timeout = 10
        for attempt in range(max_attempts):
            try:
                # Increase the timeout with each attempt
                kwargs["timeout"] = base_timeout * (attempt + 1)
                return func(*args, **kwargs)
            except (socket.timeout, requests.Timeout):
                print(f"Socket timeout on attempt {attempt + 1}")
                if attempt == max_attempts - 1:
                    raise
                time.sleep(2 ** attempt)  # Exponential backoff
            except OSError as e:
                if e.errno == errno.ETIMEDOUT:
                    print(f"Connection timeout on attempt {attempt + 1}")
                    if attempt == max_attempts - 1:
                        raise
                    time.sleep(2 ** attempt)
                else:
                    raise
    return wrapper


@handle_timeout_error
def make_api_call(url, data, timeout=30):
    return requests.post(url, json=data, timeout=timeout)
ECONNREFUSED - Connection Refused
async function handleConnectionRefused(
  url: string,
  options: RequestInit,
  maxRetries = 3
): Promise<Response> {
  const delays = [1000, 5000, 10000]; // Increasing delays

  for (let attempt = 0; attempt < maxRetries; attempt++) {
    try {
      const response = await fetch(url, options);
      return response;
    } catch (error: any) {
      // Node's fetch wraps socket errors; the code lives on error.cause
      const code = error.cause?.code ?? error.code;
      if (code === 'ECONNREFUSED') {
        console.error(`Connection refused (attempt ${attempt + 1}/${maxRetries})`);

        // Check alternative endpoints (the original URL just failed)
        const alternativeUrls = [
          url.replace('https://', 'http://'), // Try HTTP
          url.replace(':443', ':80'),         // Try a different port
        ];
        for (const altUrl of alternativeUrls) {
          try {
            const response = await fetch(altUrl, options);
            console.log(`Success with alternative URL: ${altUrl}`);
            return response;
          } catch (altError) {
            continue;
          }
        }

        if (attempt < maxRetries - 1) {
          await new Promise(resolve => setTimeout(resolve, delays[attempt]));
        }
      } else {
        throw error;
      }
    }
  }
  throw new Error('Connection refused after all retries');
}
Socket Hang Up - Unexpected Disconnection
import json
import time

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


class SocketHangupHandler:
    """Handle socket hang up errors with intelligent retry"""

    def __init__(self):
        self.session = requests.Session()
        self.configure_session()

    def configure_session(self):
        # Disable connection pooling for problematic endpoints
        self.session.mount('https://', HTTPAdapter(
            pool_connections=1,
            pool_maxsize=1,
            max_retries=Retry(
                total=3,
                backoff_factor=1,
                status_forcelist=[500, 502, 503, 504],
                allowed_methods=["GET", "POST"],
                raise_on_status=False,
            ),
        ))

    def make_request_with_recovery(self, url, data, chunk_size=1000, retries=3):
        """Make request with socket hang up recovery"""
        if retries <= 0:
            raise RuntimeError("Exhausted retries recovering from socket hang up")
        try:
            response = self.session.post(url, json=data, stream=True)
            # Read the response in chunks to detect hang ups early
            content = b""
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:
                    content += chunk
            return json.loads(content)
        except requests.exceptions.ChunkedEncodingError:
            print("Socket hung up during response. Retrying with smaller chunk size...")
            return self.make_request_with_recovery(url, data, chunk_size=500, retries=retries - 1)
        except requests.exceptions.ConnectionError as e:
            if "Connection aborted" in str(e):
                print("Connection aborted. Waiting before retry...")
                time.sleep(5)
                # Create a new session to reset the connection
                self.session = requests.Session()
                self.configure_session()
                return self.make_request_with_recovery(url, data, retries=retries - 1)
            raise
Handling Streaming Timeouts
Streaming responses need special timeout handling:
import asyncio
from typing import AsyncIterator

import aiohttp


async def stream_with_timeout(
    url: str,
    headers: dict,
    data: dict,
    chunk_timeout: float = 30.0,
    total_timeout: float = 300.0,
) -> AsyncIterator[str]:
    """Stream with per-chunk and total timeouts"""
    timeout = aiohttp.ClientTimeout(
        total=total_timeout,
        connect=10,
        sock_connect=10,
        sock_read=chunk_timeout,  # aiohttp enforces the per-chunk timeout
    )
    async with aiohttp.ClientSession(timeout=timeout) as session:
        try:
            async with session.post(url, headers=headers, json=data) as response:
                response.raise_for_status()
                last_chunk_time = asyncio.get_running_loop().time()
                async for chunk in response.content.iter_chunked(1024):
                    current_time = asyncio.get_running_loop().time()
                    # Belt-and-braces inactivity check on top of sock_read
                    if current_time - last_chunk_time > chunk_timeout:
                        raise asyncio.TimeoutError("Stream inactive for too long")
                    last_chunk_time = current_time
                    yield chunk.decode('utf-8')
        except asyncio.TimeoutError:
            print("Stream timeout detected")
            raise
        except aiohttp.ClientError as e:
            print(f"Stream error: {e}")
            raise


# Usage with timeout handling
async def process_stream():
    try:
        async for chunk in stream_with_timeout(
            "https://api.openai.com/v1/chat/completions",
            headers={"Authorization": "Bearer YOUR_KEY"},
            data={
                "model": "gpt-4",
                "messages": messages,
                "stream": True,
            },
        ):
            print(chunk, end='', flush=True)
    except asyncio.TimeoutError:
        print("\nStream timed out. Response may be incomplete.")
    except Exception as e:
        print(f"\nStream error: {e}")
Network Monitoring & Diagnostics
import time
import statistics
from collections import deque
from dataclasses import dataclass
from typing import Optional

import requests


@dataclass
class NetworkMetrics:
    timestamp: float
    duration: float
    status_code: int
    error: Optional[str] = None
    timeout: bool = False


class NetworkMonitor:
    """Monitor network performance and issues"""

    def __init__(self, window_size: int = 100):
        self.metrics: deque = deque(maxlen=window_size)
        self.timeout_count = 0  # cumulative totals across all requests
        self.error_count = 0

    def record_request(self, start_time: float, response=None, error=None):
        """Record metrics for a request"""
        duration = time.time() - start_time
        metric = NetworkMetrics(
            timestamp=start_time,
            duration=duration,
            status_code=response.status_code if response else 0,
            error=str(error) if error else None,
            timeout='timeout' in str(error).lower() if error else False,
        )
        self.metrics.append(metric)
        if metric.timeout:
            self.timeout_count += 1
        if metric.error:
            self.error_count += 1

    def get_statistics(self) -> dict:
        """Get network performance statistics over the rolling window"""
        if not self.metrics:
            return {}

        # Compute rates over the window rather than the cumulative
        # counters, since old metrics roll off the deque
        errors = sum(1 for m in self.metrics if m.error)
        timeouts = sum(1 for m in self.metrics if m.timeout)

        durations = [m.duration for m in self.metrics if not m.error]
        if not durations:
            return {"error_rate": 1.0}
        if len(durations) < 2:
            # statistics.quantiles needs at least two data points
            return {
                "avg_duration": durations[0],
                "error_rate": errors / len(self.metrics),
                "timeout_rate": timeouts / len(self.metrics),
                "total_requests": len(self.metrics),
            }

        return {
            "avg_duration": statistics.mean(durations),
            "median_duration": statistics.median(durations),
            "p95_duration": statistics.quantiles(durations, n=20)[18],
            "p99_duration": statistics.quantiles(durations, n=100)[98],
            "error_rate": errors / len(self.metrics),
            "timeout_rate": timeouts / len(self.metrics),
            "total_requests": len(self.metrics),
        }

    def should_circuit_break(self) -> bool:
        """Determine whether the circuit breaker should activate"""
        stats = self.get_statistics()
        # Circuit break if error rate > 50% or timeout rate > 30%
        return (
            stats.get("error_rate", 0) > 0.5 or
            stats.get("timeout_rate", 0) > 0.3
        )


# Usage
monitor = NetworkMonitor()


def monitored_request(url, data):
    start_time = time.time()
    try:
        response = requests.post(url, json=data, timeout=30)
        monitor.record_request(start_time, response=response)
        return response
    except Exception as e:
        monitor.record_request(start_time, error=e)
        # Check whether we should stop making requests
        if monitor.should_circuit_break():
            raise Exception("Circuit breaker activated due to high error rate")
        raise


# Periodic monitoring
def print_network_stats():
    stats = monitor.get_statistics()
    print(f"""
Network Statistics:
- Average response time: {stats.get('avg_duration', 0):.2f}s
- P95 response time: {stats.get('p95_duration', 0):.2f}s
- Error rate: {stats.get('error_rate', 0):.1%}
- Timeout rate: {stats.get('timeout_rate', 0):.1%}
""")
Best Practices
Do's
- ✓ Set appropriate timeouts for different operations
- ✓ Use connection pooling and keep-alive
- ✓ Implement exponential backoff for retries (see the sketch after these lists)
- ✓ Monitor network performance metrics
- ✓ Handle streaming timeouts separately
- ✓ Use circuit breakers for failing services
- ✓ Log timeout errors with context
Don'ts
- ✗ Don't use infinite timeouts
- ✗ Don't retry immediately after failure
- ✗ Don't ignore connection pool limits
- ✗ Don't use same timeout for all operations
- ✗ Don't create new connections for each request
- ✗ Don't ignore network error patterns
- ✗ Don't set timeouts too low
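Several of these rules compose naturally. The sketch below is a minimal illustration rather than a library API: post_with_backoff is a hypothetical helper that retries with exponential backoff plus jitter (never immediately), caps the number of attempts, and uses distinct connect/read timeouts instead of an infinite one.

import random
import time

import requests


def post_with_backoff(url, payload, max_retries=3, base_delay=1.0):
    """Retry on timeouts and connection errors with backoff plus jitter."""
    for attempt in range(max_retries + 1):
        try:
            # Distinct connect/read timeouts; never an infinite timeout
            resp = requests.post(url, json=payload, timeout=(10, 60))
            if resp.status_code < 500:
                return resp
        except (requests.Timeout, requests.ConnectionError):
            if attempt == max_retries:
                raise
        if attempt < max_retries:
            # Exponential backoff with jitter to avoid synchronized retries
            delay = base_delay * (2 ** attempt) + random.uniform(0, 0.5)
            time.sleep(delay)
    raise RuntimeError("Max retries exceeded")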
Troubleshooting Guide
Quick Diagnosis Steps
- 1. Test basic connectivity
# Test DNS resolution
nslookup api.openai.com

# Test TCP connection
telnet api.openai.com 443

# Test with curl
curl -v --connect-timeout 5 https://api.openai.com/v1/models
- 2. Check firewall/proxy settings
# Check proxy settings
echo $HTTP_PROXY $HTTPS_PROXY

# Test without proxy
unset HTTP_PROXY HTTPS_PROXY
curl https://api.openai.com/v1/models
- 3. Verify SSL/TLS
# Check SSL certificate
openssl s_client -connect api.openai.com:443 -servername api.openai.com
- 4. Monitor network latency
# Check latency
ping -c 10 api.openai.com

# Trace route
traceroute api.openai.com
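The same checks can also be scripted. A minimal Python sketch of steps 1, 3, and 4 (the diagnose helper is hypothetical and uses only the standard library):

import socket
import ssl
import time


def diagnose(host: str = "api.openai.com", port: int = 443) -> None:
    """Run basic DNS, TCP, and TLS connectivity checks."""
    # 1. DNS resolution
    try:
        addr = socket.gethostbyname(host)
        print(f"DNS OK: {host} -> {addr}")
    except socket.gaierror as e:
        print(f"DNS failed: {e}")
        return

    # 2. TCP connection with a 5-second timeout, measuring latency
    start = time.time()
    try:
        with socket.create_connection((host, port), timeout=5) as sock:
            print(f"TCP OK in {time.time() - start:.2f}s")
            # 3. TLS handshake and certificate verification
            ctx = ssl.create_default_context()
            with ctx.wrap_socket(sock, server_hostname=host) as tls:
                print(f"TLS OK: {tls.version()}, cipher: {tls.cipher()[0]}")
    except (socket.timeout, ssl.SSLError, OSError) as e:
        print(f"Connection failed: {e}")


diagnose()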
References
- [1] OpenAI. "Error Codes Reference" (2024)
- [2] Anthropic. "API Errors" (2024)
- [3] Stack Overflow. "OpenAI API Questions" (2024)