Understanding Token Limit Errors
Token limits are documented in OpenAI's error guide and Anthropic's error reference. Each model has a maximum context window that includes both input and output tokens.
- OpenAI error: "This model's maximum context length is 8192 tokens, however you requested 8342 tokens"
- Anthropic error: "context length exceeded. Maximum: 100,000 tokens"
- Token formula: Total = Prompt + Response + System
- Rule of thumb: 1 token ≈ 4 characters (English); see the estimate sketch below
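As a rough pre-flight check, the rule of thumb above can be turned into a character-based estimate. A minimal sketch: `estimate_tokens` and `fits_context` are illustrative helpers, the 8,192-token default is only an example, and a real tokenizer (see the tools below) should be used when accuracy matters.

```python
def estimate_tokens(text: str) -> int:
    """Rough estimate: ~4 characters per token for English text."""
    return len(text) // 4

def fits_context(prompt: str, system: str, max_response_tokens: int,
                 context_window: int = 8192) -> bool:
    """Apply Total = Prompt + Response + System against the context window."""
    total = estimate_tokens(prompt) + estimate_tokens(system) + max_response_tokens
    return total <= context_window

# A 20,000-character prompt is roughly 5,000 tokens; with a 500-token
# response budget it still fits an 8,192-token window.
print(fits_context("x" * 20_000, "You are a helpful assistant.", 500))  # True
```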
Current Model Context Windows
Context windows vary significantly. Check OpenAI's model list and Anthropic's models overview for the latest limits; a quick fit-check sketch follows the table below.
| Model | Context Window | ~Characters | Best For |
|---|---|---|---|
| GPT-4 Turbo | 128,000 tokens | ~512,000 chars | Long documents, code bases |
| GPT-4 | 8,192 tokens | ~32,000 chars | Standard conversations |
| GPT-3.5 Turbo | 4,096-16,384 tokens | ~16,000-65,000 chars | Quick tasks, chat |
| Claude 3 (all) | 200,000 tokens | ~800,000 chars | Books, long analysis |
| Gemini 1.5 Pro | 1,000,000 tokens | ~4,000,000 chars | Entire codebases |
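One practical use of the table is picking the smallest model whose window fits a request. The sketch below hardcodes a few limits from the table for illustration; `pick_model` is a hypothetical helper, and the numbers should be re-checked against the providers' model pages.

```python
# Illustrative context windows (tokens); verify against the providers' model pages.
CONTEXT_WINDOWS = {
    "gpt-3.5-turbo": 16_384,
    "gpt-4": 8_192,
    "gpt-4-turbo": 128_000,
    "claude-3-opus": 200_000,
}

def pick_model(input_tokens: int, response_budget: int = 1_000) -> str:
    """Return the smallest model whose context window fits input + response."""
    needed = input_tokens + response_budget
    for model, window in sorted(CONTEXT_WINDOWS.items(), key=lambda kv: kv[1]):
        if needed <= window:
            return model
    raise ValueError(f"No listed model fits {needed} tokens; chunk the input instead.")

print(pick_model(12_000))   # -> "gpt-3.5-turbo"
print(pick_model(150_000))  # -> "claude-3-opus"
```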
Token Counting Tools
Accurate token counting is essential. Use official tokenizers: OpenAI's tokenizer or the tiktoken library for programmatic use.
Python: Using tiktoken
```python
import tiktoken
from typing import List, Dict

class TokenCounter:
    """Accurate token counting for different models"""

    def __init__(self):
        # Model to encoding mapping
        self.model_encodings = {
            "gpt-4": "cl100k_base",
            "gpt-3.5-turbo": "cl100k_base",
            "text-davinci-003": "p50k_base",
        }
        self.encodings = {}

    def get_encoding(self, model: str):
        """Get the appropriate encoding for a model"""
        encoding_name = self.model_encodings.get(model, "cl100k_base")
        if encoding_name not in self.encodings:
            self.encodings[encoding_name] = tiktoken.get_encoding(encoding_name)
        return self.encodings[encoding_name]

    def count_tokens(self, text: str, model: str = "gpt-4") -> int:
        """Count tokens in text for specific model"""
        encoding = self.get_encoding(model)
        return len(encoding.encode(text))

    def count_messages_tokens(self, messages: List[Dict], model: str = "gpt-4") -> int:
        """Count tokens in chat messages format"""
        encoding = self.get_encoding(model)
        # Token overhead varies by model
        if model in ["gpt-3.5-turbo", "gpt-4"]:
            tokens_per_message = 3  # <|start|>role<|end|>
            tokens_per_name = 1
        else:
            tokens_per_message = 4
            tokens_per_name = -1
        num_tokens = 0
        for message in messages:
            num_tokens += tokens_per_message
            for key, value in message.items():
                num_tokens += len(encoding.encode(value))
                if key == "name":
                    num_tokens += tokens_per_name
        num_tokens += 3  # Assistant response priming
        return num_tokens

    def truncate_to_tokens(self, text: str, max_tokens: int, model: str = "gpt-4") -> str:
        """Truncate text to fit within token limit"""
        encoding = self.get_encoding(model)
        tokens = encoding.encode(text)
        if len(tokens) <= max_tokens:
            return text
        # Truncate and decode
        truncated_tokens = tokens[:max_tokens]
        return encoding.decode(truncated_tokens)

    def estimate_cost(self, input_tokens: int, output_tokens: int, model: str) -> float:
        """Estimate API cost based on token usage"""
        # Prices per 1K tokens (as of 2024)
        pricing = {
            "gpt-4-turbo": {"input": 0.01, "output": 0.03},
            "gpt-4": {"input": 0.03, "output": 0.06},
            "gpt-3.5-turbo": {"input": 0.0005, "output": 0.0015},
        }
        model_pricing = pricing.get(model, pricing["gpt-3.5-turbo"])
        input_cost = (input_tokens / 1000) * model_pricing["input"]
        output_cost = (output_tokens / 1000) * model_pricing["output"]
        return input_cost + output_cost

# Usage example
counter = TokenCounter()

# Count tokens in text
text = "Hello, how can I help you today?"
tokens = counter.count_tokens(text)
print(f"Token count: {tokens}")

# Count tokens in messages
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the capital of France?"}
]
message_tokens = counter.count_messages_tokens(messages)
print(f"Message tokens: {message_tokens}")

# Truncate text to fit
long_text = "Very long text..." * 1000
truncated = counter.truncate_to_tokens(long_text, max_tokens=1000)
print(f"Truncated to {counter.count_tokens(truncated)} tokens")
```
JavaScript/TypeScript: Token Counting
```typescript
// Using gpt-3-encoder for Node.js
// Note: gpt-3-encoder uses the GPT-3 BPE vocabulary, so counts for newer
// cl100k-based models (GPT-3.5/GPT-4) are approximations.
import { encode, decode } from 'gpt-3-encoder';

class TokenManager {
  // Approximate token limits by model
  private modelLimits: Record<string, number> = {
    'gpt-4': 8192,
    'gpt-4-32k': 32768,
    'gpt-4-turbo': 128000,
    'gpt-3.5-turbo': 4096,
    'gpt-3.5-turbo-16k': 16384,
    'claude-3-opus': 200000,
    'claude-3-sonnet': 200000,
  };

  countTokens(text: string): number {
    return encode(text).length;
  }

  countMessageTokens(messages: Array<{role: string, content: string}>): number {
    let totalTokens = 0;
    for (const message of messages) {
      // Add tokens for message structure
      totalTokens += 4; // <|im_start|>role<|im_end|>
      totalTokens += this.countTokens(message.role);
      totalTokens += this.countTokens(message.content);
    }
    totalTokens += 2; // <|im_start|>assistant
    return totalTokens;
  }

  truncateToTokenLimit(text: string, maxTokens: number): string {
    const tokens = encode(text);
    if (tokens.length <= maxTokens) {
      return text;
    }
    const truncatedTokens = tokens.slice(0, maxTokens);
    return decode(truncatedTokens);
  }

  canFitResponse(
    messages: Array<{role: string, content: string}>,
    model: string,
    desiredResponseTokens: number = 500
  ): boolean {
    const limit = this.modelLimits[model] || 4096;
    const messageTokens = this.countMessageTokens(messages);
    return messageTokens + desiredResponseTokens <= limit;
  }

  getSafeTokenLimit(model: string, responseBuffer: number = 500): number {
    const totalLimit = this.modelLimits[model] || 4096;
    return totalLimit - responseBuffer;
  }
}

// Usage
const tokenManager = new TokenManager();

// Check if content fits
const messages = [
  { role: 'user', content: 'Long text here...' }
];

if (!tokenManager.canFitResponse(messages, 'gpt-4')) {
  console.log('Content too long! Need to truncate or chunk.');
  // Get safe limit
  const safeLimit = tokenManager.getSafeTokenLimit('gpt-4');
  const truncated = tokenManager.truncateToTokenLimit(
    messages[0].content,
    safeLimit
  );
  messages[0].content = truncated;
}
```
Text Chunking Strategies
When text exceeds limits, use chunking strategies. LangChain's text splitters and other chunking guides cover various approaches.
Smart Text Chunking
```python
from typing import List, Dict, Optional
import re

class SmartTextChunker:
    """Intelligent text chunking that preserves context"""

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def chunk_by_tokens(
        self,
        text: str,
        max_tokens: int,
        overlap_tokens: int = 100
    ) -> List[str]:
        """Chunk text with token-based sliding window"""
        tokens = self.tokenizer.encode(text)
        chunks = []
        start = 0
        while start < len(tokens):
            # Take max_tokens from current position
            end = min(start + max_tokens, len(tokens))
            chunk_tokens = tokens[start:end]
            # Decode back to text
            chunk_text = self.tokenizer.decode(chunk_tokens)
            chunks.append(chunk_text)
            # Move forward with overlap
            start = end - overlap_tokens if end < len(tokens) else end
        return chunks

    def chunk_by_sentences(
        self,
        text: str,
        max_tokens: int,
        min_chunk_size: int = 100
    ) -> List[str]:
        """Chunk text preserving sentence boundaries"""
        # Split into sentences
        sentences = re.split(r'(?<=[.!?])\s+', text)
        chunks = []
        current_chunk = []
        current_tokens = 0
        for sentence in sentences:
            sentence_tokens = len(self.tokenizer.encode(sentence))
            # If single sentence exceeds limit, split it
            if sentence_tokens > max_tokens:
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                    current_chunk = []
                    current_tokens = 0
                # Split large sentence
                sub_chunks = self.chunk_by_tokens(sentence, max_tokens, 50)
                chunks.extend(sub_chunks)
            # If adding sentence exceeds limit, start new chunk
            elif current_tokens + sentence_tokens > max_tokens:
                if current_tokens >= min_chunk_size:
                    chunks.append(' '.join(current_chunk))
                    current_chunk = [sentence]
                    current_tokens = sentence_tokens
                else:
                    current_chunk.append(sentence)
                    current_tokens += sentence_tokens
            else:
                current_chunk.append(sentence)
                current_tokens += sentence_tokens
        # Add remaining chunk
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks

    def chunk_by_paragraphs(
        self,
        text: str,
        max_tokens: int
    ) -> List[str]:
        """Chunk text preserving paragraph structure"""
        paragraphs = text.split('\n\n')
        chunks = []
        current_chunk = []
        current_tokens = 0
        for para in paragraphs:
            para_tokens = len(self.tokenizer.encode(para))
            if current_tokens + para_tokens > max_tokens:
                if current_chunk:
                    chunks.append('\n\n'.join(current_chunk))
                    current_chunk = []
                    current_tokens = 0
                # If paragraph itself is too large
                if para_tokens > max_tokens:
                    para_chunks = self.chunk_by_sentences(para, max_tokens)
                    chunks.extend(para_chunks)
                else:
                    current_chunk = [para]
                    current_tokens = para_tokens
            else:
                current_chunk.append(para)
                current_tokens += para_tokens
        if current_chunk:
            chunks.append('\n\n'.join(current_chunk))
        return chunks

# Usage with tiktoken
import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")
chunker = SmartTextChunker(encoding)

long_document = "Your very long document text here..."

# Chunk by tokens with overlap
token_chunks = chunker.chunk_by_tokens(
    long_document,
    max_tokens=2000,
    overlap_tokens=200
)

# Chunk by sentences
sentence_chunks = chunker.chunk_by_sentences(
    long_document,
    max_tokens=2000
)
print(f"Created {len(sentence_chunks)} chunks")
```
Semantic Chunking with LangChain
```python
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    TokenTextSplitter,
    NLTKTextSplitter,
    SpacyTextSplitter
)

# 1. Token-based splitting
token_splitter = TokenTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    model_name="gpt-3.5-turbo"
)
chunks = token_splitter.split_text(long_text)

# 2. Recursive splitting (preferred for code)
recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,
    chunk_overlap=200,
    length_function=len,
    separators=["\n\n", "\n", " ", ""]
)
chunks = recursive_splitter.split_text(long_text)

# 3. Semantic splitting with NLTK
import nltk
nltk.download('punkt')

semantic_splitter = NLTKTextSplitter(
    chunk_size=1000,
    chunk_overlap=100
)
chunks = semantic_splitter.split_text(long_text)

# 4. For structured documents
from langchain.text_splitter import MarkdownTextSplitter

markdown_splitter = MarkdownTextSplitter(
    chunk_size=1000,
    chunk_overlap=100
)
# Preserves markdown structure
chunks = markdown_splitter.split_text(markdown_content)
```
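When chunks must remain traceable to their source (for citations or debugging), the same splitters can emit Document objects with metadata. A brief sketch assuming the langchain version used above; `report_text` and `faq_text` are placeholder variables.

```python
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
docs = splitter.create_documents(
    [report_text, faq_text],  # one string per source document (placeholders)
    metadatas=[{"source": "report.md"}, {"source": "faq.md"}]
)
for doc in docs[:3]:
    # Each chunk carries its source metadata for later attribution
    print(doc.metadata["source"], len(doc.page_content))
```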
Token Optimization Techniques
Reduce token usage without losing information. Techniques from Aider's token guide can significantly reduce costs.
Text Optimization Strategies
```python
import re
from typing import List

class TokenOptimizer:
    """Optimize text to use fewer tokens"""

    def __init__(self, encoding):
        self.encoding = encoding

    def remove_extra_whitespace(self, text: str) -> str:
        """Remove unnecessary whitespace"""
        # Collapse runs of spaces/tabs (keep newlines so the steps below still apply)
        text = re.sub(r'[ \t]+', ' ', text)
        # Remove trailing whitespace
        text = '\n'.join(line.strip() for line in text.split('\n'))
        # Remove multiple blank lines
        text = re.sub(r'\n\s*\n', '\n\n', text)
        return text.strip()

    def abbreviate_common_phrases(self, text: str) -> str:
        """Replace common phrases with abbreviations"""
        replacements = {
            "for example": "e.g.",
            "that is": "i.e.",
            "et cetera": "etc.",
            "versus": "vs.",
            "approximately": "~",
        }
        for full, abbr in replacements.items():
            text = text.replace(full, abbr)
            text = text.replace(full.capitalize(), abbr)
        return text

    def remove_redundant_content(self, text: str) -> str:
        """Remove redundant information"""
        lines = text.split('\n')
        seen = set()
        unique_lines = []
        for line in lines:
            # Skip duplicate lines
            line_hash = hash(line.strip().lower())
            if line_hash not in seen:
                seen.add(line_hash)
                unique_lines.append(line)
        return '\n'.join(unique_lines)

    def extract_key_content(self, text: str, keywords: List[str]) -> str:
        """Extract only paragraphs containing keywords"""
        paragraphs = text.split('\n\n')
        relevant_paras = []
        for para in paragraphs:
            para_lower = para.lower()
            if any(keyword.lower() in para_lower for keyword in keywords):
                relevant_paras.append(para)
        return '\n\n'.join(relevant_paras)

    def optimize_for_tokens(self, text: str, target_tokens: int) -> str:
        """Optimize text to fit within token limit"""
        # Step 1: Clean whitespace
        text = self.remove_extra_whitespace(text)
        current_tokens = len(self.encoding.encode(text))
        if current_tokens <= target_tokens:
            return text
        # Step 2: Abbreviate
        text = self.abbreviate_common_phrases(text)
        current_tokens = len(self.encoding.encode(text))
        if current_tokens <= target_tokens:
            return text
        # Step 3: Remove redundancy
        text = self.remove_redundant_content(text)
        current_tokens = len(self.encoding.encode(text))
        if current_tokens <= target_tokens:
            return text
        # Step 4: Truncate to fit
        tokens = self.encoding.encode(text)
        truncated_tokens = tokens[:target_tokens]
        return self.encoding.decode(truncated_tokens)

# Usage
import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")
optimizer = TokenOptimizer(encoding)

# Optimize document
optimized = optimizer.optimize_for_tokens(
    long_document,
    target_tokens=4000
)
print(f"Original tokens: {len(encoding.encode(long_document))}")
print(f"Optimized tokens: {len(encoding.encode(optimized))}")
```
Advanced Solutions
Hierarchical Summarization
```python
from openai import AsyncOpenAI

# Assumes the OpenAI Python SDK (v1+) async client; OPENAI_API_KEY must be set.
client = AsyncOpenAI()

async def hierarchical_summarization(
    document: str,
    chunk_size: int = 2000,
    model: str = "gpt-3.5-turbo"
) -> str:
    """Summarize long documents hierarchically"""
    # Step 1: Chunk document (chunk_by_tokens / count_tokens are the
    # token helpers from the chunking and counting sections above)
    chunks = chunk_by_tokens(document, chunk_size)
    # Step 2: Summarize each chunk
    chunk_summaries = []
    for i, chunk in enumerate(chunks):
        summary = await summarize_text(
            chunk,
            f"Summarize this section (part {i+1} of {len(chunks)})",
            model
        )
        chunk_summaries.append(summary)
    # Step 3: Combine summaries
    combined = "\n\n".join(chunk_summaries)
    # Step 4: If still too long, summarize again
    if count_tokens(combined) > chunk_size:
        final_summary = await summarize_text(
            combined,
            "Create a comprehensive summary of these section summaries",
            model
        )
        return final_summary
    return combined

async def summarize_text(text: str, instruction: str, model: str = "gpt-3.5-turbo") -> str:
    """Summarize text with specific instruction"""
    response = await client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a skilled summarizer."},
            {"role": "user", "content": f"{instruction}:\n\n{text}"}
        ],
        max_tokens=500
    )
    return response.choices[0].message.content
```
RAG-based Context Selection
```python
from typing import List, Tuple
import numpy as np

class RAGContextSelector:
    """Select relevant context using embeddings"""

    def __init__(self, embedding_model):
        self.embedding_model = embedding_model

    async def get_embedding(self, text: str) -> np.ndarray:
        """Embed text. Assumes the injected embedding_model exposes an async
        embed(text) method returning a vector; adapt to your embeddings client."""
        return np.array(await self.embedding_model.embed(text))

    async def select_relevant_chunks(
        self,
        query: str,
        chunks: List[str],
        max_tokens: int,
        top_k: int = 10
    ) -> List[str]:
        """Select most relevant chunks for query"""
        # Get embeddings
        query_embedding = await self.get_embedding(query)
        chunk_embeddings = [
            await self.get_embedding(chunk) for chunk in chunks
        ]
        # Calculate similarities
        similarities = [
            self.cosine_similarity(query_embedding, chunk_emb)
            for chunk_emb in chunk_embeddings
        ]
        # Sort by relevance
        chunk_scores = list(zip(chunks, similarities))
        chunk_scores.sort(key=lambda x: x[1], reverse=True)
        # Select top chunks within token limit
        selected_chunks = []
        total_tokens = 0
        for chunk, score in chunk_scores[:top_k]:
            chunk_tokens = count_tokens(chunk)  # token counter from the tiktoken section above
            if total_tokens + chunk_tokens <= max_tokens:
                selected_chunks.append(chunk)
                total_tokens += chunk_tokens
            else:
                break
        return selected_chunks

    def cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
        """Calculate cosine similarity between vectors"""
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# Usage
selector = RAGContextSelector(embedding_model)
relevant_chunks = await selector.select_relevant_chunks(
    query="How to handle authentication?",
    chunks=document_chunks,
    max_tokens=4000
)
```
Error Handling Implementation
```python
import logging
from typing import Dict, List

from openai import AsyncOpenAI, BadRequestError

class TokenLimitHandler:
    """Handle token limit errors gracefully"""

    def __init__(self, client: AsyncOpenAI):
        self.client = client
        self.logger = logging.getLogger(__name__)

    async def safe_completion(
        self,
        messages: List[Dict],
        model: str = "gpt-4",
        max_retries: int = 4  # 3 fallback strategies plus one final attempt
    ):
        """Make completion with automatic token limit handling"""
        for attempt in range(max_retries):
            try:
                response = await self.client.chat.completions.create(
                    model=model,
                    messages=messages
                )
                return response
            except BadRequestError as e:
                if "maximum context length" in str(e):
                    self.logger.warning(f"Token limit exceeded on attempt {attempt + 1}")
                    # Strategy 1: Switch to larger model
                    if model == "gpt-4" and attempt == 0:
                        model = "gpt-4-turbo"
                        self.logger.info("Switching to gpt-4-turbo")
                        continue
                    # Strategy 2: Reduce message history
                    if len(messages) > 2 and attempt == 1:
                        messages = [messages[0]] + messages[-2:]
                        self.logger.info("Reduced message history")
                        continue
                    # Strategy 3: Summarize content
                    if attempt == 2:
                        messages = await self.summarize_messages(messages)
                        self.logger.info("Summarized messages")
                        continue
                raise
        raise Exception("Failed to handle token limit after all retries")

    async def summarize_messages(self, messages: List[Dict]) -> List[Dict]:
        """Summarize messages to fit within limits"""
        # Keep system message
        system_msg = next((m for m in messages if m["role"] == "system"), None)
        # Combine and summarize user messages
        user_content = " ".join(
            m["content"] for m in messages if m["role"] == "user"
        )
        summary_response = await self.client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": f"Summarize concisely: {user_content[:2000]}"}
            ],
            max_tokens=500
        )
        summarized = summary_response.choices[0].message.content
        result = []
        if system_msg:
            result.append(system_msg)
        result.append({"role": "user", "content": summarized})
        return result
```
Best Practices
Do's
- ✓ Always count tokens before sending
- ✓ Implement chunking strategies
- ✓ Use appropriate models for content size
- ✓ Reserve tokens for response
- ✓ Cache token counts for efficiency
- ✓ Implement fallback strategies
- ✓ Monitor token usage for cost
Don'ts
- ✗ Don't assume character counts
- ✗ Don't ignore language differences
- ✗ Don't forget response tokens
- ✗ Don't chunk without overlap
- ✗ Don't lose context in chunks
- ✗ Don't truncate mid-sentence (see the sketch after these lists)
- ✗ Don't ignore token costs
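Two of the points above, caching token counts and never truncating mid-sentence, can be combined in one small helper. A minimal sketch assuming tiktoken's cl100k_base encoding; `cached_token_count` and `truncate_at_sentence` are illustrative names, not library functions.

```python
import re
from functools import lru_cache

import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")

@lru_cache(maxsize=4096)
def cached_token_count(text: str) -> int:
    """Cache counts so repeated checks on the same text cost nothing."""
    return len(encoding.encode(text))

def truncate_at_sentence(text: str, max_tokens: int) -> str:
    """Drop whole trailing sentences until the text fits, instead of cutting mid-sentence."""
    sentences = re.split(r'(?<=[.!?])\s+', text)
    while sentences and cached_token_count(' '.join(sentences)) > max_tokens:
        sentences.pop()
    if not sentences:
        # Even the first sentence was too long; fall back to hard token truncation
        return encoding.decode(encoding.encode(text)[:max_tokens])
    return ' '.join(sentences)
```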
Pro Tip: Token Budget Planning
Always reserve tokens for the response; a small budgeting sketch follows this list. A good rule of thumb:
- Short responses: Reserve 200-500 tokens
- Detailed answers: Reserve 1000-2000 tokens
- Code generation: Reserve 2000-4000 tokens
- Creative writing: Reserve 50% of context window
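These budgets can be wired into a small pre-flight helper. A minimal sketch; the categories and numbers simply mirror the rule of thumb above, and `max_prompt_tokens` is an illustrative name rather than a library function.

```python
# Response-token reservations per task type (mirrors the rule of thumb above).
RESPONSE_BUDGETS = {
    "short": 500,
    "detailed": 2_000,
    "code": 4_000,
}

def max_prompt_tokens(context_window: int, task: str = "detailed") -> int:
    """How many tokens the prompt may use once the response budget is reserved."""
    if task == "creative":
        return context_window // 2  # creative writing: reserve half the window
    return context_window - RESPONSE_BUDGETS.get(task, 2_000)

print(max_prompt_tokens(8_192, "short"))     # 7692
print(max_prompt_tokens(128_000, "code"))    # 124000
print(max_prompt_tokens(8_192, "creative"))  # 4096
```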
References
- [1] OpenAI. "Error Codes Reference" (2024)
- [2] Anthropic. "API Errors" (2024)
- [3] Stack Overflow. "OpenAI API Questions" (2024)