Images & PDFs
Analyze images, extract text from PDFs, and process visual content with AI
Multimodal Processing
ParrotRouter seamlessly routes image and document processing requests to vision-capable models, handling format conversion, optimization, and intelligent model selection automatically.
Image Analysis
Describe, analyze, and extract data from images
PDF Processing
Extract text, tables, and analyze documents
OCR & More
Read text from images and handwritten content
Image Processing
Send images to vision-capable models for analysis and understanding:
from openai import OpenAI
import base64
# Point the OpenAI SDK at the ParrotRouter API base URL.
client = OpenAI(
    api_key="your-api-key",  # placeholder value; substitute your own key
    base_url="https://api.parrotrouter.com/v1",
)
# Method 1: Base64 encoded image
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The file's bytes, base64-encoded and decoded to ``str`` so the
        result can be interpolated into a ``data:`` URL.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Binary mode: the raw bytes are encoded, never text-decoded.
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")
# Encode the local file and embed it in the request as a JPEG data URL.
base64_image = encode_image("product.jpg")
image_part = {
    "type": "image_url",
    "image_url": {
        "url": f"data:image/jpeg;base64,{base64_image}",
        "detail": "high",  # "low", "high", or "auto"
    },
}
question_part = {"type": "text", "text": "What's in this image?"}

response = client.chat.completions.create(
    model="gpt-4-vision-preview",  # Or use "auto:vision"
    messages=[{"role": "user", "content": [question_part, image_part]}],
)
# Method 2: Image URL
# The image is referenced by URL here rather than sent inline as base64.
chart_message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Describe this chart in detail"},
        {"type": "image_url", "image_url": {"url": "https://example.com/chart.png"}},
    ],
}
response = client.chat.completions.create(
    model="gpt-4-vision-preview",
    messages=[chart_message],
)

# Print the text of the first returned choice.
answer = response.choices[0].message.content
print(answer)
Advanced Image Analysis
Multiple Images
Analyze multiple images in a single request for comparison or sequential analysis.
# Compare multiple images
# Both designs go into a single user message, after the text instruction.
design_parts = [
    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded}"}}
    for encoded in (design1_base64, design2_base64)
]
comparison_prompt = {
    "type": "text",
    "text": "Compare these two designs and suggest improvements",
}

response = client.chat.completions.create(
    model="gpt-4-vision-preview",
    messages=[{"role": "user", "content": [comparison_prompt, *design_parts]}],
    extra_headers={
        "X-Image-Processing": "parallel",  # Process images in parallel
        "X-Max-Image-Size": "20MB",
    },
)
# Sequential image analysis
# Each turn is named so the conversation order is easy to follow:
# user (image 1) -> assistant reply -> user (image 2).
first_turn = {
    "role": "user",
    "content": [
        {"type": "text", "text": "I'll show you a series of images. Track the changes."},
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img1}"}},
    ],
}
assistant_reply = {
    "role": "assistant",
    "content": "I can see the first image shows...",
}
second_turn = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Now here's the second image"},
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img2}"}},
    ],
}
messages = [first_turn, assistant_reply, second_turn]

response = client.chat.completions.create(model="claude-3-opus", messages=messages)
Image + Text Context
Combine images with detailed context for better analysis.
# Medical image analysis with context
# The system message sets the assistant's role; the user message pairs the
# textual patient context with the X-ray image.
system_message = {
    "role": "system",
    "content": "You are a medical imaging assistant. Always note that you cannot provide diagnoses.",
}
patient_context = (
    "Patient info: 45-year-old male, chest X-ray\n"
    "Symptoms: Persistent cough for 2 weeks\n"
    "Please describe what you observe in the image."
)
xray_part = {
    "type": "image_url",
    "image_url": {
        "url": f"data:image/jpeg;base64,{xray_base64}",
        "detail": "high",
    },
}
user_message = {
    "role": "user",
    "content": [{"type": "text", "text": patient_context}, xray_part],
}

response = client.chat.completions.create(
    model="gpt-4-vision-preview",
    messages=[system_message, user_message],
    extra_headers={
        "X-Safety-Level": "medical",
        "X-Compliance-Mode": "HIPAA",
    },
)
Image Preprocessing
ParrotRouter can preprocess images for optimal model performance.
# Automatic image optimization
import json  # required by json.dumps below; this snippet never imported it

# The preprocessing options are serialised into one JSON string because they
# are passed as the value of a single request header.
preprocessing_options = {
    "enhance_contrast": True,
    "auto_rotate": True,
    "remove_noise": True,
    "optimize_for": "ocr",
}

response = client.chat.completions.create(
    model="auto:vision",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Extract all text from this receipt"},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{receipt_base64}"},
                },
            ],
        }
    ],
    extra_headers={"X-Image-Preprocessing": json.dumps(preprocessing_options)},
)
PDF Processing
Process PDF documents for text extraction, analysis, and understanding:
# Method 1: Upload PDF directly
# Read the PDF as raw bytes and base64-encode it. Only the read happens
# inside the ``with`` block, so the file is closed before the request is made.
with open("document.pdf", "rb") as pdf_file:
    pdf_base64 = base64.b64encode(pdf_file.read()).decode("utf-8")

response = client.chat.completions.create(
    model="gpt-4-vision-preview",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Summarize this PDF document"},
                {
                    "type": "document",
                    "document": {
                        "url": f"data:application/pdf;base64,{pdf_base64}",
                        "pages": "1-5",  # Optional: specify pages
                    },
                },
            ],
        }
    ],
    extra_headers={
        "X-PDF-Processing": "native",  # Use native PDF processing
        "X-Extract-Images": "true",  # Extract embedded images
        "X-Extract-Tables": "true",  # Extract tables as structured data
    },
)
# Method 2: Pre-convert PDF to images (for better compatibility)
import io  # io.BytesIO is the in-memory PNG buffer used below

import pdf2image

# Convert PDF pages to images.
# Only pages 1-5 are rendered: the loop below never looked past the first
# five, so rasterising the whole document at 300 DPI was wasted work.
pages = pdf2image.convert_from_path(
    "document.pdf", dpi=300, first_page=1, last_page=5
)

# Process each page
all_content = []
for page_number, page in enumerate(pages, start=1):
    # Convert PIL image to base64
    buffered = io.BytesIO()
    page.save(buffered, format="PNG")
    img_base64 = base64.b64encode(buffered.getvalue()).decode()

    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": f"Extract text from page {page_number}"},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{img_base64}"},
                    },
                ],
            }
        ],
    )
    all_content.append(response.choices[0].message.content)

# Combine results
full_text = "\n\n".join(all_content)
Use Cases
Document Data Extraction
Extract structured data from invoices, receipts, and forms.
# Extract invoice data
import json  # required by json.loads below; this snippet never imported it

# The prompt names the JSON keys explicitly: the code below indexes
# "invoice_number" and "total_amount", so the response must use those names.
invoice_prompt = (
    "Extract the following from this invoice:\n"
    "- Invoice number\n"
    "- Date\n"
    "- Total amount\n"
    "- Line items with quantities and prices\n"
    'Return as JSON with the keys "invoice_number", "date", '
    '"total_amount", and "line_items".'
)

response = client.chat.completions.create(
    model="gpt-4-vision-preview",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": invoice_prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{invoice_base64}"},
                },
            ],
        }
    ],
    response_format={"type": "json_object"},
)

# Parse the reply text as JSON and print two of the extracted fields.
invoice_data = json.loads(response.choices[0].message.content)
print(f"Invoice #{invoice_data['invoice_number']}")
print(f"Total: ${invoice_data['total_amount']}")
Visual QA System
Answer questions about images and diagrams.
# Technical diagram analysis
# Ask for an explanation of the circuit image referenced by circuit_diagram_url.
circuit_question = "Explain how this circuit works and identify all components"
circuit_message = {
    "role": "user",
    "content": [
        {"type": "text", "text": circuit_question},
        {"type": "image_url", "image_url": {"url": circuit_diagram_url}},
    ],
}

response = client.chat.completions.create(
    model="claude-3-opus",
    messages=[circuit_message],
    extra_headers={
        "X-Domain-Knowledge": "electronics",
        "X-Response-Detail": "technical",
    },
)
# Educational content
# Ask for quiz questions derived from the diagram at biology_diagram_url.
quiz_message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Create 5 quiz questions based on this diagram"},
        {"type": "image_url", "image_url": {"url": biology_diagram_url}},
    ],
}

response = client.chat.completions.create(
    model="gpt-4-vision-preview",
    messages=[quiz_message],
)
Content Moderation
Analyze images for inappropriate content or policy violations.
# Content moderation
# The moderation checklist is assembled from adjacent literals; the resulting
# prompt text is the same multi-line string.
moderation_prompt = (
    "Analyze this image for:\n"
    "1. Inappropriate content\n"
    "2. Violence or gore\n"
    "3. Personal information\n"
    "4. Copyright concerns\n"
    "Return safety scores for each category."
)
moderation_message = {
    "role": "user",
    "content": [
        {"type": "text", "text": moderation_prompt},
        {"type": "image_url", "image_url": {"url": user_uploaded_image}},
    ],
}

response = client.chat.completions.create(
    model="gpt-4-vision-preview",
    messages=[moderation_message],
    extra_headers={
        "X-Safety-Mode": "strict",
        "X-Moderation-Categories": "all",
    },
)
Accessibility Enhancement
Generate alt text and descriptions for accessibility.
# Generate alt text
# Request three accessibility artefacts for the image at website_image_url.
alt_text_prompt = (
    "Generate:\n"
    "1. A concise alt text (under 125 chars)\n"
    "2. A detailed description for screen readers\n"
    "3. Key visual elements list"
)
alt_text_message = {
    "role": "user",
    "content": [
        {"type": "text", "text": alt_text_prompt},
        {"type": "image_url", "image_url": {"url": website_image_url}},
    ],
}

response = client.chat.completions.create(
    model="gpt-4-vision-preview",
    messages=[alt_text_message],
    extra_headers={"X-Accessibility-Level": "WCAG-AA"},
)
Supported Formats
Image Formats
- JPEG: Photos, general images
- PNG: Screenshots, diagrams
- GIF: Static only (first frame)
- WebP: Modern web images
- BMP: Uncompressed images
- SVG: Vector graphics (rasterized)
Document Formats
- PDF: Native or image conversion
- TIFF: Multi-page documents
- HEIC: Apple photos (converted)
Vision Models
GPT-4 Vision
Best for general analysis
Claude 3 Opus
Excellent for detailed analysis
Claude 3 Sonnet
Balanced performance
Gemini Pro Vision
Fast and cost-effective
Best Practices
- 1. Optimize Image Size
Resize images to the necessary resolution before uploading
- 2. Use Appropriate Detail Level
Use "low" detail for quick analysis, "high" for precision
- 3. Provide Clear Instructions
Be specific about what you want analyzed in the image
- 4. Consider Privacy
Remove sensitive information before processing