How do I troubleshoot Claude API errors in web scraping applications?
When building web scraping applications with Claude API, you'll encounter various errors related to rate limits, token limits, authentication, and API responses. Understanding how to troubleshoot these errors efficiently is crucial for maintaining robust scraping workflows. This guide covers common Claude API errors, their causes, and practical solutions with code examples.
Common Claude API Error Types
1. Authentication Errors (401 Unauthorized)
Authentication errors occur when your API key is invalid, missing, or improperly formatted.
Python Example:
import anthropic
import os
def create_claude_client():
    try:
        api_key = os.environ.get('ANTHROPIC_API_KEY')
        if not api_key:
            raise ValueError("ANTHROPIC_API_KEY environment variable not set")
        # Note: constructing the client does not call the API; an invalid key
        # only surfaces as a 401 on the first request you make with it
        client = anthropic.Anthropic(api_key=api_key)
        return client
    except Exception as e:
        print(f"Authentication error: {e}")
        return None

client = create_claude_client()
if client:
    print("Claude client created successfully")
JavaScript Example:
const Anthropic = require('@anthropic-ai/sdk');
function createClaudeClient() {
  const apiKey = process.env.ANTHROPIC_API_KEY;
  if (!apiKey) {
    throw new Error('ANTHROPIC_API_KEY environment variable not set');
  }
  try {
    const client = new Anthropic({
      apiKey: apiKey,
    });
    return client;
  } catch (error) {
    console.error('Authentication error:', error.message);
    return null;
  }
}

const client = createClaudeClient();
Solutions:
- Verify your API key is correctly set in environment variables
- Check that the API key hasn't expired or been revoked
- Ensure no extra whitespace or special characters in the key (a quick format check is sketched below)
- Confirm you're using the correct API endpoint
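A quick sanity check on the key itself can rule out the whitespace and formatting issues above before you make any API calls. This is only a sketch: the 'sk-ant-' prefix check is an assumption based on the key format currently shown in the Anthropic console and may change.
import os

def check_api_key_format(api_key):
    """Return a list of likely formatting problems with an Anthropic API key."""
    problems = []
    if not api_key:
        problems.append("key is missing or empty")
        return problems
    if api_key != api_key.strip():
        problems.append("key has leading or trailing whitespace")
    # Assumption: current Anthropic keys begin with 'sk-ant-'; adjust if your console shows otherwise
    if not api_key.strip().startswith('sk-ant-'):
        problems.append("key does not start with the expected 'sk-ant-' prefix")
    return problems

issues = check_api_key_format(os.environ.get('ANTHROPIC_API_KEY'))
if issues:
    print(f"API key issues: {issues}")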
2. Rate Limit Errors (429 Too Many Requests)
Rate limiting is one of the most common issues when scraping at scale with Claude API.
Python with Exponential Backoff:
import os
import time
import random
from anthropic import Anthropic, RateLimitError

def scrape_with_retry(client, html_content, max_retries=5):
    """Scrape content with exponential backoff retry logic"""
    for attempt in range(max_retries):
        try:
            message = client.messages.create(
                model="claude-3-5-sonnet-20241022",
                max_tokens=1024,
                messages=[
                    {
                        "role": "user",
                        "content": f"Extract the product name, price, and description from this HTML:\n\n{html_content}"
                    }
                ]
            )
            return message.content[0].text
        except RateLimitError as e:
            if attempt == max_retries - 1:
                raise
            # Exponential backoff with jitter
            wait_time = (2 ** attempt) + random.uniform(0, 1)
            print(f"Rate limited. Waiting {wait_time:.2f} seconds before retry {attempt + 1}/{max_retries}")
            time.sleep(wait_time)
        except Exception as e:
            print(f"Unexpected error: {e}")
            raise

# Usage
client = Anthropic(api_key=os.environ.get('ANTHROPIC_API_KEY'))
result = scrape_with_retry(client, "<html>...</html>")
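When the 429 response includes a retry-after header, you can honor it instead of relying purely on exponential backoff. The helper below is a sketch, assuming your SDK version exposes the underlying HTTP response on the error object (recent anthropic-sdk-python releases do); it falls back to a default wait when the header is absent.
def wait_time_from_error(error, default_wait=5.0):
    """Prefer the server's retry-after hint over a fixed backoff (hypothetical helper)."""
    response = getattr(error, "response", None)
    retry_after = response.headers.get("retry-after") if response is not None else None
    try:
        return float(retry_after) if retry_after else default_wait
    except (TypeError, ValueError):
        return default_wait
You could call this inside the except RateLimitError block above in place of the computed wait_time.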
JavaScript with Rate Limiting Queue:
const Anthropic = require('@anthropic-ai/sdk');

class RateLimitedScraper {
  constructor(apiKey, requestsPerMinute = 50) {
    this.client = new Anthropic({ apiKey });
    this.queue = [];
    this.processing = false;
    this.requestsPerMinute = requestsPerMinute;
    this.interval = 60000 / requestsPerMinute; // ms between requests
  }

  async scrape(htmlContent) {
    return new Promise((resolve, reject) => {
      this.queue.push({ htmlContent, resolve, reject });
      this.processQueue();
    });
  }

  async processQueue() {
    if (this.processing || this.queue.length === 0) return;
    this.processing = true;
    const { htmlContent, resolve, reject } = this.queue.shift();
    try {
      const message = await this.client.messages.create({
        model: 'claude-3-5-sonnet-20241022',
        max_tokens: 1024,
        messages: [{
          role: 'user',
          content: `Extract data from this HTML:\n\n${htmlContent}`
        }]
      });
      resolve(message.content[0].text);
    } catch (error) {
      if (error.status === 429) {
        // Re-queue the request
        this.queue.unshift({ htmlContent, resolve, reject });
        await this.sleep(5000); // Wait 5 seconds on rate limit
      } else {
        reject(error);
      }
    }
    await this.sleep(this.interval);
    this.processing = false;
    this.processQueue();
  }

  sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}

// Usage (call from within an async function; top-level await is not available in CommonJS)
const scraper = new RateLimitedScraper(process.env.ANTHROPIC_API_KEY, 50);
const result = await scraper.scrape('<html>...</html>');
3. Token Limit Errors (400 Bad Request)
Claude models have context window limits (for example, 200,000 tokens for Claude 3.5 Sonnet). When scraping large HTML pages, you can easily exceed them.
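Before chunking, a rough size check can tell you whether a page is likely to fit at all. The sketch below uses the common approximation of about four characters per token for English text; it is only an estimate, not Claude's actual tokenization, and html_content is a placeholder for whatever page you fetched.
def estimate_tokens(text, chars_per_token=4):
    """Very rough token estimate (~4 characters per token for English text)."""
    return len(text) // chars_per_token

def fits_in_context(text, context_limit=200000, reserved_for_output=4096):
    """Check whether text likely fits, leaving headroom for the prompt and response."""
    return estimate_tokens(text) < (context_limit - reserved_for_output)

html_content = "<html>...</html>"  # placeholder for the fetched page
if not fits_in_context(html_content):
    print("Content likely exceeds the context window; chunk it before sending")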
Python - Content Chunking Strategy:
from anthropic import Anthropic
from bs4 import BeautifulSoup

def chunk_html(html_content, max_chars=100000):
    """Split HTML into manageable chunks at tag boundaries"""
    chunks = []
    current_chunk = ""
    # Simple chunking by tag boundaries
    parts = html_content.split('<')
    for i, part in enumerate(parts):
        piece = part if i == 0 else '<' + part  # only re-add '<' to pieces that followed one
        if current_chunk and len(current_chunk) + len(piece) > max_chars:
            chunks.append(current_chunk)
            current_chunk = piece
        else:
            current_chunk += piece
    if current_chunk:
        chunks.append(current_chunk)
    return chunks

def scrape_large_page(client, html_content):
    """Handle large HTML pages by chunking"""
    # Clean HTML first to reduce size
    soup = BeautifulSoup(html_content, 'html.parser')
    # Remove scripts, styles, and other non-content elements
    for element in soup(['script', 'style', 'meta', 'link']):
        element.decompose()
    cleaned_html = str(soup)
    # Check if chunking is needed
    if len(cleaned_html) < 100000:
        chunks = [cleaned_html]
    else:
        chunks = chunk_html(cleaned_html)
    results = []
    for i, chunk in enumerate(chunks):
        try:
            message = client.messages.create(
                model="claude-3-5-sonnet-20241022",
                max_tokens=1024,
                messages=[{
                    "role": "user",
                    "content": f"Extract data from this HTML (part {i+1}/{len(chunks)}):\n\n{chunk}"
                }]
            )
            results.append(message.content[0].text)
        except Exception as e:
            print(f"Error processing chunk {i+1}: {e}")
            continue
    return results
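Because scrape_large_page returns one result per chunk, you usually need a merge step. One option, sketched here on the assumption that the combined per-chunk outputs are far smaller than the original HTML, is a follow-up request asking Claude to consolidate them.
def merge_chunk_results(client, results):
    """Ask Claude to consolidate per-chunk extractions into one result (sketch)."""
    combined = "\n\n---\n\n".join(results)
    message = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=1024,
        messages=[{
            "role": "user",
            "content": f"These are partial extractions from chunks of the same page. "
                       f"Merge them into a single, deduplicated result:\n\n{combined}"
        }]
    )
    return message.content[0].text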
4. Timeout Errors
Network timeouts can occur when Claude takes too long to process complex extraction tasks.
Python with Timeout Handling:
from anthropic import Anthropic, APITimeoutError

def scrape_with_timeout(client, html_content, timeout=60):
    """Handle API timeouts gracefully"""
    try:
        message = client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=2048,
            timeout=timeout,  # Per-request timeout in seconds
            messages=[{
                "role": "user",
                "content": f"Extract product information:\n\n{html_content}"
            }]
        )
        return message.content[0].text
    except APITimeoutError:
        # Try with a simpler prompt or smaller content
        print("Request timed out, trying with reduced content...")
        simplified_html = html_content[:50000]  # Reduce content size
        message = client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=1024,
            timeout=timeout,
            messages=[{
                "role": "user",
                "content": f"Extract basic info:\n\n{simplified_html}"
            }]
        )
        return message.content[0].text
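For extractions that generate long responses, streaming can also help, because data starts arriving as soon as generation begins rather than after the full response is ready. A minimal sketch using the Python SDK's streaming helper, assuming a reasonably recent anthropic SDK version:
def scrape_streaming(client, html_content):
    """Stream the response to avoid long waits before the first byte (sketch)."""
    with client.messages.stream(
        model="claude-3-5-sonnet-20241022",
        max_tokens=2048,
        messages=[{
            "role": "user",
            "content": f"Extract product information:\n\n{html_content}"
        }]
    ) as stream:
        return stream.get_final_message().content[0].text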
Error Logging and Monitoring
Implement comprehensive error logging to identify patterns and issues quickly.
Python Logging Example:
import logging
from anthropic import Anthropic, APIError

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('claude_scraping.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

def scrape_with_logging(client, url, html_content):
    """Scrape with comprehensive error logging"""
    try:
        logger.info(f"Starting scraping for URL: {url}")
        message = client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=1024,
            messages=[{
                "role": "user",
                "content": f"Extract data from:\n\n{html_content[:1000]}"
            }]
        )
        logger.info(f"Successfully scraped URL: {url}")
        return message.content[0].text
    except APIError as e:
        logger.error(f"API Error for {url}: {getattr(e, 'status_code', 'N/A')} - {e.message}")
        logger.error(f"Error type: {type(e).__name__}")
        logger.error(f"Request ID: {getattr(e, 'request_id', 'N/A')}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error for {url}: {str(e)}", exc_info=True)
        raise
Best Practices for Error Prevention
1. Pre-validate Content
Before sending HTML to Claude, validate and clean it to prevent unnecessary errors.
def validate_html_for_claude(html_content, max_size=200000):
    """Validate HTML before sending to Claude"""
    errors = []
    if not html_content or len(html_content.strip()) == 0:
        errors.append("HTML content is empty")
        return errors
    if len(html_content) > max_size:
        errors.append(f"HTML exceeds {max_size} characters ({len(html_content)} chars)")
    # Check for valid HTML structure
    if '<html' not in html_content.lower() and '<body' not in html_content.lower():
        errors.append("HTML appears to be a fragment, not a complete page")
    return errors

# Usage
html = fetch_page(url)
validation_errors = validate_html_for_claude(html)
if validation_errors:
    print(f"Validation errors: {validation_errors}")
    # Handle errors or clean HTML
else:
    result = scrape_with_claude(client, html)
2. Implement Circuit Breaker Pattern
Prevent cascading failures when the API is experiencing issues.
from datetime import datetime, timedelta

class CircuitBreaker:
    def __init__(self, failure_threshold=5, timeout=60):
        self.failure_threshold = failure_threshold
        self.timeout = timeout
        self.failures = 0
        self.last_failure_time = None
        self.state = 'CLOSED'  # CLOSED, OPEN, HALF_OPEN

    def call(self, func, *args, **kwargs):
        if self.state == 'OPEN':
            if datetime.now() - self.last_failure_time > timedelta(seconds=self.timeout):
                self.state = 'HALF_OPEN'
            else:
                raise Exception("Circuit breaker is OPEN")
        try:
            result = func(*args, **kwargs)
            self.on_success()
            return result
        except Exception as e:
            self.on_failure()
            raise

    def on_success(self):
        self.failures = 0
        self.state = 'CLOSED'

    def on_failure(self):
        self.failures += 1
        self.last_failure_time = datetime.now()
        if self.failures >= self.failure_threshold:
            self.state = 'OPEN'

# Usage
breaker = CircuitBreaker(failure_threshold=3, timeout=60)
result = breaker.call(scrape_with_claude, client, html_content)
Integration with Web Scraping Tools
When combining Claude with browser automation tools, you need additional error handling strategies. For instance, when handling timeouts in Puppeteer, coordinate its navigation timeout with the Claude API request timeout so one does not silently mask the other.
Combined Puppeteer + Claude Error Handling:
const puppeteer = require('puppeteer');
const Anthropic = require('@anthropic-ai/sdk');

async function scrapeWithPuppeteerAndClaude(url) {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  try {
    // Navigate with timeout
    await page.goto(url, {
      waitUntil: 'networkidle2',
      timeout: 30000
    });
    const html = await page.content();
    // Process with Claude
    const client = new Anthropic({
      apiKey: process.env.ANTHROPIC_API_KEY
    });
    const message = await client.messages.create({
      model: 'claude-3-5-sonnet-20241022',
      max_tokens: 1024,
      messages: [{
        role: 'user',
        content: `Extract data from:\n\n${html.substring(0, 100000)}`
      }]
    });
    return JSON.parse(message.content[0].text);
  } catch (error) {
    if (error.name === 'TimeoutError') {
      console.error('Puppeteer timeout:', error);
    } else if (error.status === 429) {
      console.error('Claude rate limit:', error);
    } else {
      console.error('Unknown error:', error);
    }
    throw error;
  } finally {
    await browser.close();
  }
}
Similar to handling errors in Puppeteer, you should implement proper error boundaries and graceful degradation in your scraping pipeline.
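One concrete form of graceful degradation is a non-LLM fallback: if the Claude call fails, fall back to a crude selector-based pass so the pipeline still yields partial data instead of halting. A Python sketch, assuming BeautifulSoup and the scrape_with_claude helper used earlier:
from bs4 import BeautifulSoup

def extract_with_fallback(client, html_content):
    """Prefer Claude extraction, but degrade to basic parsing on failure."""
    try:
        return {"source": "claude", "data": scrape_with_claude(client, html_content)}
    except Exception as e:
        print(f"Claude extraction failed ({e}); falling back to basic parsing")
        soup = BeautifulSoup(html_content, 'html.parser')
        title = soup.title.get_text(strip=True) if soup.title else None
        headings = [h.get_text(strip=True) for h in soup.find_all(['h1', 'h2'])]
        return {"source": "fallback", "data": {"title": title, "headings": headings}}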
Debugging Tips
1. Use Response Headers
Claude API responses include identifiers and usage metadata useful for debugging, and the underlying HTTP response carries rate limit headers:
import os
import anthropic

client = anthropic.Anthropic(api_key=os.environ.get('ANTHROPIC_API_KEY'))
response = client.messages.create(
    model="claude-3-5-sonnet-20241022",
    max_tokens=1024,
    messages=[{"role": "user", "content": "Hello"}]
)
# Response metadata useful for debugging
print(f"Message ID: {response.id}")
print(f"Model: {response.model}")
print(f"Usage: {response.usage}")
2. Test with Minimal Examples
When encountering errors, isolate the problem with minimal test cases:
import os
from anthropic import Anthropic

def test_claude_connection():
    """Minimal test to verify Claude API is working"""
    try:
        client = Anthropic(api_key=os.environ.get('ANTHROPIC_API_KEY'))
        message = client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=10,
            messages=[{"role": "user", "content": "Hi"}]
        )
        print("✓ Claude API connection successful")
        return True
    except Exception as e:
        print(f"✗ Claude API connection failed: {e}")
        return False

# Run test before starting scraping
if test_claude_connection():
    start_scraping()
Conclusion
Troubleshooting Claude API errors in web scraping applications requires a multi-layered approach combining proper error handling, rate limiting, content validation, and comprehensive logging. By implementing the patterns and code examples shown in this guide, you can build robust scraping applications that gracefully handle errors and maintain high reliability even when processing large volumes of data.
Remember to always respect API rate limits, implement exponential backoff for retries, and monitor your application's error logs to identify patterns and optimize your scraping strategy over time.