How Do I Handle Web Scraping Errors with the Claude API?
Error handling is crucial when building web scraping workflows with Claude API. Since web scraping involves unpredictable network conditions, dynamic content, and API rate limits, implementing robust error handling ensures your scraping operations remain reliable and maintainable.
Understanding Common Claude API Errors
When using Claude API for web scraping tasks, you'll encounter several types of errors (a short sketch after these lists shows how the API-specific ones surface as Python SDK exceptions):
1. API-Specific Errors
- Rate Limit Errors (429): Too many requests sent in a given time window
- Invalid API Key (401): Authentication failed
- Token Limit Exceeded (400): Request exceeds maximum context window
- Server Errors (500-503): Temporary Claude API outages
2. Web Scraping Errors
- Network Timeouts: Website doesn't respond in time
- Content Parsing Failures: Claude cannot extract structured data from malformed HTML
- Empty Responses: Website returns no content
- Dynamic Content Issues: JavaScript-rendered content not captured
3. Data Validation Errors
- Missing Fields: Expected data fields not found
- Type Mismatches: Extracted data doesn't match expected schema
- Hallucinations: Claude generates plausible but incorrect data
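As a quick reference, here is a minimal sketch of how the API-specific errors above map onto exception classes in the official anthropic Python SDK. The classify_api_error helper is purely illustrative; real code would retry, log, or re-raise as shown later in this article:

import anthropic

client = anthropic.Anthropic()  # reads ANTHROPIC_API_KEY from the environment

def classify_api_error(prompt: str) -> str:
    try:
        client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=256,
            messages=[{"role": "user", "content": prompt}],
        )
        return "ok"
    except anthropic.RateLimitError:        # 429: rate limit exceeded
        return "rate_limit"
    except anthropic.AuthenticationError:   # 401: invalid API key
        return "auth"
    except anthropic.BadRequestError:       # 400: e.g. context window exceeded
        return "bad_request"
    except anthropic.InternalServerError:   # 500+: temporary server-side issues
        return "server"
    except anthropic.APIConnectionError:    # network problems reaching the API
        return "network"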
Implementing Robust Error Handling
Python Implementation
Here's a comprehensive Python example with exponential backoff and retry logic:
import anthropic
import json
import time
import requests
from typing import Optional, Dict, Any
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type


class ClaudeScraperError(Exception):
    """Base exception for Claude scraping errors"""
    pass


class RateLimitError(ClaudeScraperError):
    """Raised when rate limit is exceeded"""
    pass


class ContentExtractionError(ClaudeScraperError):
    """Raised when data extraction fails"""
    pass


class ClaudeScraper:
    def __init__(self, api_key: str):
        self.client = anthropic.Anthropic(api_key=api_key)
        self.max_retries = 3
        self.timeout = 30

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=4, max=10),
        # Retry our wrapped fetch errors as well as raw request failures
        retry=retry_if_exception_type((ClaudeScraperError, requests.RequestException)),
        reraise=True
    )
    def fetch_webpage(self, url: str) -> str:
        """Fetch webpage content with retry logic"""
        try:
            response = requests.get(url, timeout=self.timeout)
            response.raise_for_status()
            return response.text
        except requests.Timeout:
            raise ClaudeScraperError(f"Timeout while fetching {url}")
        except requests.RequestException as e:
            raise ClaudeScraperError(f"Failed to fetch {url}: {str(e)}")

    def extract_data_with_claude(self, html_content: str, schema: Dict[str, str]) -> Optional[Dict[str, Any]]:
        """Extract structured data using Claude API with error handling"""
        # Truncate content if too large
        max_content_length = 100000  # Adjust based on your needs
        if len(html_content) > max_content_length:
            html_content = html_content[:max_content_length]

        def build_prompt(content: str) -> str:
            # Rebuilt whenever the content is truncated after a token-limit error
            return f"""Extract the following information from this HTML:
{', '.join(schema.keys())}
HTML content:
{content}
Return the data as JSON matching this schema: {schema}
If a field is not found, use null."""

        prompt = build_prompt(html_content)

        for attempt in range(self.max_retries):
            try:
                message = self.client.messages.create(
                    model="claude-3-5-sonnet-20241022",
                    max_tokens=1024,
                    messages=[
                        {"role": "user", "content": prompt}
                    ]
                )
                # Extract and parse response
                response_text = message.content[0].text
                data = json.loads(response_text)
                # Validate extracted data
                if self.validate_data(data, schema):
                    return data
                else:
                    raise ContentExtractionError("Validation failed for extracted data")
            except anthropic.RateLimitError:
                if attempt == self.max_retries - 1:
                    raise RateLimitError("Max retries exceeded for rate limit")
                wait_time = 2 ** attempt
                print(f"Rate limit hit. Waiting {wait_time} seconds...")
                time.sleep(wait_time)
            except anthropic.APIConnectionError as e:
                print(f"Network error: {e}. Retrying...")
                time.sleep(2 ** attempt)
            except anthropic.APIStatusError as e:
                if e.status_code == 400:
                    # Token limit exceeded - rebuild the prompt with shorter content
                    html_content = html_content[:len(html_content) // 2]
                    prompt = build_prompt(html_content)
                    print("Token limit exceeded, retrying with shorter content...")
                else:
                    raise ClaudeScraperError(f"API error: {e.status_code} - {e.message}")
            except json.JSONDecodeError:
                # Claude didn't return valid JSON - try to guide it better
                if attempt == self.max_retries - 1:
                    raise ContentExtractionError("Failed to parse Claude response as JSON")
                prompt = "IMPORTANT: Return ONLY valid JSON, no other text.\n" + prompt
            except ContentExtractionError:
                raise
            except Exception as e:
                raise ClaudeScraperError(f"Unexpected error: {str(e)}")

        raise ContentExtractionError("Failed to extract data after all retries")

    def validate_data(self, data: Dict[str, Any], schema: Dict[str, str]) -> bool:
        """Validate extracted data against schema"""
        if not isinstance(data, dict):
            return False
        # Check all required fields are present
        for field in schema.keys():
            if field not in data:
                print(f"Missing field: {field}")
                return False
        return True

    def scrape_with_fallback(self, url: str, schema: Dict[str, str]) -> Optional[Dict[str, Any]]:
        """Main scraping method with comprehensive error handling"""
        try:
            # Fetch webpage
            html_content = self.fetch_webpage(url)
            if not html_content or len(html_content) < 100:
                raise ContentExtractionError("Empty or invalid HTML content")
            # Extract data with Claude
            data = self.extract_data_with_claude(html_content, schema)
            return data
        except RateLimitError as e:
            print(f"Rate limit error: {e}")
            # Implement queue system or notify admin
            return None
        except ContentExtractionError as e:
            print(f"Content extraction error: {e}")
            # Fall back to traditional parsing if available
            return None
        except ClaudeScraperError as e:
            print(f"Scraping error: {e}")
            return None
        except Exception as e:
            print(f"Unexpected error: {e}")
            return None


# Usage example
scraper = ClaudeScraper(api_key="your-api-key")
schema = {
    "title": "string",
    "price": "number",
    "description": "string",
    "availability": "boolean"
}

result = scraper.scrape_with_fallback("https://example.com/product", schema)
if result:
    print(f"Extracted data: {result}")
else:
    print("Failed to scrape data")
JavaScript/Node.js Implementation
Here's an equivalent implementation in JavaScript with similar error handling patterns:
const Anthropic = require('@anthropic-ai/sdk');
const axios = require('axios');

class ClaudeScraperError extends Error {
  constructor(message) {
    super(message);
    this.name = 'ClaudeScraperError';
  }
}

class RateLimitError extends ClaudeScraperError {
  constructor(message) {
    super(message);
    this.name = 'RateLimitError';
  }
}

class ClaudeScraper {
  constructor(apiKey) {
    this.client = new Anthropic({ apiKey });
    this.maxRetries = 3;
    this.timeout = 30000;
  }

  async fetchWebpage(url) {
    try {
      const response = await axios.get(url, {
        timeout: this.timeout,
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
      });
      return response.data;
    } catch (error) {
      if (error.code === 'ECONNABORTED') {
        throw new ClaudeScraperError(`Timeout while fetching ${url}`);
      }
      throw new ClaudeScraperError(`Failed to fetch ${url}: ${error.message}`);
    }
  }

  async extractDataWithClaude(htmlContent, schema) {
    const maxContentLength = 100000;
    let content = htmlContent.length > maxContentLength
      ? htmlContent.substring(0, maxContentLength)
      : htmlContent;

    for (let attempt = 0; attempt < this.maxRetries; attempt++) {
      // Rebuild the prompt each attempt so content truncation takes effect
      const prompt = `Extract the following information from this HTML:
${Object.keys(schema).join(', ')}
HTML content:
${content}
Return the data as JSON matching this schema: ${JSON.stringify(schema)}
If a field is not found, use null.`;

      try {
        const message = await this.client.messages.create({
          model: 'claude-3-5-sonnet-20241022',
          max_tokens: 1024,
          messages: [
            { role: 'user', content: prompt }
          ]
        });

        const responseText = message.content[0].text;
        const data = JSON.parse(responseText);
        if (this.validateData(data, schema)) {
          return data;
        } else {
          throw new ClaudeScraperError('Validation failed for extracted data');
        }
      } catch (error) {
        // Handle rate limit errors
        if (error.status === 429) {
          if (attempt === this.maxRetries - 1) {
            throw new RateLimitError('Max retries exceeded for rate limit');
          }
          const waitTime = Math.pow(2, attempt) * 1000;
          console.log(`Rate limit hit. Waiting ${waitTime / 1000} seconds...`);
          await this.sleep(waitTime);
          continue;
        }
        // Handle token limit errors by retrying with shorter content
        if (error.status === 400 && error.message?.includes('tokens')) {
          content = content.substring(0, Math.floor(content.length / 2));
          console.log('Token limit exceeded, retrying with shorter content...');
          continue;
        }
        // Handle JSON parsing errors
        if (error instanceof SyntaxError) {
          if (attempt === this.maxRetries - 1) {
            throw new ClaudeScraperError('Failed to parse Claude response as JSON');
          }
          continue;
        }
        // Handle network errors with exponential backoff
        if (error.message?.includes('network') || error.message?.includes('connection')) {
          await this.sleep(Math.pow(2, attempt) * 1000);
          continue;
        }
        // Re-throw our own errors (e.g. validation failures) unchanged
        if (error instanceof ClaudeScraperError) {
          throw error;
        }
        throw new ClaudeScraperError(`API error: ${error.message}`);
      }
    }
    throw new ClaudeScraperError('Failed to extract data after all retries');
  }

  validateData(data, schema) {
    if (typeof data !== 'object' || data === null) {
      return false;
    }
    for (const field of Object.keys(schema)) {
      if (!(field in data)) {
        console.log(`Missing field: ${field}`);
        return false;
      }
    }
    return true;
  }

  sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  async scrapeWithFallback(url, schema) {
    try {
      const htmlContent = await this.fetchWebpage(url);
      if (!htmlContent || htmlContent.length < 100) {
        throw new ClaudeScraperError('Empty or invalid HTML content');
      }
      const data = await this.extractDataWithClaude(htmlContent, schema);
      return data;
    } catch (error) {
      if (error instanceof RateLimitError) {
        console.error(`Rate limit error: ${error.message}`);
        return null;
      }
      console.error(`Scraping error: ${error.message}`);
      return null;
    }
  }
}

// Usage example
(async () => {
  const scraper = new ClaudeScraper('your-api-key');
  const schema = {
    title: 'string',
    price: 'number',
    description: 'string',
    availability: 'boolean'
  };

  const result = await scraper.scrapeWithFallback('https://example.com/product', schema);
  if (result) {
    console.log('Extracted data:', result);
  } else {
    console.log('Failed to scrape data');
  }
})();
Best Practices for Error Handling
1. Implement Exponential Backoff
Always use exponential backoff when retrying failed requests. This prevents overwhelming the API during temporary issues:
def exponential_backoff(attempt):
    return min(2 ** attempt, 60)  # Max 60 seconds
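As a usage sketch (assuming the exponential_backoff helper above; call_claude stands in for whatever function makes your API request), the delay can drive a simple retry loop, and a little random jitter keeps many workers from retrying in lockstep:

import random
import time

def call_with_backoff(call_claude, max_attempts=5):
    # Retry a callable with exponential backoff plus jitter
    for attempt in range(max_attempts):
        try:
            return call_claude()
        except Exception:
            if attempt == max_attempts - 1:
                raise
            # Delay grows as 1s, 2s, 4s, ... capped at 60s, plus up to 1s of jitter
            time.sleep(exponential_backoff(attempt) + random.random())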
2. Monitor and Log Errors
Track error patterns to identify systemic issues:
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def log_error(error_type, url, details):
    logger.error(f"{error_type} for {url}: {details}")
    # Send to monitoring service (e.g., Sentry, DataDog)
3. Handle Content Size Limitations
Claude API has token limits. Pre-process content to stay within bounds:
from bs4 import BeautifulSoup
def extract_main_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    # Remove scripts, styles, and non-content elements
    for tag in soup(['script', 'style', 'nav', 'footer', 'header']):
        tag.decompose()
    return str(soup)
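If you want a quick pre-flight check before sending the cleaned-up HTML to Claude, a rough character-based heuristic works; the 3.5 characters-per-token figure and the 150,000-token budget below are illustrative assumptions, not documented limits, and the API's own token counting remains authoritative:

def roughly_fits_context(text, max_tokens=150000, chars_per_token=3.5):
    # Crude estimate: flag content that likely needs further trimming
    return len(text) / chars_per_token <= max_tokens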
4. Validate AI-Extracted Data
Since language models can hallucinate, always validate critical data. Similar to handling errors in Puppeteer, implement multi-layer validation:
def validate_product_data(data):
    # Check required fields
    if not data.get('title'):
        return False
    # Validate data types
    if 'price' in data and not isinstance(data['price'], (int, float)):
        return False
    # Check reasonable ranges
    if data.get('price', 0) < 0 or data.get('price', 0) > 1000000:
        return False
    return True
5. Implement Circuit Breaker Pattern
Prevent cascading failures by implementing a circuit breaker:
import time

class CircuitBreaker:
    def __init__(self, failure_threshold=5, timeout=60):
        self.failure_count = 0
        self.failure_threshold = failure_threshold
        self.timeout = timeout
        self.last_failure_time = None
        self.state = 'closed'  # closed, open, half-open

    def call(self, func, *args, **kwargs):
        if self.state == 'open':
            if time.time() - self.last_failure_time > self.timeout:
                self.state = 'half-open'
            else:
                raise ClaudeScraperError("Circuit breaker is open")
        try:
            result = func(*args, **kwargs)
            if self.state == 'half-open':
                self.state = 'closed'
                self.failure_count = 0
            return result
        except Exception as e:
            self.failure_count += 1
            self.last_failure_time = time.time()
            if self.failure_count >= self.failure_threshold:
                self.state = 'open'
            raise e
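A minimal usage sketch, assuming the ClaudeScraper instance and schema from the earlier example: route every scrape through the breaker so repeated failures pause traffic instead of hammering the API:

breaker = CircuitBreaker(failure_threshold=5, timeout=60)

def guarded_scrape(url, schema):
    # Raises ClaudeScraperError immediately while the breaker is open
    return breaker.call(scraper.scrape_with_fallback, url, schema)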
Handling Specific Error Scenarios
Rate Limiting
When you encounter 429 errors, implement request queuing:
from queue import Queue
import threading
import time

class RateLimitedScraper:
    def __init__(self, requests_per_minute=50):
        self.queue = Queue()
        self.requests_per_minute = requests_per_minute
        self.interval = 60 / requests_per_minute

    def worker(self):
        while True:
            task = self.queue.get()
            if task is None:
                break
            try:
                task()
                time.sleep(self.interval)
            finally:
                self.queue.task_done()
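To wire this up (a sketch, assuming the scraper and schema from the earlier usage example), run the worker on a background thread, enqueue callables, and use None as the shutdown sentinel:

limiter = RateLimitedScraper(requests_per_minute=50)
threading.Thread(target=limiter.worker, daemon=True).start()

# Each queued task runs at most ~50 times per minute
limiter.queue.put(lambda: scraper.scrape_with_fallback("https://example.com/product", schema))
limiter.queue.join()     # Wait for queued work to finish
limiter.queue.put(None)  # Signal the worker to stop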
Content Extraction Failures
When Claude cannot extract structured data, combine it with traditional parsing. Just as you would handle timeouts in Puppeteer, have a fallback strategy:
def hybrid_extraction(html_content, schema):
    try:
        # Try Claude API first
        return extract_data_with_claude(html_content, schema)
    except ContentExtractionError:
        # Fall back to BeautifulSoup/regex
        return traditional_parsing(html_content, schema)
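The traditional_parsing fallback is whatever deterministic extraction fits your target pages. One illustrative sketch with BeautifulSoup and CSS selectors (the selectors below are placeholders, not real site markup):

from bs4 import BeautifulSoup

def traditional_parsing(html_content, schema):
    # Deterministic fallback: pull fields with CSS selectors instead of the LLM
    soup = BeautifulSoup(html_content, 'html.parser')
    selectors = {
        'title': 'h1.product-title',
        'price': 'span.price',
        'description': 'div.product-description',
    }
    result = {}
    for field in schema:
        selector = selectors.get(field)
        element = soup.select_one(selector) if selector else None
        result[field] = element.get_text(strip=True) if element else None
    return result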
Network Issues
For unreliable networks, implement robust retry mechanisms similar to handling AJAX requests using Puppeteer:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def create_session():
    session = requests.Session()
    retry = Retry(
        total=5,
        backoff_factor=0.3,
        status_forcelist=[500, 502, 503, 504]
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session
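A brief usage sketch (assumed wiring, not part of the original snippet): build the session once and use it wherever you would otherwise call requests.get directly, such as inside fetch_webpage; the mounted adapter then retries 5xx responses transparently before raising:

session = create_session()

def fetch_with_session(url, timeout=30):
    response = session.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text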
Monitoring and Alerting
Set up monitoring to track error rates and types:
from dataclasses import dataclass
from datetime import datetime
from typing import List
@dataclass
class ErrorMetric:
    timestamp: datetime
    error_type: str
    url: str
    message: str

class ErrorMonitor:
    def __init__(self):
        self.errors: List[ErrorMetric] = []

    def record_error(self, error_type, url, message):
        self.errors.append(ErrorMetric(
            timestamp=datetime.now(),
            error_type=error_type,
            url=url,
            message=message
        ))
        # Alert if the error rate is high
        recent_errors = [e for e in self.errors
                         if (datetime.now() - e.timestamp).total_seconds() < 300]
        if len(recent_errors) > 10:
            self.send_alert(f"High error rate: {len(recent_errors)} errors in 5 minutes")

    def send_alert(self, message):
        # Integrate with PagerDuty, Slack, etc.
        print(f"ALERT: {message}")
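One way to connect the monitor to the scraper (a sketch using the scraper and schema defined earlier; scrape_with_fallback returns None on failure, so that is the signal to record):

monitor = ErrorMonitor()
url = "https://example.com/product"

result = scraper.scrape_with_fallback(url, schema)
if result is None:
    monitor.record_error("scrape_failed", url, "scrape_with_fallback returned no data")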
Conclusion
Effective error handling in Claude API web scraping workflows requires a multi-layered approach: retry logic with exponential backoff, comprehensive error classification, data validation, and monitoring. By implementing these patterns, you'll build resilient scrapers that gracefully handle the unpredictable nature of web scraping while maximizing data quality and API efficiency.
Remember to always validate AI-extracted data, implement rate limiting, and have fallback strategies for when automated extraction fails. These practices ensure your scraping operations remain reliable and cost-effective over time.