What is the Best Way to Handle Errors and Retries in JavaScript Scraping?
Error handling and retry mechanisms are crucial components of robust JavaScript web scraping applications. Websites can be unpredictable, network connections may fail, and servers might return temporary errors. This guide covers comprehensive strategies for handling errors and implementing retry logic in JavaScript scraping projects.
Understanding Common Scraping Errors
Before implementing error handling, it's important to understand the types of errors you'll encounter:
Network-Related Errors
- Connection timeouts
- DNS resolution failures
- HTTP status errors (404, 500, 502, 503)
- SSL/TLS certificate issues
Browser-Related Errors
- Page load failures
- JavaScript execution errors
- Element not found exceptions
- Navigation timeouts
Server-Side Errors
- Rate limiting (429 status)
- Temporary server overload
- Maintenance mode responses
- Authentication failures
Basic Error Handling Patterns
Try-Catch Blocks with Async/Await
The foundation of error handling in JavaScript scraping is proper use of try-catch blocks:
// Fetch a page over HTTP and hand the markup to the parser.
// Errors are logged with the offending URL, then re-thrown so the
// caller decides the retry/abort policy.
async function scrapePage(url) {
  try {
    const res = await fetch(url);
    if (!res.ok) {
      throw new Error(`HTTP error! status: ${res.status}`);
    }
    const html = await res.text();
    return parseData(html);
  } catch (error) {
    console.error(`Failed to scrape ${url}:`, error.message);
    throw error; // let the caller decide how to recover
  }
}
Error Classification and Handling
Different errors require different handling strategies:
/**
 * Domain-specific error for scraping failures.
 *
 * @property {string} type - classification tag (e.g. 'RATE_LIMITED',
 *   'SERVER_ERROR', 'NOT_FOUND', 'CLIENT_ERROR') used by retry logic
 * @property {?number} statusCode - HTTP status when one applies
 */
class ScrapingError extends Error {
  constructor(message, type, statusCode = null) {
    super(message);
    this.name = 'ScrapingError';
    Object.assign(this, { type, statusCode });
  }
}
// Pass 2xx responses through untouched; convert anything else into a
// typed ScrapingError so callers can pick a retry strategy per type.
async function handleResponse(response) {
  const { status } = response;
  if (status >= 200 && status < 300) {
    return response;
  }
  // Classification order matters only for readability here: 429 is the
  // single retry-after case, 5xx are transient, 404 is permanent.
  const errorType =
    status === 429 ? 'RATE_LIMITED'
    : status >= 500 ? 'SERVER_ERROR'
    : status === 404 ? 'NOT_FOUND'
    : 'CLIENT_ERROR';
  throw new ScrapingError(
    `Request failed with status ${status}`,
    errorType,
    status
  );
}
Implementing Retry Logic
Exponential Backoff Strategy
Exponential backoff prevents overwhelming servers and improves success rates:
/**
 * Run `fn`, retrying failures with exponential backoff plus jitter.
 * Errors tagged NOT_FOUND or CLIENT_ERROR are treated as permanent and
 * re-thrown immediately without retrying.
 *
 * @param {() => Promise<*>} fn - operation to attempt
 * @param {number} [maxRetries=3] - retries allowed after the first attempt
 * @param {number} [baseDelay=1000] - initial backoff delay in ms (doubles each retry)
 * @returns {Promise<*>} result of the first successful attempt
 * @throws the last error once all attempts are exhausted
 */
async function retryWithBackoff(fn, maxRetries = 3, baseDelay = 1000) {
  let attempt = 0;
  for (;;) {
    try {
      return await fn();
    } catch (error) {
      // Permanent failures: bail out straight away.
      if (error.type === 'NOT_FOUND' || error.type === 'CLIENT_ERROR') {
        throw error;
      }
      if (attempt >= maxRetries) {
        throw error;
      }
      const delay = baseDelay * Math.pow(2, attempt);
      const jitter = Math.random() * 0.1 * delay; // de-synchronize concurrent retries
      console.log(`Attempt ${attempt + 1} failed, retrying in ${delay + jitter}ms...`);
      await new Promise((resolve) => setTimeout(resolve, delay + jitter));
      attempt += 1;
    }
  }
}
/**
 * Promise-based delay helper.
 * @param {number} ms - milliseconds to wait
 * @returns {Promise<void>} resolves after roughly `ms` milliseconds
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
Advanced Retry Configuration
Create a configurable retry system for different scenarios:
/**
 * Configuration for retryOperation().
 *
 * @param {Object} [options]
 * @param {number} [options.maxRetries=3] - retries after the first attempt
 * @param {number} [options.baseDelay=1000] - initial backoff delay (ms)
 * @param {number} [options.maxDelay=30000] - cap on any single delay (ms)
 * @param {(error: Error) => boolean} [options.retryCondition] - return
 *   false to stop retrying for a given error (default: retry everything)
 * @param {(error: Error, attempt: number) => void} [options.onRetry] -
 *   hook invoked before each retry (default: no-op)
 */
class RetryConfig {
  constructor({
    maxRetries = 3,
    baseDelay = 1000,
    maxDelay = 30000,
    retryCondition = () => true,
    onRetry = () => {}
  } = {}) { // `= {}` so `new RetryConfig()` no longer throws on a missing argument
    this.maxRetries = maxRetries;
    this.baseDelay = baseDelay;
    this.maxDelay = maxDelay;
    this.retryCondition = retryCondition;
    this.onRetry = onRetry;
  }
}
/**
 * Execute `operation`, retrying per the supplied RetryConfig.
 * Backoff grows exponentially from config.baseDelay and is capped at
 * config.maxDelay. config.retryCondition decides whether an error is
 * retryable; config.onRetry is notified (error, attemptNumber) before
 * each retry.
 *
 * @param {() => Promise<*>} operation
 * @param {RetryConfig} [config]
 * @returns {Promise<*>} result of the first successful attempt
 * @throws the last error once retries are exhausted or the error is
 *   deemed non-retryable
 */
async function retryOperation(operation, config = new RetryConfig({})) {
  let attempt = 0;
  for (;;) {
    try {
      return await operation();
    } catch (error) {
      const retriesLeft = attempt < config.maxRetries;
      if (!retriesLeft || !config.retryCondition(error)) {
        throw error;
      }
      const delay = Math.min(
        config.baseDelay * Math.pow(2, attempt),
        config.maxDelay
      );
      attempt += 1;
      config.onRetry(error, attempt);
      await new Promise((resolve) => setTimeout(resolve, delay));
    }
  }
}
Puppeteer Error Handling
Comprehensive Puppeteer Error Management
When using Puppeteer for scraping, you need to handle both browser and page-level errors:
const puppeteer = require('puppeteer');
/**
 * Scrape the text of the `.content` element from `url` with headless
 * Chromium. Retries the extraction (element may render late), maps the
 * common Puppeteer failures onto ScrapingError, and always closes the
 * browser.
 *
 * @param {string} url
 * @returns {Promise<string>} text content of the `.content` element
 * @throws {ScrapingError} on navigation timeout or network failure
 */
async function scrapePuppeteer(url) {
  let browser = null;
  try {
    browser = await puppeteer.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    });

    const page = await browser.newPage();
    page.setDefaultNavigationTimeout(30000);
    page.setDefaultTimeout(30000);

    // Surface crashes and in-page script errors in our logs.
    page.on('error', (error) => console.error('Page error:', error));
    page.on('pageerror', (error) => console.error('Page script error:', error));

    await page.goto(url, { waitUntil: 'networkidle2' });

    // The target element may appear after load; poll it with backoff.
    const extractContent = () =>
      page.evaluate(() => {
        const element = document.querySelector('.content');
        if (!element) {
          throw new Error('Content element not found');
        }
        return element.textContent;
      });

    return await retryOperation(extractContent, new RetryConfig({
      maxRetries: 3,
      baseDelay: 1000
    }));
  } catch (error) {
    // Normalize the most common Puppeteer failures into ScrapingError.
    if (error.name === 'TimeoutError') {
      throw new ScrapingError('Page load timeout', 'TIMEOUT');
    }
    if (error.message.includes('net::ERR_')) {
      throw new ScrapingError('Network error', 'NETWORK_ERROR');
    }
    throw error;
  } finally {
    // Always release the browser, even on failure.
    if (browser) {
      await browser.close();
    }
  }
}
For more detailed Puppeteer error handling techniques, check out our guide on how to handle errors in Puppeteer.
Handling Navigation Errors
Navigation failures are common in web scraping. Here's how to handle them robustly:
/**
 * Navigate `page` to `url`, retrying transient failures.
 * Retries are limited to timeouts and Chromium network errors
 * (net::ERR_*); anything else propagates immediately. On a timeout the
 * attempt falls back to the looser 'domcontentloaded' wait condition
 * with a shorter budget before failing.
 *
 * @param {import('puppeteer').Page} page
 * @param {string} url
 * @param {Object} [options] - extra options merged into page.goto()
 * @returns {Promise<import('puppeteer').HTTPResponse|null>} the main
 *   resource response (fix: the original dropped this value and always
 *   resolved undefined)
 */
async function navigateWithRetry(page, url, options = {}) {
  const config = new RetryConfig({
    maxRetries: 3,
    // Only transient navigation failures are worth retrying.
    retryCondition: (error) => {
      return error.name === 'TimeoutError' ||
        error.message.includes('net::ERR_');
    }
  });
  return await retryOperation(async () => {
    try {
      return await page.goto(url, {
        waitUntil: 'networkidle2',
        timeout: 30000,
        ...options
      });
    } catch (error) {
      if (error.name === 'TimeoutError') {
        // Try again with a looser wait condition before failing the attempt.
        return await page.goto(url, {
          waitUntil: 'domcontentloaded',
          timeout: 15000
        });
      }
      throw error;
    }
  }, config);
}
Axios and HTTP Client Error Handling
Axios Interceptors for Global Error Handling
Use Axios interceptors to handle errors consistently across your application:
const axios = require('axios');
// Shared HTTP client: one place to configure the request timeout and a
// browser-like User-Agent for every scraping request.
const client = axios.create({
timeout: 30000, // abort requests that hang longer than 30s
headers: {
'User-Agent': 'Mozilla/5.0 (compatible; WebScraper/1.0)'
}
});
// Request interceptor: log each outgoing URL so scrapes are traceable.
client.interceptors.request.use(
  (requestConfig) => {
    console.log(`Making request to: ${requestConfig.url}`);
    return requestConfig;
  },
  (error) => Promise.reject(error) // propagate request-setup failures unchanged
);
// Response interceptor: convert every Axios failure into a typed
// ScrapingError so downstream retry logic can classify it.
client.interceptors.response.use(
  (response) => response,
  (error) => {
    if (error.response) {
      // Server responded with an error status.
      const { status, data } = error.response;
      let errorType;
      if (status === 429) {
        // Fix: 429 was previously lumped into CLIENT_ERROR (treated as
        // non-retryable); classify it consistently with handleResponse().
        errorType = 'RATE_LIMITED';
      } else if (status >= 500) {
        errorType = 'SERVER_ERROR';
      } else {
        errorType = 'CLIENT_ERROR';
      }
      throw new ScrapingError(
        // data may be a string or null body — guard the .message access
        `HTTP ${status}: ${data?.message || 'Request failed'}`,
        errorType,
        status
      );
    } else if (error.request) {
      // Request was made but no response received (DNS failure, timeout, reset).
      throw new ScrapingError(
        'No response received from server',
        'NETWORK_ERROR'
      );
    } else {
      // Failure while building the request itself.
      throw new ScrapingError(
        `Request setup error: ${error.message}`,
        'REQUEST_ERROR'
      );
    }
  }
);
Rate Limiting and Retry Headers
Handle rate limiting properly by respecting server-provided retry headers:
/**
 * GET `url` through the shared Axios client, retrying rate-limit (429)
 * and 5xx failures. When the server supplies a numeric Retry-After
 * header the function honors it before the next attempt.
 *
 * @param {string} url
 * @returns {Promise<*>} the Axios response
 * @throws the last error once retries are exhausted or the error is
 *   non-retryable
 */
async function makeRequestWithRateLimit(url) {
  const config = new RetryConfig({
    maxRetries: 5,
    // Retry only throttling and transient server-side failures.
    retryCondition: (error) => {
      return error.statusCode === 429 ||
        (error.statusCode >= 500 && error.statusCode < 600);
    },
    onRetry: (error, attempt) => {
      console.log(`Retry attempt ${attempt} for ${url}`);
    }
  });
  return await retryOperation(async () => {
    try {
      return await client.get(url);
    } catch (error) {
      if (error.statusCode === 429) {
        // Honor the server's requested pause, if it sent one.
        // NOTE(review): errors thrown by the response interceptor are
        // ScrapingErrors without a .response property, so this header is
        // only visible for raw Axios errors — confirm against the client.
        const retryAfter = error.response?.headers['retry-after'];
        // Fix: explicit radix, and skip the wait when Retry-After is not
        // a number (HTTP-date form) instead of sleeping NaN ms.
        const seconds = Number.parseInt(retryAfter, 10);
        if (Number.isFinite(seconds)) {
          const delay = seconds * 1000;
          console.log(`Rate limited, waiting ${delay}ms`);
          // retryOperation adds its own backoff delay on top of this wait.
          await sleep(delay);
        }
      }
      throw error;
    }
  }, config);
}
Circuit Breaker Pattern
For production applications, implement a circuit breaker to prevent cascading failures:
/**
 * Circuit breaker for scraping operations.
 *
 * States: CLOSED (normal), OPEN (failing fast), HALF_OPEN (probing).
 * After `threshold` consecutive failures the breaker opens; once
 * `timeout` ms have passed since the last failure a trial call is let
 * through, and its outcome decides whether the breaker closes again or
 * trips back open.
 */
class CircuitBreaker {
  constructor(threshold = 5, timeout = 60000) {
    this.threshold = threshold;   // failures needed to trip the breaker
    this.timeout = timeout;       // cool-down period in ms
    this.failureCount = 0;
    this.lastFailureTime = null;
    this.state = 'CLOSED'; // CLOSED, OPEN, HALF_OPEN
  }

  /**
   * Run `operation` under breaker control.
   * @param {() => Promise<*>} operation
   * @returns {Promise<*>} the operation's result
   * @throws Error('Circuit breaker is OPEN') while failing fast, or
   *   whatever `operation` throws
   */
  async execute(operation) {
    if (this.state === 'OPEN') {
      const coolingDown = Date.now() - this.lastFailureTime < this.timeout;
      if (coolingDown) {
        throw new Error('Circuit breaker is OPEN');
      }
      this.state = 'HALF_OPEN'; // cool-down elapsed: allow one probe call
    }
    try {
      const result = await operation();
      this.onSuccess();
      return result;
    } catch (error) {
      this.onFailure();
      throw error;
    }
  }

  // A success fully resets the breaker.
  onSuccess() {
    this.failureCount = 0;
    this.state = 'CLOSED';
  }

  // Count the failure; trip the breaker once the threshold is reached.
  onFailure() {
    this.failureCount += 1;
    this.lastFailureTime = Date.now();
    if (this.failureCount >= this.threshold) {
      this.state = 'OPEN';
    }
  }
}
Error Monitoring and Logging
Structured Error Logging
Implement comprehensive logging for debugging and monitoring:
const winston = require('winston');
// Structured JSON logger: timestamps + serialized Error stacks on every
// entry, mirrored to the console and to scraping-errors.log.
const logger = winston.createLogger({
level: 'info',
format: winston.format.combine(
winston.format.timestamp(),
winston.format.errors({ stack: true }), // include stack traces for Error objects
winston.format.json()
),
transports: [
new winston.transports.Console(),
new winston.transports.File({ filename: 'scraping-errors.log' })
]
});
/**
 * Wrap scrapePage(url) with structured logging: a start entry, a
 * warning per retry, and either a completion entry or a full error
 * record (message, type, stack). Duration covers the whole attempt,
 * retries included, in milliseconds.
 *
 * @param {string} url
 * @returns {Promise<*>} whatever scrapePage resolves with
 * @throws the final error after logging it
 */
async function scrapeWithLogging(url) {
  const startTime = Date.now();
  const logRetry = (error, attempt) => {
    logger.warn('Retry attempt', {
      url,
      attempt,
      error: error.message,
      errorType: error.type
    });
  };
  try {
    logger.info('Starting scrape', { url, timestamp: new Date().toISOString() });
    const result = await retryOperation(
      () => scrapePage(url),
      new RetryConfig({ onRetry: logRetry })
    );
    logger.info('Scrape completed', {
      url,
      duration: Date.now() - startTime,
      success: true
    });
    return result;
  } catch (error) {
    logger.error('Scrape failed', {
      url,
      duration: Date.now() - startTime,
      error: error.message,
      errorType: error.type,
      stack: error.stack
    });
    throw error;
  }
}
Best Practices Summary
- Classify Errors: Differentiate between retryable and non-retryable errors
- Use Exponential Backoff: Prevent overwhelming servers with immediate retries
- Set Reasonable Timeouts: Balance between patience and efficiency
- Implement Circuit Breakers: Prevent cascading failures in production
- Log Comprehensively: Include context for debugging and monitoring
- Respect Rate Limits: Honor server-provided retry headers
- Handle Resource Cleanup: Always close browsers and connections in finally blocks
When working with complex navigation scenarios, you might also want to review our guide on how to handle timeouts in Puppeteer for additional timeout management strategies.
By implementing these error handling and retry strategies, you'll build more resilient JavaScript scraping applications that can handle the unpredictable nature of web scraping while maintaining good citizenship on the web.