What is the Best Way to Handle Errors and Retries in JavaScript Scraping?
Error handling and retry mechanisms are crucial components of robust JavaScript web scraping applications. Websites can be unpredictable, network connections may fail, and servers might return temporary errors. This guide covers comprehensive strategies for handling errors and implementing retry logic in JavaScript scraping projects.
Understanding Common Scraping Errors
Before implementing error handling, it's important to understand the types of errors you'll encounter:
Network-Related Errors
- Connection timeouts
- DNS resolution failures
- HTTP status errors (404, 500, 502, 503)
- SSL/TLS certificate issues
Browser-Related Errors
- Page load failures
- JavaScript execution errors
- Element not found exceptions
- Navigation timeouts
Server-Side Errors
- Rate limiting (429 status)
- Temporary server overload
- Maintenance mode responses
- Authentication failures
Basic Error Handling Patterns
Try-Catch Blocks with Async/Await
The foundation of error handling in JavaScript scraping is proper use of try-catch blocks:
// Fetch a page over HTTP and hand the markup to the parser.
// Errors are logged with the offending URL, then re-thrown so the
// caller decides the retry/abort policy.
async function scrapePage(url) {
  try {
    const res = await fetch(url);
    if (!res.ok) {
      throw new Error(`HTTP error! status: ${res.status}`);
    }
    const html = await res.text();
    return parseData(html);
  } catch (error) {
    console.error(`Failed to scrape ${url}:`, error.message);
    throw error; // let the caller decide how to recover
  }
}
Error Classification and Handling
Different errors require different handling strategies:
/**
 * Domain-specific error for scraping failures.
 *
 * @property {string} type - classification tag (e.g. 'RATE_LIMITED',
 *   'SERVER_ERROR', 'NOT_FOUND', 'CLIENT_ERROR') used by retry logic
 * @property {?number} statusCode - HTTP status when one applies
 */
class ScrapingError extends Error {
  constructor(message, type, statusCode = null) {
    super(message);
    this.name = 'ScrapingError';
    Object.assign(this, { type, statusCode });
  }
}
// Pass 2xx responses through untouched; convert anything else into a
// typed ScrapingError so callers can pick a retry strategy per type.
async function handleResponse(response) {
  const { status } = response;
  if (status >= 200 && status < 300) {
    return response;
  }
  // Classification order matters only for readability here: 429 is the
  // single retry-after case, 5xx are transient, 404 is permanent.
  const errorType =
    status === 429 ? 'RATE_LIMITED'
    : status >= 500 ? 'SERVER_ERROR'
    : status === 404 ? 'NOT_FOUND'
    : 'CLIENT_ERROR';
  throw new ScrapingError(
    `Request failed with status ${status}`,
    errorType,
    status
  );
}
Implementing Retry Logic
Exponential Backoff Strategy
Exponential backoff prevents overwhelming servers and improves success rates:
/**
 * Run `fn`, retrying failures with exponential backoff plus jitter.
 * Errors tagged NOT_FOUND or CLIENT_ERROR are treated as permanent and
 * re-thrown immediately without retrying.
 *
 * @param {() => Promise<*>} fn - operation to attempt
 * @param {number} [maxRetries=3] - retries allowed after the first attempt
 * @param {number} [baseDelay=1000] - initial backoff delay in ms (doubles each retry)
 * @returns {Promise<*>} result of the first successful attempt
 * @throws the last error once all attempts are exhausted
 */
async function retryWithBackoff(fn, maxRetries = 3, baseDelay = 1000) {
  let attempt = 0;
  for (;;) {
    try {
      return await fn();
    } catch (error) {
      // Permanent failures: bail out straight away.
      if (error.type === 'NOT_FOUND' || error.type === 'CLIENT_ERROR') {
        throw error;
      }
      if (attempt >= maxRetries) {
        throw error;
      }
      const delay = baseDelay * Math.pow(2, attempt);
      const jitter = Math.random() * 0.1 * delay; // de-synchronize concurrent retries
      console.log(`Attempt ${attempt + 1} failed, retrying in ${delay + jitter}ms...`);
      await new Promise((resolve) => setTimeout(resolve, delay + jitter));
      attempt += 1;
    }
  }
}
/**
 * Promise-based delay helper.
 * @param {number} ms - milliseconds to wait
 * @returns {Promise<void>} resolves after roughly `ms` milliseconds
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
Advanced Retry Configuration
Create a configurable retry system for different scenarios:
/**
 * Configuration for retryOperation().
 *
 * @param {Object} [options]
 * @param {number} [options.maxRetries=3] - retries after the first attempt
 * @param {number} [options.baseDelay=1000] - initial backoff delay (ms)
 * @param {number} [options.maxDelay=30000] - cap on any single delay (ms)
 * @param {(error: Error) => boolean} [options.retryCondition] - return
 *   false to stop retrying for a given error (default: retry everything)
 * @param {(error: Error, attempt: number) => void} [options.onRetry] -
 *   hook invoked before each retry (default: no-op)
 */
class RetryConfig {
  constructor({
    maxRetries = 3,
    baseDelay = 1000,
    maxDelay = 30000,
    retryCondition = () => true,
    onRetry = () => {}
  } = {}) { // `= {}` so `new RetryConfig()` no longer throws on a missing argument
    this.maxRetries = maxRetries;
    this.baseDelay = baseDelay;
    this.maxDelay = maxDelay;
    this.retryCondition = retryCondition;
    this.onRetry = onRetry;
  }
}
/**
 * Execute `operation`, retrying per the supplied RetryConfig.
 * Backoff grows exponentially from config.baseDelay and is capped at
 * config.maxDelay. config.retryCondition decides whether an error is
 * retryable; config.onRetry is notified (error, attemptNumber) before
 * each retry.
 *
 * @param {() => Promise<*>} operation
 * @param {RetryConfig} [config]
 * @returns {Promise<*>} result of the first successful attempt
 * @throws the last error once retries are exhausted or the error is
 *   deemed non-retryable
 */
async function retryOperation(operation, config = new RetryConfig({})) {
  let attempt = 0;
  for (;;) {
    try {
      return await operation();
    } catch (error) {
      const retriesLeft = attempt < config.maxRetries;
      if (!retriesLeft || !config.retryCondition(error)) {
        throw error;
      }
      const delay = Math.min(
        config.baseDelay * Math.pow(2, attempt),
        config.maxDelay
      );
      attempt += 1;
      config.onRetry(error, attempt);
      await new Promise((resolve) => setTimeout(resolve, delay));
    }
  }
}
Puppeteer Error Handling
Comprehensive Puppeteer Error Management
When using Puppeteer for scraping, you need to handle both browser and page-level errors:
const puppeteer = require('puppeteer');
/**
 * Scrape the text of the `.content` element from `url` with headless
 * Chromium. Retries the extraction (element may render late), maps the
 * common Puppeteer failures onto ScrapingError, and always closes the
 * browser.
 *
 * @param {string} url
 * @returns {Promise<string>} text content of the `.content` element
 * @throws {ScrapingError} on navigation timeout or network failure
 */
async function scrapePuppeteer(url) {
  let browser = null;
  try {
    browser = await puppeteer.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    });

    const page = await browser.newPage();
    page.setDefaultNavigationTimeout(30000);
    page.setDefaultTimeout(30000);

    // Surface crashes and in-page script errors in our logs.
    page.on('error', (error) => console.error('Page error:', error));
    page.on('pageerror', (error) => console.error('Page script error:', error));

    await page.goto(url, { waitUntil: 'networkidle2' });

    // The target element may appear after load; poll it with backoff.
    const extractContent = () =>
      page.evaluate(() => {
        const element = document.querySelector('.content');
        if (!element) {
          throw new Error('Content element not found');
        }
        return element.textContent;
      });

    return await retryOperation(extractContent, new RetryConfig({
      maxRetries: 3,
      baseDelay: 1000
    }));
  } catch (error) {
    // Normalize the most common Puppeteer failures into ScrapingError.
    if (error.name === 'TimeoutError') {
      throw new ScrapingError('Page load timeout', 'TIMEOUT');
    }
    if (error.message.includes('net::ERR_')) {
      throw new ScrapingError('Network error', 'NETWORK_ERROR');
    }
    throw error;
  } finally {
    // Always release the browser, even on failure.
    if (browser) {
      await browser.close();
    }
  }
}
For more detailed Puppeteer error handling techniques, check out our guide on how to handle errors in Puppeteer.
Handling Navigation Errors
Navigation failures are common in web scraping. Here's how to handle them robustly:
/**
 * Navigate `page` to `url`, retrying transient failures.
 * Retries are limited to timeouts and Chromium network errors
 * (net::ERR_*); anything else propagates immediately. On a timeout the
 * attempt falls back to the looser 'domcontentloaded' wait condition
 * with a shorter budget before failing.
 *
 * @param {import('puppeteer').Page} page
 * @param {string} url
 * @param {Object} [options] - extra options merged into page.goto()
 * @returns {Promise<import('puppeteer').HTTPResponse|null>} the main
 *   resource response (fix: the original dropped this value and always
 *   resolved undefined)
 */
async function navigateWithRetry(page, url, options = {}) {
  const config = new RetryConfig({
    maxRetries: 3,
    // Only transient navigation failures are worth retrying.
    retryCondition: (error) => {
      return error.name === 'TimeoutError' ||
        error.message.includes('net::ERR_');
    }
  });
  return await retryOperation(async () => {
    try {
      return await page.goto(url, {
        waitUntil: 'networkidle2',
        timeout: 30000,
        ...options
      });
    } catch (error) {
      if (error.name === 'TimeoutError') {
        // Try again with a looser wait condition before failing the attempt.
        return await page.goto(url, {
          waitUntil: 'domcontentloaded',
          timeout: 15000
        });
      }
      throw error;
    }
  }, config);
}
Axios and HTTP Client Error Handling
Axios Interceptors for Global Error Handling
Use Axios interceptors to handle errors consistently across your application:
const axios = require('axios');
// Shared HTTP client: one place to configure the request timeout and a
// browser-like User-Agent for every scraping request.
const client = axios.create({
timeout: 30000, // abort requests that hang longer than 30s
headers: {
'User-Agent': 'Mozilla/5.0 (compatible; WebScraper/1.0)'
}
});
// Request interceptor: log each outgoing URL so scrapes are traceable.
client.interceptors.request.use(
  (requestConfig) => {
    console.log(`Making request to: ${requestConfig.url}`);
    return requestConfig;
  },
  (error) => Promise.reject(error) // propagate request-setup failures unchanged
);
// Response interceptor: convert every Axios failure into a typed
// ScrapingError so downstream retry logic can classify it.
client.interceptors.response.use(
  (response) => response,
  (error) => {
    if (error.response) {
      // Server responded with an error status.
      const { status, data } = error.response;
      let errorType;
      if (status === 429) {
        // Fix: 429 was previously lumped into CLIENT_ERROR (treated as
        // non-retryable); classify it consistently with handleResponse().
        errorType = 'RATE_LIMITED';
      } else if (status >= 500) {
        errorType = 'SERVER_ERROR';
      } else {
        errorType = 'CLIENT_ERROR';
      }
      throw new ScrapingError(
        // data may be a string or null body — guard the .message access
        `HTTP ${status}: ${data?.message || 'Request failed'}`,
        errorType,
        status
      );
    } else if (error.request) {
      // Request was made but no response received (DNS failure, timeout, reset).
      throw new ScrapingError(
        'No response received from server',
        'NETWORK_ERROR'
      );
    } else {
      // Failure while building the request itself.
      throw new ScrapingError(
        `Request setup error: ${error.message}`,
        'REQUEST_ERROR'
      );
    }
  }
);
Rate Limiting and Retry Headers
Handle rate limiting properly by respecting server-provided retry headers:
/**
 * GET `url` through the shared Axios client, retrying rate-limit (429)
 * and 5xx failures. When the server supplies a numeric Retry-After
 * header the function honors it before the next attempt.
 *
 * @param {string} url
 * @returns {Promise<*>} the Axios response
 * @throws the last error once retries are exhausted or the error is
 *   non-retryable
 */
async function makeRequestWithRateLimit(url) {
  const config = new RetryConfig({
    maxRetries: 5,
    // Retry only throttling and transient server-side failures.
    retryCondition: (error) => {
      return error.statusCode === 429 ||
        (error.statusCode >= 500 && error.statusCode < 600);
    },
    onRetry: (error, attempt) => {
      console.log(`Retry attempt ${attempt} for ${url}`);
    }
  });
  return await retryOperation(async () => {
    try {
      return await client.get(url);
    } catch (error) {
      if (error.statusCode === 429) {
        // Honor the server's requested pause, if it sent one.
        // NOTE(review): errors thrown by the response interceptor are
        // ScrapingErrors without a .response property, so this header is
        // only visible for raw Axios errors — confirm against the client.
        const retryAfter = error.response?.headers['retry-after'];
        // Fix: explicit radix, and skip the wait when Retry-After is not
        // a number (HTTP-date form) instead of sleeping NaN ms.
        const seconds = Number.parseInt(retryAfter, 10);
        if (Number.isFinite(seconds)) {
          const delay = seconds * 1000;
          console.log(`Rate limited, waiting ${delay}ms`);
          // retryOperation adds its own backoff delay on top of this wait.
          await sleep(delay);
        }
      }
      throw error;
    }
  }, config);
}
Circuit Breaker Pattern
For production applications, implement a circuit breaker to prevent cascading failures:
/**
 * Circuit breaker for scraping operations.
 *
 * States: CLOSED (normal), OPEN (failing fast), HALF_OPEN (probing).
 * After `threshold` consecutive failures the breaker opens; once
 * `timeout` ms have passed since the last failure a trial call is let
 * through, and its outcome decides whether the breaker closes again or
 * trips back open.
 */
class CircuitBreaker {
  constructor(threshold = 5, timeout = 60000) {
    this.threshold = threshold;   // failures needed to trip the breaker
    this.timeout = timeout;       // cool-down period in ms
    this.failureCount = 0;
    this.lastFailureTime = null;
    this.state = 'CLOSED'; // CLOSED, OPEN, HALF_OPEN
  }

  /**
   * Run `operation` under breaker control.
   * @param {() => Promise<*>} operation
   * @returns {Promise<*>} the operation's result
   * @throws Error('Circuit breaker is OPEN') while failing fast, or
   *   whatever `operation` throws
   */
  async execute(operation) {
    if (this.state === 'OPEN') {
      const coolingDown = Date.now() - this.lastFailureTime < this.timeout;
      if (coolingDown) {
        throw new Error('Circuit breaker is OPEN');
      }
      this.state = 'HALF_OPEN'; // cool-down elapsed: allow one probe call
    }
    try {
      const result = await operation();
      this.onSuccess();
      return result;
    } catch (error) {
      this.onFailure();
      throw error;
    }
  }

  // A success fully resets the breaker.
  onSuccess() {
    this.failureCount = 0;
    this.state = 'CLOSED';
  }

  // Count the failure; trip the breaker once the threshold is reached.
  onFailure() {
    this.failureCount += 1;
    this.lastFailureTime = Date.now();
    if (this.failureCount >= this.threshold) {
      this.state = 'OPEN';
    }
  }
}
Error Monitoring and Logging
Structured Error Logging
Implement comprehensive logging for debugging and monitoring:
const winston = require('winston');
// Structured JSON logger: timestamps + serialized Error stacks on every
// entry, mirrored to the console and to scraping-errors.log.
const logger = winston.createLogger({
level: 'info',
format: winston.format.combine(
winston.format.timestamp(),
winston.format.errors({ stack: true }), // include stack traces for Error objects
winston.format.json()
),
transports: [
new winston.transports.Console(),
new winston.transports.File({ filename: 'scraping-errors.log' })
]
});
/**
 * Wrap scrapePage(url) with structured logging: a start entry, a
 * warning per retry, and either a completion entry or a full error
 * record (message, type, stack). Duration covers the whole attempt,
 * retries included, in milliseconds.
 *
 * @param {string} url
 * @returns {Promise<*>} whatever scrapePage resolves with
 * @throws the final error after logging it
 */
async function scrapeWithLogging(url) {
  const startTime = Date.now();
  const logRetry = (error, attempt) => {
    logger.warn('Retry attempt', {
      url,
      attempt,
      error: error.message,
      errorType: error.type
    });
  };
  try {
    logger.info('Starting scrape', { url, timestamp: new Date().toISOString() });
    const result = await retryOperation(
      () => scrapePage(url),
      new RetryConfig({ onRetry: logRetry })
    );
    logger.info('Scrape completed', {
      url,
      duration: Date.now() - startTime,
      success: true
    });
    return result;
  } catch (error) {
    logger.error('Scrape failed', {
      url,
      duration: Date.now() - startTime,
      error: error.message,
      errorType: error.type,
      stack: error.stack
    });
    throw error;
  }
}
Best Practices Summary
- Classify Errors: Differentiate between retryable and non-retryable errors
- Use Exponential Backoff: Prevent overwhelming servers with immediate retries
- Set Reasonable Timeouts: Balance between patience and efficiency
- Implement Circuit Breakers: Prevent cascading failures in production
- Log Comprehensively: Include context for debugging and monitoring
- Respect Rate Limits: Honor server-provided retry headers
- Handle Resource Cleanup: Always close browsers and connections in finally blocks
When working with complex navigation scenarios, you might also want to review our guide on how to handle timeouts in Puppeteer for additional timeout management strategies.
By implementing these error handling and retry strategies, you'll build more resilient JavaScript scraping applications that can handle the unpredictable nature of web scraping while maintaining good citizenship on the web.