When using Cheerio for web scraping with Node.js, proper error handling is essential for building reliable scrapers. Errors can occur at multiple levels: network requests, HTML parsing, and element selection. This guide covers comprehensive error handling strategies for Cheerio-based web scrapers.
Understanding Error Types
Cheerio-related errors typically fall into these categories:
- Network errors - Connection timeouts, DNS failures, server errors
- HTTP errors - 404, 500, rate limiting (429), authentication/authorization failures (401/403)
- Data errors - Invalid HTML, empty responses, encoding issues
- Parsing errors - Malformed data passed to cheerio.load()
- Selector errors - Invalid CSS selectors or missing elements
HTTP Request Error Handling
Axios with Comprehensive Error Handling
const axios = require('axios');
const cheerio = require('cheerio');
/**
 * Download a page with axios and load its body into Cheerio.
 *
 * Responses with a status below 500 (including 4xx) are accepted and parsed;
 * only statuses of 500 and above are rejected by axios.
 *
 * @param {string} url - Address of the page to download.
 * @param {object} [options]
 * @param {number} [options.timeout=10000] - Request timeout in milliseconds.
 * @param {object} [options.headers] - Extra request headers, merged over the default User-Agent.
 * @returns {Promise<*>} Whatever `cheerio.load()` returns for the response body.
 * @throws {Error} With a message describing the failure (timeout, HTTP status,
 *   network, or other). The underlying error is preserved on `error.cause`.
 */
async function fetchAndLoad(url, options = {}) {
  const timeout = options.timeout || 10000;

  try {
    const response = await axios.get(url, {
      timeout,
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; WebScraper/1.0)',
        ...options.headers,
      },
      maxRedirects: 5,
      validateStatus: (status) => status < 500, // Accept 4xx but not 5xx
    });

    // Validate response has HTML content
    if (!response.data || typeof response.data !== 'string') {
      throw new Error('Invalid response: No HTML content received');
    }

    return cheerio.load(response.data);
  } catch (error) {
    let message;
    if (error.code === 'ECONNABORTED') {
      message = `Request timeout after ${timeout}ms`;
    } else if (error.response) {
      // Server responded with error status
      message = `HTTP ${error.response.status}: ${error.response.statusText}`;
    } else if (error.request) {
      // Network error (no response)
      message = `Network error: ${error.message}`;
    } else {
      // Other errors (parsing, validation above, etc.)
      message = `Request failed: ${error.message}`;
    }
    // Keep the original error (stack, code, response) reachable for callers
    // instead of discarding it when the friendlier message is built.
    throw new Error(message, { cause: error });
  }
}
// Usage with error handling
/**
 * Fetch a page via `fetchAndLoad`, retrying with exponential backoff.
 *
 * Every failure is retried, including a loaded document with no `<body>`,
 * until the attempt budget is exhausted.
 *
 * @param {string} url - Address of the page to scrape.
 * @param {number} [maxRetries=3] - Total number of attempts; must be at least 1.
 * @returns {Promise<*>} The loaded document returned by `fetchAndLoad`.
 * @throws {RangeError} If `maxRetries` is less than 1 or not a number.
 * @throws {Error} After the last attempt fails; the last failure is on `error.cause`.
 */
async function scrapeWithRetry(url, maxRetries = 3) {
  // Without this guard the loop body never runs and the function would
  // resolve to `undefined`, which callers would then try to call as `$`.
  if (!(maxRetries >= 1)) {
    throw new RangeError(`maxRetries must be at least 1, got ${maxRetries}`);
  }

  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      const $ = await fetchAndLoad(url, { timeout: 15000 });

      // Validate page loaded correctly
      if ($('body').length === 0) {
        throw new Error('Invalid HTML: No body element found');
      }
      return $;
    } catch (error) {
      console.warn(`Attempt ${attempt} failed:`, error.message);

      // "No further attempt fits in the budget" also covers a fractional
      // maxRetries, where `attempt === maxRetries` would never be true.
      const isLastAttempt = attempt + 1 > maxRetries;
      if (isLastAttempt) {
        throw new Error(`Failed after ${maxRetries} attempts: ${error.message}`, { cause: error });
      }

      // Exponential backoff: 2s, 4s, 8s, ...
      await new Promise((resolve) => setTimeout(resolve, Math.pow(2, attempt) * 1000));
    }
  }
}
Fetch API with Advanced Error Handling
const fetch = require('node-fetch');
const cheerio = require('cheerio');
const { AbortController } = require('abort-controller');
/**
 * Download a page with `fetch` and load its body into Cheerio.
 *
 * @param {string} url - Address of the page to download.
 * @param {object} [options] - `timeout` and `headers` are handled here; every
 *   other property is passed through to `fetch` and overrides the defaults
 *   set below (including `signal` and `redirect`).
 * @param {number} [options.timeout=10000] - Abort the request after this many milliseconds.
 * @param {object} [options.headers] - Extra headers, merged over the default
 *   User-Agent and Accept headers.
 * @returns {Promise<*>} Whatever `cheerio.load()` returns for the response body.
 * @throws {Error} On timeout, DNS failure, refused connection, non-OK status,
 *   or an empty body. Any other error is rethrown unchanged.
 */
async function fetchAndLoad(url, options = {}) {
  // `headers` is pulled out so the pass-through spread below cannot replace
  // the merged header set with the caller's partial one.
  const { headers: extraHeaders, ...passthrough } = options;
  const timeout = options.timeout || 10000;

  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeout);

  try {
    const response = await fetch(url, {
      signal: controller.signal,
      redirect: 'follow',
      ...passthrough,
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; WebScraper/1.0)',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        ...extraHeaders,
      },
    });

    // Check response status
    if (!response.ok) {
      const errorBody = await response.text().catch(() => 'Unknown error');
      throw new Error(`HTTP ${response.status}: ${response.statusText} - ${errorBody.substring(0, 200)}`);
    }

    // Validate content type (warn only; the body is still parsed)
    const contentType = response.headers.get('content-type');
    if (contentType && !contentType.includes('text/html') && !contentType.includes('application/xhtml')) {
      console.warn(`Warning: Unexpected content type: ${contentType}`);
    }

    const html = await response.text();

    // Validate HTML content
    if (!html || html.trim().length === 0) {
      throw new Error('Empty response received');
    }

    return cheerio.load(html);
  } catch (error) {
    if (error.name === 'AbortError') {
      throw new Error(`Request timeout after ${timeout}ms`, { cause: error });
    }
    if (error.code === 'ENOTFOUND') {
      throw new Error(`DNS lookup failed for ${url}`, { cause: error });
    }
    if (error.code === 'ECONNREFUSED') {
      throw new Error(`Connection refused to ${url}`, { cause: error });
    }
    throw error;
  } finally {
    // Cleared only once the body has been read (or the request failed), so
    // the timeout also covers a response whose body stalls mid-download.
    clearTimeout(timeoutId);
  }
}
Cheerio-Specific Error Handling
Safe HTML Loading
/**
 * Validate the input and hand it to `cheerio.load()`, reporting every
 * failure with a "Cheerio load failed: ..." message.
 *
 * @param {string|Buffer} html - Markup to parse.
 * @param {object} [options] - Loader options, applied over the defaults
 *   `xmlMode: false` and `decodeEntities: true`.
 * @returns {Function} The function returned by `cheerio.load()`.
 * @throws {Error} If the input is null/undefined, is neither a string nor a
 *   Buffer, or loading fails.
 */
function safeCheerioLoad(html, options = {}) {
  const loaderDefaults = { xmlMode: false, decodeEntities: true };

  try {
    // Reject missing input before looking at its type.
    if (html == null) {
      throw new Error('HTML content is null or undefined');
    }

    const isAcceptedType = typeof html === 'string' || Buffer.isBuffer(html);
    if (!isAcceptedType) {
      throw new Error('HTML must be a string or Buffer');
    }

    const root = cheerio.load(html, { ...loaderDefaults, ...options });

    // A usable instance is callable; anything else counts as a failed load.
    if (typeof root === 'function') {
      return root;
    }
    throw new Error('Failed to create Cheerio instance');
  } catch (failure) {
    throw new Error(`Cheerio load failed: ${failure.message}`);
  }
}
Element Existence Validation
/**
 * Read the trimmed text of the first match for each selector.
 *
 * A field becomes `null` when nothing matches, when the matched text is
 * empty after trimming, or when the lookup throws. Misses are logged with
 * `console.warn` and thrown errors with `console.error`.
 *
 * @param {Function} $ - Loaded Cheerio document.
 * @param {Object<string, string>} selectors - Output field name to CSS selector.
 * @returns {Object<string, string|null>} One entry per key in `selectors`.
 */
function safeExtractData($, selectors) {
  // Text of the first match, or null when there is no match / no text.
  const firstTextOrNull = (selector) => {
    const matches = $(selector);
    if (matches.length === 0) {
      console.warn(`No elements found for selector: ${selector}`);
      return null;
    }
    const trimmed = matches.first().text().trim();
    return trimmed || null;
  };

  const results = {};
  Object.entries(selectors).forEach(([key, selector]) => {
    try {
      results[key] = firstTextOrNull(selector);
    } catch (error) {
      console.error(`Error extracting ${key} with selector ${selector}:`, error.message);
      results[key] = null;
    }
  });
  return results;
}
// Usage example: each key is an output field, each value the CSS selector
// that locates it. NOTE(review): `html` is not defined in this snippet; it is
// presumably the page markup fetched earlier.
const productSelectors = {
  title: 'h1',
  description: '.description',
  price: '.price',
  availability: '.stock-status',
};
const $ = cheerio.load(html);
const data = safeExtractData($, productSelectors);
Production-Ready Error Handling Pattern
const axios = require('axios');
const cheerio = require('cheerio');
/**
 * Scraper that fetches a page with axios, validates it, extracts text by
 * CSS selector, and retries failed attempts with a linearly growing delay.
 */
class CheerioScraper {
  /**
   * @param {object} [options] - Overrides for the defaults below.
   * @param {number} [options.timeout=15000] - Request timeout in milliseconds.
   * @param {number} [options.maxRetries=3] - Maximum number of attempts per scrape.
   * @param {number} [options.retryDelay=1000] - Base delay in ms; multiplied by the attempt number.
   * @param {string} [options.userAgent] - User-Agent header sent with each request.
   */
  constructor(options = {}) {
    this.options = {
      timeout: 15000,
      maxRetries: 3,
      retryDelay: 1000,
      userAgent: 'Mozilla/5.0 (compatible; WebScraper/1.0)',
      ...options,
    };
  }

  /**
   * Fetch, validate and extract, retrying failures that may be transient.
   *
   * @param {string} url - Page to scrape.
   * @param {Object<string, string|{selector: string, required?: boolean}>} selectors
   *   Output field name to selector (or selector config).
   * @returns {Promise<{success: true, data: object, attempt: number}
   *   | {success: false, error: string, attempts: number}>}
   *   Never rejects for scrape failures; they are reported in the result.
   */
  async scrape(url, selectors) {
    const { maxRetries, retryDelay } = this.options;
    let lastError;
    let attemptsMade = 0;

    for (let attempt = 1; attempt <= maxRetries; attempt++) {
      attemptsMade = attempt;
      try {
        const $ = await this.fetchHTML(url);
        await this.validatePage($);
        const data = this.extractData($, selectors);
        return { success: true, data, attempt };
      } catch (error) {
        lastError = error;
        console.warn(`Attempt ${attempt}/${maxRetries} failed:`, error.message);

        // Don't retry certain errors
        if (this.isNonRetryableError(error)) {
          break;
        }
        if (attempt < maxRetries) {
          await this.delay(retryDelay * attempt);
        }
      }
    }

    return {
      success: false,
      // lastError is unset when maxRetries < 1 and the loop never ran.
      error: lastError
        ? lastError.message
        : 'No attempts were made: maxRetries must be at least 1',
      // Report attempts actually made; a non-retryable error stops early.
      attempts: attemptsMade,
    };
  }

  /**
   * Download the page and load it into Cheerio.
   *
   * @param {string} url - Page to download.
   * @returns {Promise<*>} Whatever `cheerio.load()` returns for the response body.
   * @throws {Error} For HTTP statuses of 400 and above (with a numeric
   *   `status` property for 4xx), for an empty body, or for axios failures.
   */
  async fetchHTML(url) {
    const response = await axios.get(url, {
      timeout: this.options.timeout,
      headers: { 'User-Agent': this.options.userAgent },
      validateStatus: (status) => status < 500,
    });

    // validateStatus lets 4xx responses resolve, so they must be rejected
    // here; otherwise 401/403/404 pages would be scraped as real content and
    // the non-retryable checks in isNonRetryableError could never trigger.
    if (response.status >= 400) {
      const reason = response.statusText ? `: ${response.statusText}` : '';
      const httpError = new Error(`HTTP ${response.status}${reason}`);
      httpError.status = response.status;
      throw httpError;
    }

    if (!response.data) {
      throw new Error('No data received from server');
    }
    return cheerio.load(response.data);
  }

  /**
   * Reject documents that have no body or that look like an error page.
   *
   * @param {Function} $ - Loaded Cheerio document.
   * @returns {Promise<void>}
   * @throws {Error} If there is no `<body>`, the title contains "error", or an
   *   element matching `.error` / `.not-found` exists.
   */
  async validatePage($) {
    // Check if page loaded properly
    if ($('body').length === 0) {
      throw new Error('Invalid HTML structure: missing body element');
    }

    // Check for common error pages
    const titleMentionsError = $('title').text().toLowerCase().includes('error');
    if (titleMentionsError || $('.error, .not-found').length > 0) {
      throw new Error('Error page detected');
    }
  }

  /**
   * Read the trimmed text of the first match for each configured selector.
   *
   * @param {Function} $ - Loaded Cheerio document.
   * @param {Object<string, string|{selector: string, required?: boolean}>} selectors
   * @returns {Object<string, string|null>} `null` for optional fields that
   *   are missing, empty, or fail to extract.
   * @throws {Error} If a field marked `required` is missing or fails to extract.
   */
  extractData($, selectors) {
    const data = {};

    for (const [key, config] of Object.entries(selectors)) {
      // `typeof null` is 'object', so null is excluded explicitly.
      const isConfigObject = config !== null && typeof config === 'object';
      const required = isConfigObject ? Boolean(config.required) : false;

      try {
        let selector;
        if (typeof config === 'string') {
          selector = config;
        } else if (isConfigObject) {
          selector = config.selector;
        }

        const element = $(selector);
        if (element.length === 0) {
          if (required) {
            throw new Error(`Required element not found: ${selector}`);
          }
          data[key] = null;
          continue;
        }
        data[key] = element.first().text().trim() || null;
      } catch (error) {
        if (required) {
          throw error;
        }
        console.warn(`Failed to extract ${key}:`, error.message);
        data[key] = null;
      }
    }
    return data;
  }

  /**
   * Decide whether retrying could change the outcome.
   *
   * @param {Error} error - Failure from the latest attempt.
   * @returns {boolean} True for HTTP 401/403/404, a missing body, or a
   *   missing required element.
   */
  isNonRetryableError(error) {
    const nonRetryableStatuses = [401, 403, 404];
    if (nonRetryableStatuses.includes(error.status)) {
      return true;
    }

    // Word boundaries keep unrelated numbers (e.g. "timeout of 4040ms") from
    // being mistaken for a status code.
    const nonRetryablePatterns = [
      /\b404\b/,
      /\b403\b/,
      /\b401\b/,
      /Invalid HTML structure/i,
      /Required element not found/i,
    ];
    return nonRetryablePatterns.some((pattern) => pattern.test(error.message));
  }

  /**
   * @param {number} ms - Milliseconds to wait.
   * @returns {Promise<void>} Resolves after the delay.
   */
  delay(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }
}
// Usage
// `await` is only allowed at the top level of an ES module. This example
// loads its dependencies with require() (CommonJS), so the calls run inside
// an async function instead.
async function main() {
  const scraper = new CheerioScraper({ timeout: 20000, maxRetries: 2 });
  const result = await scraper.scrape('https://example.com', {
    title: 'h1',
    price: { selector: '.price', required: true },
    description: '.description',
  });

  if (result.success) {
    console.log('Scraped data:', result.data);
  } else {
    console.error('Scraping failed:', result.error);
  }
}

// Surface anything unexpected instead of leaving a floating rejected promise.
main().catch((error) => {
  console.error('Scraping failed:', error.message);
});
Best Practices Summary
- Always use timeouts to prevent hanging requests
- Implement retry logic with exponential backoff for transient errors
- Validate response data before passing to Cheerio
- Check element existence before manipulation
- Use specific error messages for easier debugging
- Log warnings for non-critical failures
- Handle rate limiting with appropriate delays
- Validate HTML structure to detect error pages
Remember that robust error handling is crucial for production web scrapers, as websites can change unexpectedly and network conditions vary.