How do you deal with errors when loading HTML in Cheerio?

When using Cheerio for web scraping with Node.js, proper error handling is essential for building reliable scrapers. Errors can occur at multiple levels: network requests, HTML parsing, and element selection. This guide covers comprehensive error handling strategies for Cheerio-based web scrapers.

Understanding Error Types

Cheerio-related errors typically fall into these categories:

  1. Network errors - Connection timeouts, DNS failures, server errors
  2. HTTP errors - 404, 500, rate limiting (429), authentication (401/403)
  3. Data errors - Invalid HTML, empty responses, encoding issues
  4. Parsing errors - Malformed data passed to cheerio.load()
  5. Selector errors - Invalid CSS selectors or missing elements

HTTP Request Error Handling

Axios with Comprehensive Error Handling

const axios = require('axios');
const cheerio = require('cheerio');

/**
 * Download a page with axios and hand the body to cheerio.load().
 *
 * Any status below 500 (4xx included) is accepted and parsed; other statuses
 * make axios reject, which is reported here as an HTTP error.
 *
 * @param {string} url - Address of the page to fetch.
 * @param {object} [options]
 * @param {number} [options.timeout] - Request timeout in ms; 10000 is used when unset or 0.
 * @param {object} [options.headers] - Extra headers merged over the default User-Agent.
 * @returns {Promise<Function>} Whatever cheerio.load() returns for the response body.
 * @throws {Error} Descriptive error whose `cause` is the underlying failure.
 */
async function fetchAndLoad(url, options = {}) {
  // Resolved once so the request and the timeout message always agree.
  const timeout = options.timeout || 10000;

  try {
    const response = await axios.get(url, {
      timeout,
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; WebScraper/1.0)',
        ...options.headers
      },
      maxRedirects: 5,
      validateStatus: (status) => status < 500 // Accept 4xx but not 5xx
    });

    // Validate response has HTML content
    if (!response.data || typeof response.data !== 'string') {
      throw new Error('Invalid response: No HTML content received');
    }

    return cheerio.load(response.data);
  } catch (error) {
    // The original error is attached as `cause` so its stack trace and any
    // request/response details are not lost when the message is rewritten.
    if (error.code === 'ECONNABORTED') {
      throw new Error(`Request timeout after ${timeout}ms`, { cause: error });
    }

    if (error.response) {
      // Server responded with error status
      throw new Error(
        `HTTP ${error.response.status}: ${error.response.statusText}`,
        { cause: error }
      );
    }

    if (error.request) {
      // Network error (no response)
      throw new Error(`Network error: ${error.message}`, { cause: error });
    }

    // Other errors (validation above, parsing, etc.)
    throw new Error(`Request failed: ${error.message}`, { cause: error });
  }
}

// Usage with error handling
/**
 * Load a page through fetchAndLoad(), retrying failed attempts with
 * exponential backoff (2^attempt seconds between attempts).
 *
 * @param {string} url - Address of the page to load.
 * @param {number} [maxRetries=3] - Total number of attempts; must be a positive integer.
 * @returns {Promise<Function>} The value resolved by fetchAndLoad().
 * @throws {RangeError} When maxRetries is not a positive integer.
 * @throws {Error} After the last attempt fails; the last failure is attached as `cause`.
 */
async function scrapeWithRetry(url, maxRetries = 3) {
  // Without this guard a value such as 0, 2.5 or '3' lets the loop finish
  // without ever returning or throwing, so callers would receive undefined.
  if (!Number.isInteger(maxRetries) || maxRetries < 1) {
    throw new RangeError(`maxRetries must be a positive integer, got ${maxRetries}`);
  }

  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      const $ = await fetchAndLoad(url, { timeout: 15000 });

      // Validate page loaded correctly
      if ($('body').length === 0) {
        throw new Error('Invalid HTML: No body element found');
      }

      return $;
    } catch (error) {
      console.warn(`Attempt ${attempt} failed:`, error.message);

      if (attempt === maxRetries) {
        // Keep the last failure reachable for callers that need its stack or details.
        throw new Error(`Failed after ${maxRetries} attempts: ${error.message}`, { cause: error });
      }

      // Exponential backoff
      const backoffMs = Math.pow(2, attempt) * 1000;
      await new Promise((resolve) => setTimeout(resolve, backoffMs));
    }
  }
}

Fetch API with Advanced Error Handling

const fetch = require('node-fetch');
const cheerio = require('cheerio');
const { AbortController } = require('abort-controller');

/**
 * Download a page with fetch() and parse it with Cheerio, aborting the
 * request when it takes longer than the configured timeout.
 *
 * @param {string} url - Address of the page to fetch.
 * @param {object} [options] - Forwarded to fetch(), except `headers`, which is merged.
 * @param {number} [options.timeout] - Abort delay in ms; 10000 is used when unset or 0.
 * @param {object} [options.headers] - Extra headers merged over the default User-Agent/Accept.
 * @returns {Promise<Function>} Whatever cheerio.load() returns for the response body.
 * @throws {Error} On timeout, DNS failure, refused connection, non-OK status or empty body.
 */
async function fetchAndLoad(url, options = {}) {
  const controller = new AbortController();
  const timeout = options.timeout || 10000;

  // Headers are separated from the pass-through options: spreading the whole
  // `options` object after `headers` would replace the merged header set with
  // the caller's headers alone and drop the default User-Agent and Accept.
  const { headers: extraHeaders, ...fetchOptions } = options;

  // Set up timeout
  const timeoutId = setTimeout(() => controller.abort(), timeout);

  try {
    const response = await fetch(url, {
      signal: controller.signal,
      redirect: 'follow',
      // NOTE: a caller-supplied `signal` or `redirect` still takes precedence here.
      ...fetchOptions,
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; WebScraper/1.0)',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        ...extraHeaders
      }
    });

    // Check response status
    if (!response.ok) {
      const errorBody = await response.text().catch(() => 'Unknown error');
      throw new Error(`HTTP ${response.status}: ${response.statusText} - ${errorBody.substring(0, 200)}`);
    }

    // Validate content type
    const contentType = response.headers.get('content-type');
    if (contentType && !contentType.includes('text/html') && !contentType.includes('application/xhtml')) {
      console.warn(`Warning: Unexpected content type: ${contentType}`);
    }

    const html = await response.text();

    // Validate HTML content
    if (!html || html.trim().length === 0) {
      throw new Error('Empty response received');
    }

    return cheerio.load(html);
  } catch (error) {
    // Translate low-level failures; the original error stays available as `cause`.
    if (error.name === 'AbortError') {
      throw new Error(`Request timeout after ${timeout}ms`, { cause: error });
    }
    if (error.code === 'ENOTFOUND') {
      throw new Error(`DNS lookup failed for ${url}`, { cause: error });
    }
    if (error.code === 'ECONNREFUSED') {
      throw new Error(`Connection refused to ${url}`, { cause: error });
    }
    throw error;
  } finally {
    // Cleared only once everything has settled, so the timer also bounds the
    // time spent reading the response body, not just receiving the headers.
    clearTimeout(timeoutId);
  }
}

Cheerio-Specific Error Handling

Safe HTML Loading

/**
 * Validate the input and load it with Cheerio, reporting every failure as a
 * single "Cheerio load failed" error.
 *
 * @param {string|Buffer} html - Markup to parse.
 * @param {object} [options] - Cheerio load options; override the defaults
 *   `xmlMode: false` and `decodeEntities: true`.
 * @returns {Function} The value returned by cheerio.load().
 * @throws {Error} "Cheerio load failed: ..." with the underlying error attached as `cause`.
 */
function safeCheerioLoad(html, options = {}) {
  try {
    // Validate input
    if (html === null || html === undefined) {
      throw new Error('HTML content is null or undefined');
    }

    if (typeof html !== 'string' && !Buffer.isBuffer(html)) {
      throw new Error('HTML must be a string or Buffer');
    }

    // Load with error handling
    const $ = cheerio.load(html, {
      xmlMode: false,
      decodeEntities: true,
      ...options
    });

    // Basic validation
    if (typeof $ !== 'function') {
      throw new Error('Failed to create Cheerio instance');
    }

    return $;
  } catch (error) {
    // Attach the original error so its stack trace is not lost behind the prefix.
    throw new Error(`Cheerio load failed: ${error.message}`, { cause: error });
  }
}

Element Existence Validation

/**
 * Read the trimmed text of the first match for each selector.
 *
 * A key maps to null when its selector matches nothing, when the matched text
 * is empty after trimming, or when the lookup throws; none of these cases
 * interrupts extraction of the remaining keys.
 *
 * @param {Function} $ - Query function such as the one returned by cheerio.load().
 * @param {Object<string, string>} selectors - Result key mapped to a CSS selector.
 * @returns {Object<string, (string|null)>} One entry per key of `selectors`.
 */
function safeExtractData($, selectors) {
  const results = {};

  Object.entries(selectors).forEach(([key, selector]) => {
    let value = null;

    try {
      const matched = $(selector);

      if (matched.length === 0) {
        console.warn(`No elements found for selector: ${selector}`);
      } else {
        // Empty text collapses to null so callers see a single "missing" value.
        const trimmed = matched.first().text().trim();
        value = trimmed || null;
      }
    } catch (error) {
      console.error(`Error extracting ${key} with selector ${selector}:`, error.message);
      value = null;
    }

    results[key] = value;
  });

  return results;
}

// Usage example: map each result key to the CSS selector that locates it.
// NOTE: `html` is expected to already hold the markup to parse.
const fieldSelectors = {
  title: 'h1',
  description: '.description',
  price: '.price',
  availability: '.stock-status'
};

const $ = cheerio.load(html);
const data = safeExtractData($, fieldSelectors);

Production-Ready Error Handling Pattern

const axios = require('axios');
const cheerio = require('cheerio');

/**
 * Scraper that fetches a page with axios, validates it, and extracts text for
 * a set of selectors, retrying failed attempts with a linearly growing delay.
 */
class CheerioScraper {
  /**
   * @param {object} [options] - Overrides for the defaults below.
   * @param {number} [options.timeout=15000] - Request timeout in ms.
   * @param {number} [options.maxRetries=3] - Maximum number of attempts.
   * @param {number} [options.retryDelay=1000] - Base delay in ms, multiplied by the attempt number.
   * @param {string} [options.userAgent] - User-Agent header sent with each request.
   */
  constructor(options = {}) {
    this.options = {
      timeout: 15000,
      maxRetries: 3,
      retryDelay: 1000,
      userAgent: 'Mozilla/5.0 (compatible; WebScraper/1.0)',
      ...options
    };
  }

  /**
   * Fetch, validate and extract, retrying on retryable failures.
   *
   * @param {string} url - Page to scrape.
   * @param {Object<string, (string|{selector: string, required: boolean})>} selectors
   * @returns {Promise<object>} `{ success: true, data, attempt }` on success, otherwise
   *   `{ success: false, error, attempts }` where `attempts` is the number of
   *   attempts that were actually made.
   */
  async scrape(url, selectors) {
    let lastError;
    // Tracked separately because a non-retryable error ends the loop early.
    let attemptsMade = 0;

    for (let attempt = 1; attempt <= this.options.maxRetries; attempt++) {
      attemptsMade = attempt;

      try {
        // Fetch HTML
        const $ = await this.fetchHTML(url);

        // Validate page structure
        await this.validatePage($);

        // Extract data
        const data = this.extractData($, selectors);

        return { success: true, data, attempt };

      } catch (error) {
        lastError = error;
        console.warn(`Attempt ${attempt}/${this.options.maxRetries} failed:`, error.message);

        // Don't retry certain errors
        if (this.isNonRetryableError(error)) {
          break;
        }

        if (attempt < this.options.maxRetries) {
          await this.delay(this.options.retryDelay * attempt);
        }
      }
    }

    return {
      success: false,
      // lastError is unset when maxRetries < 1 and the loop never ran.
      error: lastError ? lastError.message : 'No attempts were made (maxRetries < 1)',
      attempts: attemptsMade
    };
  }

  /**
   * Download the page and load it into Cheerio.
   *
   * @param {string} url
   * @returns {Promise<Function>} The value returned by cheerio.load().
   * @throws {Error} On 4xx responses ("HTTP <status>: <text>"), on an empty body,
   *   or whatever axios rejects with (5xx, network errors, timeouts).
   */
  async fetchHTML(url) {
    const response = await axios.get(url, {
      timeout: this.options.timeout,
      headers: { 'User-Agent': this.options.userAgent },
      validateStatus: (status) => status < 500
    });

    // 4xx responses pass validateStatus, so they must be surfaced here;
    // otherwise an error page is parsed as data and the 401/403/404 checks in
    // isNonRetryableError() can never match.
    if (response.status >= 400) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    if (!response.data) {
      throw new Error('No data received from server');
    }

    return cheerio.load(response.data);
  }

  /**
   * Reject documents that have no body or that look like an error page.
   *
   * @param {Function} $ - Loaded Cheerio document.
   * @throws {Error} When the body is missing or an error page is detected.
   */
  async validatePage($) {
    // Check if page loaded properly
    if ($('body').length === 0) {
      throw new Error('Invalid HTML structure: missing body element');
    }

    // Check for common error pages
    if ($('title').text().toLowerCase().includes('error') ||
        $('.error, .not-found').length > 0) {
      throw new Error('Error page detected');
    }
  }

  /**
   * Extract the trimmed text of the first match for each configured selector.
   *
   * @param {Function} $ - Loaded Cheerio document.
   * @param {Object<string, (string|{selector: string, required: boolean})>} selectors
   * @returns {Object<string, (string|null)>} Null for missing or empty optional fields.
   * @throws {Error} When a field marked `required` cannot be extracted.
   */
  extractData($, selectors) {
    const data = {};

    for (const [key, config] of Object.entries(selectors)) {
      try {
        const selector = typeof config === 'string' ? config : config.selector;
        const required = typeof config === 'object' ? config.required : false;

        const element = $(selector);

        if (element.length === 0) {
          if (required) {
            throw new Error(`Required element not found: ${selector}`);
          }
          data[key] = null;
          continue;
        }

        data[key] = element.first().text().trim() || null;

      } catch (error) {
        // Failures on required fields abort the extraction; optional ones degrade to null.
        if (typeof config === 'object' && config.required) {
          throw error;
        }
        console.warn(`Failed to extract ${key}:`, error.message);
        data[key] = null;
      }
    }

    return data;
  }

  /**
   * Decide whether retrying could plausibly change the outcome.
   *
   * @param {Error} error
   * @returns {boolean} True for 401/403/404 responses and structural page problems.
   */
  isNonRetryableError(error) {
    // Word boundaries keep the status codes from matching inside larger
    // numbers (for example a timeout value that happens to contain "404").
    const nonRetryablePatterns = [
      /\b404\b/,
      /\b403\b/,
      /\b401\b/,
      /Invalid HTML structure/i,
      /Required element not found/i
    ];

    return nonRetryablePatterns.some(pattern => pattern.test(error.message));
  }

  /**
   * @param {number} ms - Milliseconds to wait.
   * @returns {Promise<void>} Resolves after the delay.
   */
  delay(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}

// Usage
const scraper = new CheerioScraper({ timeout: 20000, maxRetries: 2 });

// `await` at the top level is a SyntaxError in a CommonJS script (this example
// loads its dependencies with require()), so the call runs inside an async
// entry point instead.
async function runScrapeExample() {
  const result = await scraper.scrape('https://example.com', {
    title: 'h1',
    price: { selector: '.price', required: true },
    description: '.description'
  });

  if (result.success) {
    console.log('Scraped data:', result.data);
  } else {
    console.error('Scraping failed:', result.error);
  }
}

// Report anything unexpected instead of leaving a floating promise.
runScrapeExample().catch((error) => {
  console.error('Unexpected failure:', error);
});

Best Practices Summary

  1. Always use timeouts to prevent hanging requests
  2. Implement retry logic with exponential backoff for transient errors
  3. Validate response data before passing to Cheerio
  4. Check element existence before manipulation
  5. Use specific error messages for easier debugging
  6. Log warnings for non-critical failures
  7. Handle rate limiting with appropriate delays
  8. Validate HTML structure to detect error pages

Remember that robust error handling is crucial for production web scrapers, as websites can change unexpectedly and network conditions vary.

Related Questions

Get Started Now

WebScraping.AI provides rotating proxies, Chromium rendering, and a built-in HTML parser for web scraping.
Icon