How do you use Cheerio with asynchronous JavaScript code?

Cheerio is a fast, flexible implementation of core jQuery designed for server-side HTML parsing and manipulation. When working with Cheerio in asynchronous JavaScript code, you'll typically combine it with an HTTP client to fetch web pages and then extract and process their content.

Installation

First, install the required packages:

npm install cheerio axios

Basic Async/Await Usage

Here's a foundational example of using Cheerio with async/await to scrape web content:

const axios = require('axios');
const cheerio = require('cheerio');

async function scrapeWebsite(url) {
    try {
        // Fetch HTML content
        const response = await axios.get(url);
        const $ = cheerio.load(response.data);

        // Extract data using jQuery-like selectors
        const links = [];
        $('a').each((index, element) => {
            const link = {
                text: $(element).text().trim(),
                href: $(element).attr('href'),
                title: $(element).attr('title') || null
            };
            if (link.href) links.push(link);
        });

        return links;
    } catch (error) {
        console.error('Scraping failed:', error.message);
        throw error;
    }
}

// Usage
scrapeWebsite('https://example.com')
    .then(links => console.log('Found links:', links))
    .catch(error => console.error('Error:', error));
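
If your project uses ES modules instead of CommonJS, the equivalent imports are shown below (cheerio 1.x exposes its API as named exports, so the wildcard import is the documented pattern):

import axios from 'axios';
import * as cheerio from 'cheerio';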

Advanced Examples

Processing Multiple Pages Concurrently

async function scrapeMultiplePages(urls) {
    // Promise.allSettled never rejects, so no try/catch is needed here
    const results = await Promise.allSettled(urls.map(url => scrapeWebsite(url)));

    return results.map((result, index) => ({
        url: urls[index],
        success: result.status === 'fulfilled',
        data: result.status === 'fulfilled' ? result.value : null,
        error: result.status === 'rejected' ? result.reason.message : null
    }));
}

// Usage
const urls = ['https://site1.com', 'https://site2.com', 'https://site3.com'];
scrapeMultiplePages(urls)
    .then(results => console.log('Batch results:', results));

Complex Data Extraction

async function scrapeArticleData(url) {
    try {
        const response = await axios.get(url, {
            headers: {
                'User-Agent': 'Mozilla/5.0 (compatible; bot/1.0)'
            }
        });

        const $ = cheerio.load(response.data);

        const article = {
            title: $('h1').first().text().trim(),
            author: $('.author').text().trim(),
            publishDate: $('time').attr('datetime'),
            content: $('.article-content p').map((i, el) => $(el).text().trim()).get(),
            images: $('.article-content img').map((i, el) => ({
                src: $(el).attr('src'),
                alt: $(el).attr('alt')
            })).get(),
            tags: $('.tag').map((i, el) => $(el).text().trim()).get()
        };

        return article;
    } catch (error) {
        console.error('Article scraping failed:', error);
        throw error;
    }
}

With Rate Limiting and Retry Logic

const delay = ms => new Promise(resolve => setTimeout(resolve, ms));

async function scrapeWithRetry(url, maxRetries = 3, delayMs = 1000) {
    for (let attempt = 1; attempt <= maxRetries; attempt++) {
        try {
            // Back off before each retry; the wait grows with the attempt number
            if (attempt > 1) {
                await delay(delayMs * attempt);
            }

            const response = await axios.get(url, {
                timeout: 10000,
                headers: {
                    'User-Agent': 'Mozilla/5.0 (compatible; scraper/1.0)'
                }
            });

            const $ = cheerio.load(response.data);

            // Your scraping logic here
            const data = extractData($);
            return data;

        } catch (error) {
            console.log(`Attempt ${attempt} failed for ${url}:`, error.message);

            if (attempt === maxRetries) {
                throw new Error(`Failed after ${maxRetries} attempts: ${error.message}`);
            }
        }
    }
}

function extractData($) {
    return {
        title: $('title').text(),
        headings: $('h1, h2, h3').map((i, el) => $(el).text()).get(),
        paragraphs: $('p').length
    };
}
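
The function above handles retries with backoff; to actually rate-limit a batch of URLs, you can also cap how many requests run at once. Here is a minimal sketch reusing the delay helper defined earlier (the scrapeInBatches name and the batchSize and pauseMs defaults are illustrative, not from any library):

async function scrapeInBatches(urls, batchSize = 3, pauseMs = 1000) {
    const results = [];
    for (let i = 0; i < urls.length; i += batchSize) {
        // Run one small batch of requests concurrently...
        const batch = urls.slice(i, i + batchSize);
        const settled = await Promise.allSettled(batch.map(url => scrapeWithRetry(url)));
        results.push(...settled);
        // ...then pause before starting the next batch
        if (i + batchSize < urls.length) await delay(pauseMs);
    }
    return results;
}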

Error Handling Best Practices

async function robustScraper(url) {
    try {
        const response = await axios.get(url, {
            timeout: 15000,
            validateStatus: status => status < 400
        });

        if (!response.data) {
            throw new Error('Empty response received');
        }

        const $ = cheerio.load(response.data);

        // Validate that expected elements exist
        if ($('title').length === 0) {
            console.warn('No title found - possible parsing issue');
        }

        return extractData($);

    } catch (error) {
        // Handle different types of errors
        if (error.code === 'ECONNABORTED') {
            throw new Error('Request timeout - try again later');
        } else if (error.response?.status === 404) {
            throw new Error('Page not found');
        } else if (error.response?.status === 403) {
            throw new Error('Access forbidden - check user agent or IP');
        } else {
            throw new Error(`Scraping failed: ${error.message}`);
        }
    }
}

Using with Different HTTP Clients

With Fetch API (Node.js 18+)

async function scrapeWithFetch(url) {
    try {
        const response = await fetch(url, {
            headers: {
                'User-Agent': 'Mozilla/5.0 (compatible; scraper/1.0)'
            }
        });

        if (!response.ok) {
            throw new Error(`HTTP ${response.status}: ${response.statusText}`);
        }

        const html = await response.text();
        const $ = cheerio.load(html);

        return extractData($);
    } catch (error) {
        console.error('Fetch scraping failed:', error);
        throw error;
    }
}

With got Library

// Note: got v12+ is ESM-only; require() works with got v11 and earlier
const got = require('got');

async function scrapeWithGot(url) {
    try {
        const response = await got(url, {
            timeout: { request: 10000 },
            retry: { limit: 2 }
        });

        const $ = cheerio.load(response.body);
        return extractData($);
    } catch (error) {
        console.error('Got scraping failed:', error);
        throw error;
    }
}

Key Benefits of Async Cheerio

  • Non-blocking: Allows concurrent processing of multiple pages
  • Error isolation: Failed requests don't stop other operations
  • Memory efficient: Parses static HTML without a headless browser, keeping per-page overhead low
  • Scalable: Easy to implement rate limiting and retry logic

Best Practices

  1. Always use try-catch blocks for error handling
  2. Set appropriate timeouts to prevent hanging requests
  3. Implement rate limiting to avoid being blocked
  4. Use proper User-Agent headers to appear legitimate
  5. Handle different HTTP status codes appropriately
  6. Validate extracted data before processing (see the sketch after this list)
  7. Use Promise.allSettled() for batch operations to handle partial failures
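
As a sketch of practice 6, here is a validator for the article object returned by scrapeArticleData earlier (the validateArticle name and the required fields are illustrative assumptions, not a fixed schema):

function validateArticle(article) {
    const problems = [];
    if (!article.title) problems.push('missing title'); // illustrative required field
    if (!Array.isArray(article.content) || article.content.length === 0) {
        problems.push('empty content');
    }
    if (article.publishDate && Number.isNaN(Date.parse(article.publishDate))) {
        problems.push('unparseable publish date');
    }
    return { valid: problems.length === 0, problems };
}

// Usage
// const article = await scrapeArticleData('https://example.com/post');
// const { valid, problems } = validateArticle(article);
// if (!valid) console.warn('Validation issues:', problems);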

This approach makes Cheerio a powerful tool for asynchronous web scraping workflows while maintaining clean, readable code.
