How do you use Cheerio with asynchronous JavaScript code?

Cheerio is a fast, flexible implementation of core jQuery designed for server-side HTML parsing and manipulation. When working with Cheerio in asynchronous JavaScript code, you'll typically combine it with an HTTP client to fetch web pages and then extract and process their content.

Installation

First, install the required packages:

npm install cheerio axios

Basic Async/Await Usage

Here's a foundational example of using Cheerio with async/await to scrape web content:

const axios = require('axios');
const cheerio = require('cheerio');

async function scrapeWebsite(url) {
    try {
        // Fetch HTML content
        const response = await axios.get(url);
        const $ = cheerio.load(response.data);

        // Extract data using jQuery-like selectors
        const links = [];
        $('a').each((index, element) => {
            const link = {
                text: $(element).text().trim(),
                href: $(element).attr('href'),
                title: $(element).attr('title') || null
            };
            if (link.href) links.push(link);
        });

        return links;
    } catch (error) {
        console.error('Scraping failed:', error.message);
        throw error;
    }
}

// Usage
scrapeWebsite('https://example.com')
    .then(links => console.log('Found links:', links))
    .catch(error => console.error('Error:', error));
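
If your project uses ES modules instead of CommonJS, the equivalent imports are shown below (cheerio 1.x exposes its API as named exports, so the wildcard import is the documented pattern):

import axios from 'axios';
import * as cheerio from 'cheerio';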

Advanced Examples

Processing Multiple Pages Concurrently

async function scrapeMultiplePages(urls) {
    // Promise.allSettled never rejects, so no try/catch is needed here
    const results = await Promise.allSettled(urls.map(url => scrapeWebsite(url)));

    return results.map((result, index) => ({
        url: urls[index],
        success: result.status === 'fulfilled',
        data: result.status === 'fulfilled' ? result.value : null,
        error: result.status === 'rejected' ? result.reason.message : null
    }));
}

// Usage
const urls = ['https://site1.com', 'https://site2.com', 'https://site3.com'];
scrapeMultiplePages(urls)
    .then(results => console.log('Batch results:', results));

Complex Data Extraction

async function scrapeArticleData(url) {
    try {
        const response = await axios.get(url, {
            headers: {
                'User-Agent': 'Mozilla/5.0 (compatible; bot/1.0)'
            }
        });

        const $ = cheerio.load(response.data);

        const article = {
            title: $('h1').first().text().trim(),
            author: $('.author').text().trim(),
            publishDate: $('time').attr('datetime'),
            content: $('.article-content p').map((i, el) => $(el).text().trim()).get(),
            images: $('.article-content img').map((i, el) => ({
                src: $(el).attr('src'),
                alt: $(el).attr('alt')
            })).get(),
            tags: $('.tag').map((i, el) => $(el).text().trim()).get()
        };

        return article;
    } catch (error) {
        console.error('Article scraping failed:', error);
        throw error;
    }
}

With Rate Limiting and Retry Logic

const delay = ms => new Promise(resolve => setTimeout(resolve, ms));

async function scrapeWithRetry(url, maxRetries = 3, delayMs = 1000) {
    for (let attempt = 1; attempt <= maxRetries; attempt++) {
        try {
            // Back off before each retry; the wait grows with the attempt number
            if (attempt > 1) {
                await delay(delayMs * attempt);
            }

            const response = await axios.get(url, {
                timeout: 10000,
                headers: {
                    'User-Agent': 'Mozilla/5.0 (compatible; scraper/1.0)'
                }
            });

            const $ = cheerio.load(response.data);

            // Your scraping logic here
            const data = extractData($);
            return data;

        } catch (error) {
            console.log(`Attempt ${attempt} failed for ${url}:`, error.message);

            if (attempt === maxRetries) {
                throw new Error(`Failed after ${maxRetries} attempts: ${error.message}`);
            }
        }
    }
}

function extractData($) {
    return {
        title: $('title').text(),
        headings: $('h1, h2, h3').map((i, el) => $(el).text()).get(),
        paragraphs: $('p').length
    };
}
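
The function above handles retries with backoff; to actually rate-limit a batch of URLs, you can also cap how many requests run at once. Here is a minimal sketch reusing the delay helper defined earlier (the scrapeInBatches name and the batchSize and pauseMs defaults are illustrative, not from any library):

async function scrapeInBatches(urls, batchSize = 3, pauseMs = 1000) {
    const results = [];
    for (let i = 0; i < urls.length; i += batchSize) {
        // Run one small batch of requests concurrently...
        const batch = urls.slice(i, i + batchSize);
        const settled = await Promise.allSettled(batch.map(url => scrapeWithRetry(url)));
        results.push(...settled);
        // ...then pause before starting the next batch
        if (i + batchSize < urls.length) await delay(pauseMs);
    }
    return results;
}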

Error Handling Best Practices

async function robustScraper(url) {
    try {
        const response = await axios.get(url, {
            timeout: 15000,
            validateStatus: status => status < 400
        });

        if (!response.data) {
            throw new Error('Empty response received');
        }

        const $ = cheerio.load(response.data);

        // Validate that expected elements exist
        if ($('title').length === 0) {
            console.warn('No title found - possible parsing issue');
        }

        return extractData($);

    } catch (error) {
        // Handle different types of errors
        if (error.code === 'ECONNABORTED') {
            throw new Error('Request timeout - try again later');
        } else if (error.response?.status === 404) {
            throw new Error('Page not found');
        } else if (error.response?.status === 403) {
            throw new Error('Access forbidden - check user agent or IP');
        } else {
            throw new Error(`Scraping failed: ${error.message}`);
        }
    }
}

Using with Different HTTP Clients

With Fetch API (Node.js 18+)

async function scrapeWithFetch(url) {
    try {
        const response = await fetch(url, {
            headers: {
                'User-Agent': 'Mozilla/5.0 (compatible; scraper/1.0)'
            }
        });

        if (!response.ok) {
            throw new Error(`HTTP ${response.status}: ${response.statusText}`);
        }

        const html = await response.text();
        const $ = cheerio.load(html);

        return extractData($);
    } catch (error) {
        console.error('Fetch scraping failed:', error);
        throw error;
    }
}

With got Library

// Note: got v12+ is ESM-only; require() works with got v11 and earlier
const got = require('got');

async function scrapeWithGot(url) {
    try {
        const response = await got(url, {
            timeout: { request: 10000 },
            retry: { limit: 2 }
        });

        const $ = cheerio.load(response.body);
        return extractData($);
    } catch (error) {
        console.error('Got scraping failed:', error);
        throw error;
    }
}

Key Benefits of Async Cheerio

  • Non-blocking: Allows concurrent processing of multiple pages
  • Error isolation: Failed requests don't stop other operations
  • Memory efficient: Parses static HTML without a headless browser, keeping per-page overhead low
  • Scalable: Easy to implement rate limiting and retry logic

Best Practices

  1. Always use try-catch blocks for error handling
  2. Set appropriate timeouts to prevent hanging requests
  3. Implement rate limiting to avoid being blocked
  4. Use proper User-Agent headers to appear legitimate
  5. Handle different HTTP status codes appropriately
  6. Validate extracted data before processing (see the sketch after this list)
  7. Use Promise.allSettled() for batch operations to handle partial failures
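
As a sketch of practice 6, here is a validator for the article object returned by scrapeArticleData earlier (the validateArticle name and the required fields are illustrative assumptions, not a fixed schema):

function validateArticle(article) {
    const problems = [];
    if (!article.title) problems.push('missing title'); // illustrative required field
    if (!Array.isArray(article.content) || article.content.length === 0) {
        problems.push('empty content');
    }
    if (article.publishDate && Number.isNaN(Date.parse(article.publishDate))) {
        problems.push('unparseable publish date');
    }
    return { valid: problems.length === 0, problems };
}

// Usage
// const article = await scrapeArticleData('https://example.com/post');
// const { valid, problems } = validateArticle(article);
// if (!valid) console.warn('Validation issues:', problems);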

This approach makes Cheerio a powerful tool for asynchronous web scraping workflows while maintaining clean, readable code.
