How do you use Cheerio to scrape data from multiple pages?
Scraping data from multiple pages with Cheerio requires coordinating HTTP requests, managing a queue of URLs, and processing results efficiently. Unlike single-page scraping, multi-page scraping involves handling pagination, controlling request flow, and often running requests in parallel for better performance.
Understanding Cheerio's Role in Multi-Page Scraping
Cheerio is a fast, server-side implementation of jQuery's core API, designed for parsing and traversing HTML. When scraping multiple pages, Cheerio acts as the HTML parser, while you will need an HTTP client such as axios or node-fetch to fetch the pages, and optionally a library such as async or p-queue to manage request flow.
Basic Multi-Page Scraping Setup
First, install the necessary dependencies:
npm install cheerio axios async
Here's a basic structure for multi-page scraping:
const cheerio = require('cheerio');
const axios = require('axios');
const async = require('async');
class MultiPageScraper {
constructor() {
this.results = [];
this.visitedUrls = new Set();
this.requestDelay = 1000; // 1 second delay between requests
}
async scrapePage(url) {
try {
console.log(`Scraping: ${url}`);
const response = await axios.get(url, {
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
});
const $ = cheerio.load(response.data);
return this.extractData($);
} catch (error) {
console.error(`Error scraping ${url}:`, error.message);
return null;
}
}
extractData($) {
// Override this method in specific implementations
return {};
}
async delay(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
}
Sequential Page Scraping
Sequential scraping processes pages one at a time, which is more respectful to target servers but slower:
class SequentialScraper extends MultiPageScraper {
async scrapeUrls(urls) {
for (const url of urls) {
if (this.visitedUrls.has(url)) continue;
const data = await this.scrapePage(url);
if (data) {
this.results.push({ url, data });
this.visitedUrls.add(url);
}
await this.delay(this.requestDelay);
}
return this.results;
}
extractData($) {
return {
title: $('h1').first().text().trim(),
description: $('meta[name="description"]').attr('content'),
links: $('a[href]').map((i, el) => $(el).attr('href')).get()
};
}
}
// Usage
async function runSequentialScraper() {
const scraper = new SequentialScraper();
const urls = [
'https://example.com/page1',
'https://example.com/page2',
'https://example.com/page3'
];
const results = await scraper.scrapeUrls(urls);
console.log('Scraping complete:', results);
}
Parallel Page Scraping
Parallel scraping processes multiple pages simultaneously, offering better performance but requiring careful rate limiting:
class ParallelScraper extends MultiPageScraper {
constructor(concurrency = 3) {
super();
this.concurrency = concurrency;
}
async scrapeUrls(urls) {
const queue = async.queue(async (url) => {
if (this.visitedUrls.has(url)) return null;
const data = await this.scrapePage(url);
if (data) {
this.visitedUrls.add(url);
return { url, data };
}
return null;
}, this.concurrency);
    // Log once every queued page has been processed
    queue.drain(() => {
console.log('All pages processed');
});
const promises = urls.map(url => {
return new Promise((resolve) => {
queue.push(url, (err, result) => {
if (err) console.error('Queue error:', err);
resolve(result);
});
});
});
const results = await Promise.all(promises);
return results.filter(result => result !== null);
}
}
// Usage with rate limiting
async function runParallelScraper() {
const scraper = new ParallelScraper(2); // Process 2 pages concurrently
const urls = Array.from({length: 10}, (_, i) =>
`https://example.com/page${i + 1}`
);
const results = await scraper.scrapeUrls(urls);
console.log(`Scraped ${results.length} pages successfully`);
}
Handling Pagination
Many websites use pagination, requiring you to discover and navigate through page links:
class PaginationScraper extends MultiPageScraper {
async scrapeWithPagination(startUrl, maxPages = 50) {
let currentUrl = startUrl;
let pageCount = 0;
while (currentUrl && pageCount < maxPages) {
console.log(`Processing page ${pageCount + 1}: ${currentUrl}`);
try {
const response = await axios.get(currentUrl);
const $ = cheerio.load(response.data);
// Extract data from current page
const pageData = this.extractData($);
this.results.push({
page: pageCount + 1,
url: currentUrl,
data: pageData
});
// Find next page URL
currentUrl = this.findNextPageUrl($, currentUrl);
pageCount++;
await this.delay(this.requestDelay);
} catch (error) {
console.error(`Error on page ${pageCount + 1}:`, error.message);
break;
}
}
return this.results;
}
findNextPageUrl($, currentUrl) {
// Method 1: Look for "Next" button or link
const nextLink = $('a:contains("Next"), a:contains("→"), .next-page').first();
if (nextLink.length) {
const href = nextLink.attr('href');
return this.resolveUrl(href, currentUrl);
}
// Method 2: Look for numbered pagination
const currentPageNum = this.extractPageNumber(currentUrl);
    if (currentPageNum) {
      const nextPageNum = currentPageNum + 1;
      // Preserve whichever separator the site uses (page=2 or page/2)
      const nextPageUrl = currentUrl.replace(/page([=\/])\d+/, (m, sep) => `page${sep}${nextPageNum}`);
      // Verify that a link to the next page number exists before following it
      const nextPageLink = $(`a[href*="page=${nextPageNum}"], a[href*="page/${nextPageNum}"]`);
      return nextPageLink.length ? nextPageUrl : null;
    }
return null;
}
extractPageNumber(url) {
const match = url.match(/page[=\/](\d+)/);
return match ? parseInt(match[1]) : null;
}
  resolveUrl(href, baseUrl) {
    // new URL() resolves absolute, root-relative and relative hrefs alike
    return new URL(href, baseUrl).href;
  }
extractData($) {
return {
articles: $('.article').map((i, el) => ({
title: $(el).find('.title').text().trim(),
link: $(el).find('a').attr('href'),
summary: $(el).find('.summary').text().trim()
})).get()
};
}
}
Advanced Multi-Page Scraping with Queue Management
For large-scale scraping operations, implement a more sophisticated queue system:
const EventEmitter = require('events');
class AdvancedMultiPageScraper extends EventEmitter {
constructor(options = {}) {
super();
this.concurrency = options.concurrency || 3;
this.retryAttempts = options.retryAttempts || 3;
this.retryDelay = options.retryDelay || 2000;
this.requestDelay = options.requestDelay || 1000;
this.queue = [];
this.processing = new Set();
this.completed = new Set();
this.failed = new Set();
this.results = [];
this.running = false;
}
addUrls(urls) {
const newUrls = urls.filter(url =>
!this.completed.has(url) &&
!this.processing.has(url) &&
!this.queue.includes(url)
);
this.queue.push(...newUrls);
this.emit('urlsAdded', newUrls.length);
if (!this.running) this.start();
}
async start() {
this.running = true;
this.emit('started');
const workers = Array(this.concurrency).fill().map(() => this.worker());
await Promise.all(workers);
this.running = false;
this.emit('completed', {
successful: this.completed.size,
failed: this.failed.size,
results: this.results
});
}
async worker() {
while (this.running && (this.queue.length > 0 || this.processing.size > 0)) {
if (this.queue.length === 0) {
await this.delay(100);
continue;
}
const url = this.queue.shift();
if (!url) continue;
this.processing.add(url);
try {
const result = await this.scrapeWithRetry(url);
if (result) {
this.results.push(result);
this.emit('pageScraped', result);
}
this.completed.add(url);
} catch (error) {
this.failed.add(url);
this.emit('pageFailed', { url, error: error.message });
} finally {
this.processing.delete(url);
await this.delay(this.requestDelay);
}
}
}
async scrapeWithRetry(url, attempt = 1) {
try {
const response = await axios.get(url, {
timeout: 10000,
headers: {
'User-Agent': 'Mozilla/5.0 (compatible; WebScraper/1.0)'
}
});
const $ = cheerio.load(response.data);
// Extract data and discover new URLs
const data = this.extractData($);
const newUrls = this.discoverUrls($, url);
if (newUrls.length > 0) {
this.addUrls(newUrls);
}
return { url, data, timestamp: new Date().toISOString() };
} catch (error) {
if (attempt < this.retryAttempts) {
await this.delay(this.retryDelay * attempt);
return this.scrapeWithRetry(url, attempt + 1);
}
throw error;
}
}
extractData($) {
// Override in subclass
return {
title: $('title').text(),
headings: $('h1, h2, h3').map((i, el) => $(el).text()).get()
};
}
discoverUrls($, baseUrl) {
// Override in subclass to discover new URLs
return [];
}
async delay(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
}
// Usage example
const scraper = new AdvancedMultiPageScraper({
concurrency: 5,
requestDelay: 500,
retryAttempts: 3
});
scraper.on('pageScraped', (result) => {
console.log(`✓ Scraped: ${result.url}`);
});
scraper.on('pageFailed', ({ url, error }) => {
console.log(`✗ Failed: ${url} - ${error}`);
});
scraper.on('completed', (stats) => {
console.log(`Scraping completed: ${stats.successful} successful, ${stats.failed} failed`);
});
// Start scraping
scraper.addUrls(['https://example.com']);
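Since extractData and discoverUrls are designed to be overridden, site-specific logic belongs in a subclass. The sketch below is illustrative rather than definitive: the SiteCrawler name, the selectors, and the same-origin filter are assumptions you would adapt to the target site.
// Hypothetical subclass: extract a few fields and follow only same-origin links
class SiteCrawler extends AdvancedMultiPageScraper {
  extractData($) {
    return {
      title: $('title').text().trim(),
      headings: $('h1, h2').map((i, el) => $(el).text().trim()).get()
    };
  }
  discoverUrls($, baseUrl) {
    const origin = new URL(baseUrl).origin;
    return $('a[href]')
      .map((i, el) => $(el).attr('href'))
      .get()
      .map(href => {
        try {
          return new URL(href, baseUrl).href; // resolve relative links
        } catch (e) {
          return null; // skip malformed hrefs
        }
      })
      .filter(href => href && href.startsWith(origin));
  }
}
const crawler = new SiteCrawler({ concurrency: 3, requestDelay: 1000 });
crawler.on('completed', stats => console.log(`Crawled ${stats.successful} pages`));
crawler.addUrls(['https://example.com']);
Because the base class already deduplicates URLs and emits progress events, the subclass only has to describe what to extract and which links are worth following.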
Best Practices for Multi-Page Scraping
1. Implement Proper Rate Limiting
class RateLimitedScraper {
constructor() {
this.lastRequestTime = 0;
this.minDelay = 1000; // Minimum 1 second between requests
}
async respectfulRequest(url) {
const timeSinceLastRequest = Date.now() - this.lastRequestTime;
if (timeSinceLastRequest < this.minDelay) {
await this.delay(this.minDelay - timeSinceLastRequest);
}
this.lastRequestTime = Date.now();
return axios.get(url);
}
async delay(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
}
2. Handle Different Content Types
When scraping multiple pages, you might encounter different content structures. For complex scenarios involving JavaScript-heavy sites, consider combining Cheerio with a headless browser such as Puppeteer to handle content that is rendered client-side after the initial HTML response.
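The example below is a rough sketch of that combination, assuming puppeteer is installed alongside cheerio; the '.item' selector is a placeholder you would replace with selectors for the real site:
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');
// Render a JavaScript-heavy page with Puppeteer, then parse the final HTML with Cheerio
async function scrapeDynamicPage(url) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle0' }); // wait for network activity to settle
    const html = await page.content();                   // fully rendered HTML
    const $ = cheerio.load(html);
    return {
      title: $('h1').first().text().trim(),
      items: $('.item').map((i, el) => $(el).text().trim()).get() // placeholder selector
    };
  } finally {
    await browser.close();
  }
}
Letting the browser do the rendering and Cheerio do the parsing keeps your extraction code consistent with the rest of the scraper.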
3. Error Handling and Recovery
class RobustScraper extends MultiPageScraper {
async scrapeWithErrorHandling(url) {
const maxRetries = 3;
let lastError;
for (let attempt = 1; attempt <= maxRetries; attempt++) {
try {
const response = await axios.get(url, {
timeout: 15000,
        validateStatus: status => status < 500 // Treat only 5xx responses as errors so they can be retried
});
if (response.status === 404) {
console.log(`Page not found: ${url}`);
return null;
}
return cheerio.load(response.data);
} catch (error) {
lastError = error;
console.log(`Attempt ${attempt} failed for ${url}: ${error.message}`);
if (attempt < maxRetries) {
await this.delay(attempt * 2000); // Exponential backoff
}
}
}
throw new Error(`Failed to scrape ${url} after ${maxRetries} attempts: ${lastError.message}`);
}
}
Performance Optimization Tips
- Use Connection Pooling: Configure axios with connection pooling for better performance
- Implement Caching: Cache responses to avoid re-scraping unchanged content (see the sketch after the pooling example below)
- Monitor Memory Usage: For large-scale operations, implement periodic garbage collection
- Use Streaming: For very large datasets, consider streaming results to files
// Connection pooling example
const http = require('http');
const https = require('https');
const axiosConfig = {
httpAgent: new http.Agent({ keepAlive: true, maxSockets: 5 }),
httpsAgent: new https.Agent({ keepAlive: true, maxSockets: 5 }),
timeout: 10000
};
const axiosInstance = axios.create(axiosConfig);
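For the caching tip, a simple in-memory map is often enough for a single run; persistent stores such as Redis or the filesystem are the natural next step for larger jobs. This sketch is an illustration, not a library API, and reuses the axiosInstance created above:
// Minimal in-memory cache keyed by URL; swap in Redis or disk storage for long-running jobs
const responseCache = new Map();
async function cachedGet(url) {
  if (responseCache.has(url)) {
    return responseCache.get(url); // reuse previously fetched HTML
  }
  const response = await axiosInstance.get(url);
  responseCache.set(url, response.data);
  return response.data;
}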
Conclusion
Multi-page scraping with Cheerio requires careful orchestration of HTTP requests, HTML parsing, and data management. Whether you choose sequential processing for gentler scraping or parallel processing for throughput, the key is balancing efficiency with responsible scraping practices. For JavaScript-heavy sites, consider pairing Cheerio with a headless browser such as Puppeteer, which can render multiple pages in parallel while Cheerio handles the parsing.
Remember to always respect robots.txt files, implement appropriate delays between requests, and handle errors gracefully to build robust and maintainable scraping solutions.