Cheerio is a fast, flexible implementation of jQuery designed for server-side HTML parsing and manipulation. However, Cheerio itself doesn't handle HTTP requests, cookies, or sessions: it only parses and manipulates HTML. To manage cookies and sessions when web scraping with Cheerio, you need to pair it with an HTTP client library such as axios, got, or node-fetch.
Understanding the Architecture
Since Cheerio only handles HTML parsing, cookie and session management requires three pieces, combined in the short sketch after this list:
- HTTP Client: To make requests and handle cookies (e.g., axios, got)
- Cookie Store: To persist cookies between requests (e.g., tough-cookie)
- Cheerio: To parse and extract data from HTML responses
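To make the division of labor concrete, here is a minimal sketch using node-fetch (v2, the last CommonJS release). node-fetch does not manage cookies on its own, so the Set-Cookie values are carried over by hand; the URL and selector are placeholders, not a real site:
const fetch = require('node-fetch'); // v2 for require(); v3+ is ESM-only
const cheerio = require('cheerio');
async function demo() {
  // 1. HTTP client: make the first request
  const first = await fetch('https://example.com/');
  // 2. Cookie store: keep only the name=value part of each Set-Cookie header
  const cookies = (first.headers.raw()['set-cookie'] || [])
    .map(c => c.split(';')[0])
    .join('; ');
  // Replay the stored cookies on the follow-up request
  const second = await fetch('https://example.com/', {
    headers: cookies ? { Cookie: cookies } : {}
  });
  // 3. Cheerio: parse the HTML and extract data
  const $ = cheerio.load(await second.text());
  console.log($('title').text());
}
demo().catch(console.error);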
Method 1: Using Axios with tough-cookie
Installation
npm install cheerio axios tough-cookie axios-cookiejar-support
Basic Setup
const axios = require('axios');
const cheerio = require('cheerio');
const { CookieJar } = require('tough-cookie');
const { wrapper } = require('axios-cookiejar-support');
// Create a cookie jar to store cookies
const cookieJar = new CookieJar();
// Create an axios instance with cookie support. Recent versions of
// axios-cookiejar-support export a wrapper() function; the older
// axiosCookieJarSupport(axios) / withCredentials pattern is deprecated.
const httpClient = wrapper(axios.create({
  jar: cookieJar,
  timeout: 10000,
  headers: {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
  }
}));
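As a quick sanity check (a sketch, assuming the setup above), tough-cookie can report what the jar currently holds for a given URL; the promise form shown requires tough-cookie v4+:
async function dumpCookies() {
  // getCookieString returns the cookies the jar would send to this URL
  const cookieString = await cookieJar.getCookieString('https://example.com/');
  console.log('Stored cookies for example.com:', cookieString);
}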
Complete Scraping Example
async function scrapeWithSession() {
try {
// Step 1: Login to establish session
const loginData = {
username: 'your-username',
password: 'your-password'
};
const loginResponse = await httpClient.post('https://example.com/login', loginData);
console.log('Login status:', loginResponse.status);
// Step 2: Access protected content (cookies automatically sent)
const protectedResponse = await httpClient.get('https://example.com/protected-page');
// Step 3: Parse with Cheerio
const $ = cheerio.load(protectedResponse.data);
// Extract data
const userData = [];
$('.user-item').each((index, element) => {
userData.push({
name: $(element).find('.name').text().trim(),
email: $(element).find('.email').text().trim(),
status: $(element).find('.status').text().trim()
});
});
console.log('Extracted user data:', userData);
return userData;
} catch (error) {
console.error('Scraping failed:', error.message);
throw error;
}
}
// Execute the scraping
scrapeWithSession();
Method 2: Manual Cookie Handling with Axios
For more control over cookie management:
const axios = require('axios');
const cheerio = require('cheerio');
class WebScraper {
constructor() {
this.cookies = new Map();
this.httpClient = axios.create({
timeout: 10000,
headers: {
'User-Agent': 'Mozilla/5.0 (compatible; Web Scraper)'
}
});
// Add request interceptor to attach cookies
this.httpClient.interceptors.request.use(config => {
const cookieString = this.getCookieString(config.url);
if (cookieString) {
config.headers.Cookie = cookieString;
}
return config;
});
// Add response interceptor to extract cookies
this.httpClient.interceptors.response.use(response => {
this.extractCookies(response);
return response;
});
}
  extractCookies(response) {
    const setCookieHeaders = response.headers['set-cookie'];
    if (setCookieHeaders) {
      setCookieHeaders.forEach(cookieString => {
        // Keep only the name=value pair; attributes like Path and Expires
        // are dropped by this simplified store
        const [nameValue] = cookieString.split(';');
        // Split on the first '=' only, since cookie values may contain '='
        const eqIndex = nameValue.indexOf('=');
        if (eqIndex > 0) {
          this.cookies.set(
            nameValue.slice(0, eqIndex).trim(),
            nameValue.slice(eqIndex + 1).trim()
          );
        }
      });
    }
  }
  getCookieString(url) {
    // Simplified: sends every stored cookie regardless of the URL's domain
    // or path; a real jar such as tough-cookie scopes cookies per site
    const cookieArray = Array.from(this.cookies.entries())
      .map(([name, value]) => `${name}=${value}`);
    return cookieArray.length > 0 ? cookieArray.join('; ') : null;
  }
async scrape(url) {
try {
const response = await this.httpClient.get(url);
const $ = cheerio.load(response.data);
return $;
} catch (error) {
console.error(`Failed to scrape ${url}:`, error.message);
throw error;
}
}
}
// Usage example
async function main() {
const scraper = new WebScraper();
// First request establishes cookies
let $ = await scraper.scrape('https://example.com/login-page');
// Subsequent requests automatically include cookies
$ = await scraper.scrape('https://example.com/dashboard');
// Extract data
const titles = [];
$('h2.title').each((i, elem) => {
titles.push($(elem).text().trim());
});
console.log('Page titles:', titles);
}
main().catch(console.error);
Method 3: Using got with Built-in Cookie Support
got has excellent built-in cookie support:
const got = require('got'); // got v11, the last CommonJS release; v12+ is ESM-only
const cheerio = require('cheerio');
const { CookieJar } = require('tough-cookie');
const cookieJar = new CookieJar();
const httpClient = got.extend({
cookieJar,
headers: {
'User-Agent': 'Mozilla/5.0 (compatible; Web Scraper)'
}
});
async function scrapeWithGot() {
try {
// Login request
await httpClient.post('https://example.com/login', {
form: {
username: 'your-username',
password: 'your-password'
}
});
// Access protected content
const response = await httpClient.get('https://example.com/protected');
const $ = cheerio.load(response.body);
// Extract data
const results = [];
$('.data-item').each((i, elem) => {
results.push($(elem).text().trim());
});
return results;
} catch (error) {
console.error('Error:', error.message);
throw error;
}
}
Best Practices
1. Respect Rate Limits
const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
// Assumes the Method 2 scraper instance and an extractData($) helper
async function respectfulScraping(urls) {
const results = [];
for (const url of urls) {
try {
const $ = await scraper.scrape(url);
results.push(extractData($));
// Wait between requests
await delay(1000);
} catch (error) {
console.error(`Failed to scrape ${url}:`, error.message);
}
}
return results;
}
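A small variant worth considering: randomizing the pause makes the request pattern look less mechanical. The 1000-2000 ms range below is an arbitrary choice, not a value any particular site requires:
// Jittered delay built on the delay() helper above
const randomDelay = () => delay(1000 + Math.random() * 1000);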
2. Handle Session Expiration
// Assumes the httpClient from Method 1 and a login() helper that re-authenticates
async function scrapeWithRetry(url, maxRetries = 3) {
for (let attempt = 1; attempt <= maxRetries; attempt++) {
try {
const response = await httpClient.get(url);
// Check if session expired
if (response.data.includes('login required')) {
await login();
continue;
}
return cheerio.load(response.data);
} catch (error) {
if (attempt === maxRetries) throw error;
console.log(`Attempt ${attempt} failed, retrying...`);
await delay(2000 * attempt);
}
}
}
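Many sites signal an expired session with a 401/403 status or a redirect to the login page rather than inline text. Here is a sketch of a status-based check as an alternative; it assumes the same httpClient and login() helper, and uses axios's validateStatus option to stop non-2xx responses from throwing so the status can be inspected:
async function fetchPage(url, retried = false) {
  const response = await httpClient.get(url, { validateStatus: () => true });
  if ((response.status === 401 || response.status === 403) && !retried) {
    await login(); // re-authenticate once, then retry the page
    return fetchPage(url, true);
  }
  return cheerio.load(response.data);
}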
3. Save and Load Cookies
const fs = require('fs').promises;
// Save cookies to a file (uses tough-cookie v4's promise-based serialize)
async function saveCookies(cookieJar, filename) {
const cookies = await cookieJar.serialize();
await fs.writeFile(filename, JSON.stringify(cookies));
}
// Load cookies from file
async function loadCookies(filename) {
try {
const data = await fs.readFile(filename, 'utf8');
const cookieData = JSON.parse(data);
return CookieJar.deserialize(cookieData);
  } catch (error) {
    // No saved cookie file yet (or it is unreadable): start with an empty jar
    return new CookieJar();
  }
}
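Wiring these together (a sketch, reusing wrapper and axios from the Method 1 setup): load any persisted session at startup, run the scrape, and save the jar before exiting. The cookies.json filename is arbitrary.
async function runWithPersistedSession() {
  const jar = await loadCookies('cookies.json');
  const client = wrapper(axios.create({ jar }));
  // Any cookies from a previous run are sent automatically
  await client.get('https://example.com/dashboard');
  // Persist whatever the session accumulated for next time
  await saveCookies(jar, 'cookies.json');
}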
Common Use Cases
E-commerce Scraping
async function scrapeProductData() {
// Login to access member prices (loginData as defined in Method 1)
await httpClient.post('https://store.example.com/login', loginData);
const productUrls = ['product1', 'product2', 'product3'];
const products = [];
for (const productUrl of productUrls) {
const response = await httpClient.get(`https://store.example.com/${productUrl}`);
const $ = cheerio.load(response.data);
products.push({
name: $('.product-name').text().trim(),
price: $('.member-price').text().trim(),
availability: $('.stock-status').text().trim()
});
await delay(500);
}
return products;
}
Form-based Authentication
async function handleFormAuth() {
// Get login page to extract CSRF token
const loginPageResponse = await httpClient.get('https://example.com/login');
const $ = cheerio.load(loginPageResponse.data);
const csrfToken = $('input[name="_token"]').val();
// Submit login form
await httpClient.post('https://example.com/login', {
username: 'your-username',
password: 'your-password',
_token: csrfToken
});
// Now access protected resources
const dashboardResponse = await httpClient.get('https://example.com/dashboard');
return cheerio.load(dashboardResponse.data);
}
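Some applications expect the CSRF token in a request header rather than the form body. A sketch of that variant follows; the header name varies per framework, and X-CSRF-Token is just a common convention:
async function handleHeaderAuth(csrfToken) {
  // Same credentials as above, with the token moved into a header
  await httpClient.post('https://example.com/login',
    { username: 'your-username', password: 'your-password' },
    { headers: { 'X-CSRF-Token': csrfToken } }
  );
}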
Error Handling and Debugging
const debug = require('debug')('scraper');
// Builds on the Method 2 WebScraper, which provides httpClient and cookies
class RobustScraper extends WebScraper {
async scrapeWithLogging(url) {
try {
debug(`Requesting: ${url}`);
const response = await this.httpClient.get(url);
debug(`Response status: ${response.status}`);
// this.cookies is a Map, which JSON.stringify renders as {}; convert it first
debug(`Cookies: ${JSON.stringify(Object.fromEntries(this.cookies))}`);
const $ = cheerio.load(response.data);
return $;
} catch (error) {
console.error(`Scraping failed for ${url}:`, {
message: error.message,
status: error.response?.status,
headers: error.response?.headers
});
throw error;
}
}
}
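The debug package only prints when its namespace is enabled, so run the script with the DEBUG environment variable set (for example, DEBUG=scraper node scraper.js); without it, the debug() calls stay silent.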
Conclusion
Handling cookies and sessions with Cheerio requires combining it with a capable HTTP client. The axios + tough-cookie combination is the most popular choice, while got offers excellent built-in cookie support. Choose the method that best fits your project's needs and always implement proper error handling and rate limiting for production use.
Remember to respect website terms of service and implement ethical scraping practices, including appropriate delays between requests and respect for robots.txt files.