Cheerio is a fast, flexible, and lean implementation of core jQuery designed specifically for the server in Node.js. Scraping table data with Cheerio is straightforward and efficient, making it ideal for extracting structured data from HTML tables.
Prerequisites
Install the necessary packages:
# Modern approach with axios (recommended)
npm install cheerio axios
# Alternative with node-fetch
npm install cheerio node-fetch
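If you prefer node-fetch, the fetch-and-load step looks roughly like the sketch below. This is a minimal illustration, assuming node-fetch v2 (the last version that supports require()); the rest of this guide uses axios.
const fetch = require('node-fetch');
const cheerio = require('cheerio');

async function loadPage(url) {
  // Fetch the raw HTML, then hand it to Cheerio for parsing
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Request failed with status ${response.status}`);
  }
  const html = await response.text();
  return cheerio.load(html);
}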
Basic Table Scraping
Simple Table Example
Here's a complete example that scrapes a basic HTML table:
const axios = require('axios');
const cheerio = require('cheerio');

async function scrapeTable(url) {
  try {
    // Fetch the HTML content
    const { data } = await axios.get(url);

    // Load HTML into Cheerio
    const $ = cheerio.load(data);

    // Extract table data
    const tableData = [];
    $('table tr').each((index, element) => {
      const row = [];
      $(element).find('td, th').each((i, cell) => {
        row.push($(cell).text().trim());
      });
      if (row.length > 0) {
        tableData.push(row);
      }
    });

    return tableData;
  } catch (error) {
    console.error('Error scraping table:', error.message);
    throw error;
  }
}

// Usage
scrapeTable('https://example.com/table-page')
  .then(data => console.log(data))
  .catch(err => console.error(err));
Advanced Table Parsing
For more complex tables with headers and structured data:
const axios = require('axios');
const cheerio = require('cheerio');

async function scrapeTableWithHeaders(url, tableSelector = 'table') {
  try {
    const { data } = await axios.get(url);
    const $ = cheerio.load(data);
    const table = $(tableSelector).first();

    // Extract headers
    const headers = [];
    table.find('thead th, tr:first-child th').each((i, element) => {
      headers.push($(element).text().trim());
    });

    // Extract rows
    const rows = [];
    table.find('tbody tr, tr:not(:first-child)').each((i, element) => {
      const row = {};
      $(element).find('td').each((j, cell) => {
        const headerName = headers[j] || `column_${j}`;
        row[headerName] = $(cell).text().trim();
      });

      // Only add rows with data
      if (Object.keys(row).length > 0) {
        rows.push(row);
      }
    });

    return {
      headers,
      rows,
      count: rows.length
    };
  } catch (error) {
    console.error('Error scraping table:', error.message);
    throw error;
  }
}

// Usage
scrapeTableWithHeaders('https://example.com/data-table')
  .then(result => {
    console.log('Headers:', result.headers);
    console.log('Data:', result.rows);
    console.log('Total rows:', result.count);
  });
Handling Different Table Structures
Tables Without thead/tbody
function scrapeSimpleTable($, tableSelector = 'table') {
  const rows = [];

  $(tableSelector).find('tr').each((index, row) => {
    const cells = [];
    $(row).find('td, th').each((i, cell) => {
      cells.push($(cell).text().trim());
    });
    if (cells.length > 0) {
      rows.push(cells);
    }
  });

  return rows;
}
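This helper expects an already-loaded Cheerio instance, so it can be paired with any of the fetching approaches above. A usage sketch (the URL is a placeholder and the logged output is illustrative):
const axios = require('axios');
const cheerio = require('cheerio');

async function run() {
  const { data } = await axios.get('https://example.com/plain-table');
  const $ = cheerio.load(data);

  // With the default selector, this walks every table on the page
  const rows = scrapeSimpleTable($);
  console.log(rows); // e.g. [ ['Name', 'Age'], ['Alice', '30'], ... ]
}

run().catch(err => console.error(err.message));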
Tables with Specific Classes or IDs
function scrapeSpecificTable($, options = {}) {
  const {
    tableId,
    tableClass,
    skipFirstRow = false,
    extractAttributes = false
  } = options;

  let selector = 'table';
  if (tableId) selector = `#${tableId}`;
  if (tableClass) selector = `.${tableClass}`;

  const data = [];
  $(selector).find('tr').each((index, row) => {
    if (skipFirstRow && index === 0) return;

    const rowData = [];
    $(row).find('td, th').each((i, cell) => {
      const cellData = {
        text: $(cell).text().trim()
      };

      if (extractAttributes) {
        cellData.attributes = $(cell).attr();
      }

      rowData.push(cellData);
    });

    if (rowData.length > 0) {
      data.push(rowData);
    }
  });

  return data;
}
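As with the previous helper, this function operates on a loaded Cheerio instance. A usage sketch, assuming a page whose table carries the class results (both the URL and the class name are hypothetical):
const axios = require('axios');
const cheerio = require('cheerio');

async function run() {
  const { data } = await axios.get('https://example.com/report');
  const $ = cheerio.load(data);

  const rows = scrapeSpecificTable($, {
    tableClass: 'results',    // builds the selector '.results'
    skipFirstRow: true,       // treat the first row as a header row and skip it
    extractAttributes: true   // also capture each cell's HTML attributes
  });

  console.log(rows[0]); // first data row: an array of { text, attributes } objects
}

run().catch(err => console.error(err.message));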
Error Handling and Validation
const axios = require('axios');
const cheerio = require('cheerio');

async function robustTableScraper(url, options = {}) {
  const {
    timeout = 10000,
    retries = 3,
    tableSelector = 'table',
    validateTable = true
  } = options;

  for (let attempt = 1; attempt <= retries; attempt++) {
    try {
      const { data } = await axios.get(url, {
        timeout,
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; TableScraper/1.0)'
        }
      });

      const $ = cheerio.load(data);

      // Validate table exists
      if (validateTable && $(tableSelector).length === 0) {
        throw new Error(`No table found with selector: ${tableSelector}`);
      }

      const tableData = [];
      $(tableSelector).first().find('tr').each((index, row) => {
        const cells = [];
        $(row).find('td, th').each((i, cell) => {
          const text = $(cell).text().trim();
          const html = $(cell).html();
          cells.push({
            text,
            html,
            colspan: parseInt($(cell).attr('colspan') || '1', 10),
            rowspan: parseInt($(cell).attr('rowspan') || '1', 10)
          });
        });
        if (cells.length > 0) {
          tableData.push({
            rowIndex: index,
            cells
          });
        }
      });

      return tableData;
    } catch (error) {
      console.warn(`Attempt ${attempt} failed:`, error.message);

      if (attempt === retries) {
        throw new Error(`Failed to scrape table after ${retries} attempts: ${error.message}`);
      }

      // Wait before retrying (simple linear backoff)
      await new Promise(resolve => setTimeout(resolve, 1000 * attempt));
    }
  }
}
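A brief usage sketch for the robust scraper; the URL, the #prices selector, and the option values are placeholders:
robustTableScraper('https://example.com/flaky-table', {
  timeout: 15000,
  retries: 5,
  tableSelector: '#prices'
})
  .then(rows => console.log(`Scraped ${rows.length} rows`))
  .catch(err => console.error(err.message));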
Working with Complex Tables
Nested Tables
function scrapeNestedTables($) {
  const mainTables = [];

  $('table').each((index, table) => {
    const $table = $(table);

    // Skip tables inside other tables for main extraction
    if ($table.parents('table').length > 0) return;

    const tableData = {
      index,
      rows: [],
      nestedTables: []
    };

    $table.find('tr').each((rowIndex, row) => {
      const $row = $(row);
      const cells = [];

      $row.find('td, th').each((cellIndex, cell) => {
        const $cell = $(cell);
        const nestedTable = $cell.find('table');

        const cellData = {
          text: $cell.clone().children('table').remove().end().text().trim(),
          hasNestedTable: nestedTable.length > 0
        };

        if (nestedTable.length > 0) {
          cellData.nestedTableData = scrapeSimpleTable($, nestedTable);
        }

        cells.push(cellData);
      });

      tableData.rows.push(cells);
    });

    mainTables.push(tableData);
  });

  return mainTables;
}
Complete Working Example
const axios = require('axios');
const cheerio = require('cheerio');

class TableScraper {
  constructor(options = {}) {
    this.timeout = options.timeout || 10000;
    this.userAgent = options.userAgent || 'Mozilla/5.0 (compatible; TableScraper/1.0)';
  }

  async scrape(url, tableOptions = {}) {
    try {
      const { data } = await axios.get(url, {
        timeout: this.timeout,
        headers: { 'User-Agent': this.userAgent }
      });

      const $ = cheerio.load(data);
      return this.parseTable($, tableOptions);
    } catch (error) {
      throw new Error(`Failed to scrape ${url}: ${error.message}`);
    }
  }

  parseTable($, options = {}) {
    const {
      selector = 'table',
      includeHeaders = true,
      trimWhitespace = true,
      preserveStructure = false
    } = options;

    const tables = [];

    $(selector).each((tableIndex, table) => {
      const $table = $(table);
      const tableData = {
        index: tableIndex,
        headers: [],
        rows: []
      };

      // Extract headers if requested
      if (includeHeaders) {
        $table.find('thead th, tr:first-child th').each((i, header) => {
          const text = $(header).text();
          tableData.headers.push(trimWhitespace ? text.trim() : text);
        });
      }

      // Extract data rows
      const rowSelector = includeHeaders ? 'tbody tr, tr:not(:first-child)' : 'tr';
      $table.find(rowSelector).each((rowIndex, row) => {
        const rowData = preserveStructure ? {} : [];

        $(row).find('td, th').each((cellIndex, cell) => {
          const text = $(cell).text();
          const cellValue = trimWhitespace ? text.trim() : text;

          if (preserveStructure) {
            // Key by header name when available, falling back to the column index
            rowData[tableData.headers[cellIndex] || cellIndex] = cellValue;
          } else {
            rowData.push(cellValue);
          }
        });

        if ((Array.isArray(rowData) && rowData.length > 0) ||
            (!Array.isArray(rowData) && Object.keys(rowData).length > 0)) {
          tableData.rows.push(rowData);
        }
      });

      tables.push(tableData);
    });

    return tables.length === 1 ? tables[0] : tables;
  }
}

// Usage example
async function main() {
  const scraper = new TableScraper();

  try {
    const result = await scraper.scrape('https://example.com/data-table', {
      includeHeaders: true,
      preserveStructure: true
    });

    console.log('Table headers:', result.headers);
    console.log('First row:', result.rows[0]);
    console.log('Total rows:', result.rows.length);
  } catch (error) {
    console.error('Scraping failed:', error.message);
  }
}

main();
Best Practices
- Use Modern HTTP Clients: Prefer axios or node-fetch over the deprecated request-promise
- Handle Errors Gracefully: Always wrap requests in try-catch blocks
- Validate Selectors: Check if elements exist before processing
- Respect Rate Limits: Add delays between requests when scraping multiple pages (see the delay sketch after this list)
- Clean Data: Trim whitespace and handle empty cells appropriately
- Preserve Structure: Consider returning objects with named properties instead of arrays
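For the rate-limiting point above, one simple approach is a fixed delay between sequential requests. A minimal sketch that reuses the scrapeSimpleTable helper from earlier; the URLs and the one-second interval are illustrative, not a recommendation for any particular site:
const axios = require('axios');
const cheerio = require('cheerio');

const delay = ms => new Promise(resolve => setTimeout(resolve, ms));

async function scrapePages(urls) {
  const results = [];
  for (const url of urls) {
    const { data } = await axios.get(url);
    const $ = cheerio.load(data);
    results.push(scrapeSimpleTable($)); // helper defined earlier in this guide
    await delay(1000); // pause one second before the next request
  }
  return results;
}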
Common Issues and Solutions
- Empty Results: Check whether the table uses thead/tbody or specific classes
- Missing Data: Verify the selector matches the actual HTML structure
- Memory Issues: Process large tables in chunks or streams
- Dynamic Content: Use Puppeteer for JavaScript-rendered tables (see the sketch after this list)
- Encoding Problems: Specify proper encoding in axios requests
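For JavaScript-rendered tables, a common pattern is to let Puppeteer render the page and then hand the resulting HTML to Cheerio. A minimal sketch, assuming Puppeteer is installed (npm install puppeteer) and that the table appears once rendering finishes; it reuses the scrapeSimpleTable helper from earlier:
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');

async function scrapeRenderedTable(url) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle0' });

    // Wait until at least one table element exists in the rendered DOM
    await page.waitForSelector('table');

    const html = await page.content();
    const $ = cheerio.load(html);
    return scrapeSimpleTable($);
  } finally {
    await browser.close();
  }
}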
Always test your selectors and validate the scraped data structure before using it in production.