Cheerio is a fast, flexible, and lean implementation of core jQuery designed specifically for the server in Node.js. Scraping table data with Cheerio is straightforward and efficient, making it ideal for extracting structured data from HTML tables.
Prerequisites
Install the necessary packages:
# Modern approach with axios (recommended)
npm install cheerio axios
# Alternative with node-fetch
npm install cheerio node-fetch
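If you prefer node-fetch, the fetch-and-load step looks roughly like the sketch below. This is a minimal illustration, assuming node-fetch v2 (the last version that supports require()); the rest of this guide uses axios.
const fetch = require('node-fetch');
const cheerio = require('cheerio');

async function loadPage(url) {
  // Fetch the raw HTML, then hand it to Cheerio for parsing
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Request failed with status ${response.status}`);
  }
  const html = await response.text();
  return cheerio.load(html);
}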
Basic Table Scraping
Simple Table Example
Here's a complete example that scrapes a basic HTML table:
const axios = require('axios');
const cheerio = require('cheerio');

async function scrapeTable(url) {
  try {
    // Fetch the HTML content
    const { data } = await axios.get(url);

    // Load HTML into Cheerio
    const $ = cheerio.load(data);

    // Extract table data
    const tableData = [];
    $('table tr').each((index, element) => {
      const row = [];
      $(element).find('td, th').each((i, cell) => {
        row.push($(cell).text().trim());
      });
      if (row.length > 0) {
        tableData.push(row);
      }
    });

    return tableData;
  } catch (error) {
    console.error('Error scraping table:', error.message);
    throw error;
  }
}

// Usage
scrapeTable('https://example.com/table-page')
  .then(data => console.log(data))
  .catch(err => console.error(err));
Advanced Table Parsing
For more complex tables with headers and structured data:
const axios = require('axios');
const cheerio = require('cheerio');

async function scrapeTableWithHeaders(url, tableSelector = 'table') {
  try {
    const { data } = await axios.get(url);
    const $ = cheerio.load(data);
    const table = $(tableSelector).first();

    // Extract headers
    const headers = [];
    table.find('thead th, tr:first-child th').each((i, element) => {
      headers.push($(element).text().trim());
    });

    // Extract rows
    const rows = [];
    table.find('tbody tr, tr:not(:first-child)').each((i, element) => {
      const row = {};
      $(element).find('td').each((j, cell) => {
        const headerName = headers[j] || `column_${j}`;
        row[headerName] = $(cell).text().trim();
      });

      // Only add rows with data
      if (Object.keys(row).length > 0) {
        rows.push(row);
      }
    });

    return {
      headers,
      rows,
      count: rows.length
    };
  } catch (error) {
    console.error('Error scraping table:', error.message);
    throw error;
  }
}

// Usage
scrapeTableWithHeaders('https://example.com/data-table')
  .then(result => {
    console.log('Headers:', result.headers);
    console.log('Data:', result.rows);
    console.log('Total rows:', result.count);
  });
Handling Different Table Structures
Tables Without thead/tbody
function scrapeSimpleTable($, tableSelector = 'table') {
  const rows = [];

  $(tableSelector).find('tr').each((index, row) => {
    const cells = [];
    $(row).find('td, th').each((i, cell) => {
      cells.push($(cell).text().trim());
    });
    if (cells.length > 0) {
      rows.push(cells);
    }
  });

  return rows;
}
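This helper expects an already-loaded Cheerio instance, so it can be paired with any of the fetching approaches above. A usage sketch (the URL is a placeholder and the logged output is illustrative):
const axios = require('axios');
const cheerio = require('cheerio');

async function run() {
  const { data } = await axios.get('https://example.com/plain-table');
  const $ = cheerio.load(data);

  // With the default selector, this walks every table on the page
  const rows = scrapeSimpleTable($);
  console.log(rows); // e.g. [ ['Name', 'Age'], ['Alice', '30'], ... ]
}

run().catch(err => console.error(err.message));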
Tables with Specific Classes or IDs
function scrapeSpecificTable($, options = {}) {
  const {
    tableId,
    tableClass,
    skipFirstRow = false,
    extractAttributes = false
  } = options;

  let selector = 'table';
  if (tableId) selector = `#${tableId}`;
  if (tableClass) selector = `.${tableClass}`;

  const data = [];
  $(selector).find('tr').each((index, row) => {
    if (skipFirstRow && index === 0) return;

    const rowData = [];
    $(row).find('td, th').each((i, cell) => {
      const cellData = {
        text: $(cell).text().trim()
      };

      if (extractAttributes) {
        cellData.attributes = $(cell).attr();
      }

      rowData.push(cellData);
    });

    if (rowData.length > 0) {
      data.push(rowData);
    }
  });

  return data;
}
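As with the previous helper, this function operates on a loaded Cheerio instance. A usage sketch, assuming a page whose table carries the class results (both the URL and the class name are hypothetical):
const axios = require('axios');
const cheerio = require('cheerio');

async function run() {
  const { data } = await axios.get('https://example.com/report');
  const $ = cheerio.load(data);

  const rows = scrapeSpecificTable($, {
    tableClass: 'results',    // builds the selector '.results'
    skipFirstRow: true,       // treat the first row as a header row and skip it
    extractAttributes: true   // also capture each cell's HTML attributes
  });

  console.log(rows[0]); // first data row: an array of { text, attributes } objects
}

run().catch(err => console.error(err.message));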
Error Handling and Validation
const axios = require('axios');
const cheerio = require('cheerio');

async function robustTableScraper(url, options = {}) {
  const {
    timeout = 10000,
    retries = 3,
    tableSelector = 'table',
    validateTable = true
  } = options;

  for (let attempt = 1; attempt <= retries; attempt++) {
    try {
      const { data } = await axios.get(url, {
        timeout,
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; TableScraper/1.0)'
        }
      });

      const $ = cheerio.load(data);

      // Validate table exists
      if (validateTable && $(tableSelector).length === 0) {
        throw new Error(`No table found with selector: ${tableSelector}`);
      }

      const tableData = [];
      $(tableSelector).first().find('tr').each((index, row) => {
        const cells = [];
        $(row).find('td, th').each((i, cell) => {
          const text = $(cell).text().trim();
          const html = $(cell).html();
          cells.push({
            text,
            html,
            colspan: parseInt($(cell).attr('colspan') || '1', 10),
            rowspan: parseInt($(cell).attr('rowspan') || '1', 10)
          });
        });
        if (cells.length > 0) {
          tableData.push({
            rowIndex: index,
            cells
          });
        }
      });

      return tableData;
    } catch (error) {
      console.warn(`Attempt ${attempt} failed:`, error.message);

      if (attempt === retries) {
        throw new Error(`Failed to scrape table after ${retries} attempts: ${error.message}`);
      }

      // Wait before retrying (simple linear backoff)
      await new Promise(resolve => setTimeout(resolve, 1000 * attempt));
    }
  }
}
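A brief usage sketch for the robust scraper; the URL, the #prices selector, and the option values are placeholders:
robustTableScraper('https://example.com/flaky-table', {
  timeout: 15000,
  retries: 5,
  tableSelector: '#prices'
})
  .then(rows => console.log(`Scraped ${rows.length} rows`))
  .catch(err => console.error(err.message));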
Working with Complex Tables
Nested Tables
function scrapeNestedTables($) {
  const mainTables = [];

  $('table').each((index, table) => {
    const $table = $(table);

    // Skip tables inside other tables for main extraction
    if ($table.parents('table').length > 0) return;

    const tableData = {
      index,
      rows: [],
      nestedTables: []
    };

    $table.find('tr').each((rowIndex, row) => {
      const $row = $(row);
      const cells = [];

      $row.find('td, th').each((cellIndex, cell) => {
        const $cell = $(cell);
        const nestedTable = $cell.find('table');

        const cellData = {
          text: $cell.clone().children('table').remove().end().text().trim(),
          hasNestedTable: nestedTable.length > 0
        };

        if (nestedTable.length > 0) {
          cellData.nestedTableData = scrapeSimpleTable($, nestedTable);
        }

        cells.push(cellData);
      });

      tableData.rows.push(cells);
    });

    mainTables.push(tableData);
  });

  return mainTables;
}
Complete Working Example
const axios = require('axios');
const cheerio = require('cheerio');

class TableScraper {
  constructor(options = {}) {
    this.timeout = options.timeout || 10000;
    this.userAgent = options.userAgent || 'Mozilla/5.0 (compatible; TableScraper/1.0)';
  }

  async scrape(url, tableOptions = {}) {
    try {
      const { data } = await axios.get(url, {
        timeout: this.timeout,
        headers: { 'User-Agent': this.userAgent }
      });

      const $ = cheerio.load(data);
      return this.parseTable($, tableOptions);
    } catch (error) {
      throw new Error(`Failed to scrape ${url}: ${error.message}`);
    }
  }

  parseTable($, options = {}) {
    const {
      selector = 'table',
      includeHeaders = true,
      trimWhitespace = true,
      preserveStructure = false
    } = options;

    const tables = [];

    $(selector).each((tableIndex, table) => {
      const $table = $(table);
      const tableData = {
        index: tableIndex,
        headers: [],
        rows: []
      };

      // Extract headers if requested
      if (includeHeaders) {
        $table.find('thead th, tr:first-child th').each((i, header) => {
          const text = $(header).text();
          tableData.headers.push(trimWhitespace ? text.trim() : text);
        });
      }

      // Extract data rows
      const rowSelector = includeHeaders ? 'tbody tr, tr:not(:first-child)' : 'tr';
      $table.find(rowSelector).each((rowIndex, row) => {
        const rowData = preserveStructure ? {} : [];

        $(row).find('td, th').each((cellIndex, cell) => {
          const text = $(cell).text();
          const cellValue = trimWhitespace ? text.trim() : text;

          if (preserveStructure) {
            // Key by header name when available, falling back to the column index
            rowData[tableData.headers[cellIndex] || cellIndex] = cellValue;
          } else {
            rowData.push(cellValue);
          }
        });

        if ((Array.isArray(rowData) && rowData.length > 0) ||
            (!Array.isArray(rowData) && Object.keys(rowData).length > 0)) {
          tableData.rows.push(rowData);
        }
      });

      tables.push(tableData);
    });

    return tables.length === 1 ? tables[0] : tables;
  }
}

// Usage example
async function main() {
  const scraper = new TableScraper();

  try {
    const result = await scraper.scrape('https://example.com/data-table', {
      includeHeaders: true,
      preserveStructure: true
    });

    console.log('Table headers:', result.headers);
    console.log('First row:', result.rows[0]);
    console.log('Total rows:', result.rows.length);
  } catch (error) {
    console.error('Scraping failed:', error.message);
  }
}

main();
Best Practices
- Use Modern HTTP Clients: Prefer axios or node-fetch over the deprecated request-promise
- Handle Errors Gracefully: Always wrap requests in try-catch blocks
- Validate Selectors: Check if elements exist before processing
- Respect Rate Limits: Add delays between requests when scraping multiple pages (see the delay sketch after this list)
- Clean Data: Trim whitespace and handle empty cells appropriately
- Preserve Structure: Consider returning objects with named properties instead of arrays
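For the rate-limiting point above, one simple approach is a fixed delay between sequential requests. A minimal sketch that reuses the scrapeSimpleTable helper from earlier; the URLs and the one-second interval are illustrative, not a recommendation for any particular site:
const axios = require('axios');
const cheerio = require('cheerio');

const delay = ms => new Promise(resolve => setTimeout(resolve, ms));

async function scrapePages(urls) {
  const results = [];
  for (const url of urls) {
    const { data } = await axios.get(url);
    const $ = cheerio.load(data);
    results.push(scrapeSimpleTable($)); // helper defined earlier in this guide
    await delay(1000); // pause one second before the next request
  }
  return results;
}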
Common Issues and Solutions
- Empty Results: Check whether the table uses thead/tbody or specific classes
- Missing Data: Verify the selector matches the actual HTML structure
- Memory Issues: Process large tables in chunks or streams
- Dynamic Content: Use Puppeteer for JavaScript-rendered tables (see the sketch after this list)
- Encoding Problems: Specify proper encoding in axios requests
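For JavaScript-rendered tables, a common pattern is to let Puppeteer render the page and then hand the resulting HTML to Cheerio. A minimal sketch, assuming Puppeteer is installed (npm install puppeteer) and that the table appears once rendering finishes; it reuses the scrapeSimpleTable helper from earlier:
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');

async function scrapeRenderedTable(url) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle0' });

    // Wait until at least one table element exists in the rendered DOM
    await page.waitForSelector('table');

    const html = await page.content();
    const $ = cheerio.load(html);
    return scrapeSimpleTable($);
  } finally {
    await browser.close();
  }
}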
Always test your selectors and validate the scraped data structure before using it in production.