How do you use Cheerio to scrape data from a table?

Cheerio is a fast, flexible, and lean implementation of core jQuery designed specifically for the server in Node.js. Scraping table data with Cheerio is straightforward and efficient, making it ideal for extracting structured data from HTML tables.

Prerequisites

Install the necessary packages:

# Modern approach with axios (recommended)
npm install cheerio axios

# Alternative with node-fetch
npm install cheerio node-fetch

Basic Table Scraping

Simple Table Example

Here's a complete example that scrapes a basic HTML table:

const axios = require('axios');
const cheerio = require('cheerio');

/**
 * Fetch a page and extract every row of every <table> on it.
 *
 * @param {string} url - Page to download and parse.
 * @returns {Promise<string[][]>} One array of trimmed cell strings per non-empty row.
 * @throws Re-throws any network or parse error after logging it.
 */
async function scrapeTable(url) {
  try {
    // Fetch the HTML content
    const response = await axios.get(url);

    // Load HTML into Cheerio
    const $ = cheerio.load(response.data);

    const tableData = [];

    // Walk every row of every table; rows without cells are skipped.
    $('table tr').each((rowIdx, rowEl) => {
      const cells = $(rowEl)
        .find('td, th')
        .map((cellIdx, cellEl) => $(cellEl).text().trim())
        .get();

      if (cells.length > 0) {
        tableData.push(cells);
      }
    });

    return tableData;
  } catch (error) {
    console.error('Error scraping table:', error.message);
    throw error;
  }
}

// Usage
(async () => {
  try {
    console.log(await scrapeTable('https://example.com/table-page'));
  } catch (err) {
    console.error(err);
  }
})();

Advanced Table Parsing

For more complex tables with headers and structured data:

const axios = require('axios');
const cheerio = require('cheerio');

/**
 * Scrape the first table matching `tableSelector`, returning each data row as
 * an object keyed by the table's column headers.
 *
 * @param {string} url - Page to download.
 * @param {string} [tableSelector='table'] - CSS selector for the target table.
 * @returns {Promise<{headers: string[], rows: Object[], count: number}>}
 * @throws Re-throws any network or parse error after logging it.
 */
async function scrapeTableWithHeaders(url, tableSelector = 'table') {
  try {
    const { data } = await axios.get(url);
    const $ = cheerio.load(data);

    const $table = $(tableSelector).first();

    // Header cells may live in a <thead> or in the first row of a bare table.
    const headers = $table
      .find('thead th, tr:first-child th')
      .map((i, el) => $(el).text().trim())
      .get();

    const rows = [];
    $table.find('tbody tr, tr:not(:first-child)').each((i, rowEl) => {
      const record = {};

      $(rowEl).find('td').each((colIdx, cellEl) => {
        // Fall back to a positional name when the row has more cells
        // than there are headers.
        const key = headers[colIdx] || `column_${colIdx}`;
        record[key] = $(cellEl).text().trim();
      });

      // Only keep rows that produced at least one cell.
      if (Object.keys(record).length > 0) {
        rows.push(record);
      }
    });

    return {
      headers,
      rows,
      count: rows.length
    };
  } catch (error) {
    console.error('Error scraping table:', error.message);
    throw error;
  }
}

// Usage
scrapeTableWithHeaders('https://example.com/data-table')
  .then(result => {
    console.log('Headers:', result.headers);
    console.log('Data:', result.rows);
    console.log('Total rows:', result.count);
  })
  // The scraper re-throws on failure, so an unterminated chain would produce
  // an unhandled rejection (fatal in modern Node) — always end with a catch.
  .catch(err => console.error('Scraping failed:', err.message));

Handling Different Table Structures

Tables Without thead/tbody

/**
 * Extract every non-empty row of a table as an array of trimmed cell strings.
 * Works on tables with or without <thead>/<tbody> wrappers.
 *
 * @param {Function} $ - A loaded Cheerio instance.
 * @param {string} [tableSelector='table'] - CSS selector for the table(s).
 * @returns {string[][]} One entry per row that contains at least one cell.
 */
function scrapeSimpleTable($, tableSelector = 'table') {
  const rows = [];

  $(tableSelector).find('tr').each((rowIdx, rowEl) => {
    const cells = $(rowEl)
      .find('td, th')
      .map((cellIdx, cellEl) => $(cellEl).text().trim())
      .get();

    // Skip structural rows that carry no cells at all.
    if (cells.length > 0) {
      rows.push(cells);
    }
  });

  return rows;
}

Tables with Specific Classes or IDs

/**
 * Scrape a table located by id or class, returning per-cell objects.
 *
 * @param {Function} $ - A loaded Cheerio instance.
 * @param {Object} [options]
 * @param {string} [options.tableId] - Match `#tableId` instead of any table.
 * @param {string} [options.tableClass] - Match `.tableClass`; wins over
 *   `tableId` when both are supplied (last assignment applies).
 * @param {boolean} [options.skipFirstRow=false] - Drop the first row (e.g. headers).
 * @param {boolean} [options.extractAttributes=false] - Also capture each cell's attributes.
 * @returns {Array<Array<{text: string, attributes?: Object}>>}
 */
function scrapeSpecificTable($, options = {}) {
  const {
    tableId,
    tableClass,
    skipFirstRow = false,
    extractAttributes = false
  } = options;

  let selector = 'table';
  if (tableId) selector = `#${tableId}`;
  if (tableClass) selector = `.${tableClass}`;

  const data = [];

  $(selector).find('tr').each((rowIdx, rowEl) => {
    if (skipFirstRow && rowIdx === 0) return;

    const rowData = $(rowEl)
      .find('td, th')
      .map((cellIdx, cellEl) => {
        const entry = { text: $(cellEl).text().trim() };
        if (extractAttributes) {
          entry.attributes = $(cellEl).attr();
        }
        return entry;
      })
      .get();

    if (rowData.length > 0) {
      data.push(rowData);
    }
  });

  return data;
}

Error Handling and Validation

const axios = require('axios');
const cheerio = require('cheerio');

/**
 * Download `url` and scrape the first matching table, retrying transient
 * failures with linear backoff (1s, 2s, 3s, …).
 *
 * @param {string} url - Page to download.
 * @param {Object} [options]
 * @param {number} [options.timeout=10000] - Per-request timeout in ms.
 * @param {number} [options.retries=3] - Total attempts before giving up.
 * @param {string} [options.tableSelector='table'] - CSS selector for the table.
 * @param {boolean} [options.validateTable=true] - Throw if no table matches.
 * @returns {Promise<Array<{rowIndex: number, cells: Object[]}>>}
 * @throws {Error} When validation finds no table, or after all attempts fail.
 */
async function robustTableScraper(url, options = {}) {
  const {
    timeout = 10000,
    retries = 3,
    tableSelector = 'table',
    validateTable = true
  } = options;

  for (let attempt = 1; attempt <= retries; attempt++) {
    try {
      const { data } = await axios.get(url, { 
        timeout,
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; TableScraper/1.0)'
        }
      });

      const $ = cheerio.load(data);

      // Fail fast (and trigger a retry) when the page has no matching table.
      if (validateTable && $(tableSelector).length === 0) {
        throw new Error(`No table found with selector: ${tableSelector}`);
      }

      const tableData = [];

      $(tableSelector).first().find('tr').each((index, row) => {
        const cells = [];
        $(row).find('td, th').each((i, cell) => {
          const text = $(cell).text().trim();
          const html = $(cell).html();

          cells.push({
            text,
            html,
            // Normalize span attributes to numbers. Previously a present
            // attribute came back as a string ("2") while a missing one was
            // the number 1, giving consumers inconsistent types.
            colspan: Number.parseInt($(cell).attr('colspan'), 10) || 1,
            rowspan: Number.parseInt($(cell).attr('rowspan'), 10) || 1
          });
        });

        if (cells.length > 0) {
          tableData.push({
            rowIndex: index,
            cells
          });
        }
      });

      return tableData;

    } catch (error) {
      console.warn(`Attempt ${attempt} failed:`, error.message);

      if (attempt === retries) {
        // Preserve the original failure for callers inspecting err.cause.
        throw new Error(
          `Failed to scrape table after ${retries} attempts: ${error.message}`,
          { cause: error }
        );
      }

      // Wait before retry; back off linearly with the attempt number.
      await new Promise(resolve => setTimeout(resolve, 1000 * attempt));
    }
  }
}

Working with Complex Tables

Nested Tables

/**
 * Extract every top-level table on the page, recursing into tables that are
 * nested inside its cells.
 *
 * @param {Function} $ - A loaded Cheerio instance.
 * @returns {Array<{index: number, rows: Array, nestedTables: Array}>}
 */
function scrapeNestedTables($) {
  const mainTables = [];

  $('table').each((index, table) => {
    const $table = $(table);

    // Skip tables inside other tables for main extraction; their contents
    // are captured through the cell that contains them.
    if ($table.parents('table').length > 0) return;

    const tableData = {
      index,
      rows: [],
      nestedTables: []
    };

    $table.find('tr').each((rowIndex, row) => {
      const $row = $(row);
      const cells = [];

      $row.find('td, th').each((cellIndex, cell) => {
        const $cell = $(cell);
        const nestedTable = $cell.find('table');

        const cellData = {
          // Strip nested tables (at any depth) from a clone before reading
          // the cell's own text. Detection above uses find(), so the removal
          // must too — children('table') missed tables wrapped in
          // intermediate elements, leaking their text into `text`.
          text: $cell.clone().find('table').remove().end().text().trim(),
          hasNestedTable: nestedTable.length > 0
        };

        if (nestedTable.length > 0) {
          cellData.nestedTableData = scrapeSimpleTable($, nestedTable);
        }

        cells.push(cellData);
      });

      tableData.rows.push(cells);
    });

    mainTables.push(tableData);
  });

  return mainTables;
}

Complete Working Example

const axios = require('axios');
const cheerio = require('cheerio');

/**
 * Reusable table scraper: fetches pages with axios and parses their tables
 * with Cheerio.
 */
class TableScraper {
  /**
   * @param {Object} [options]
   * @param {number} [options.timeout=10000] - Request timeout in ms.
   * @param {string} [options.userAgent] - User-Agent header to send.
   */
  constructor(options = {}) {
    this.timeout = options.timeout || 10000;
    this.userAgent = options.userAgent || 'Mozilla/5.0 (compatible; TableScraper/1.0)';
  }

  /**
   * Download `url` and parse its tables.
   *
   * @param {string} url - Page to download.
   * @param {Object} [tableOptions] - Forwarded to {@link TableScraper#parseTable}.
   * @returns {Promise<Object|Object[]>} A single table object, or an array
   *   when the page contains several matching tables.
   * @throws {Error} Wraps any network or parse failure with the URL; the
   *   original error is preserved on `cause`.
   */
  async scrape(url, tableOptions = {}) {
    try {
      const { data } = await axios.get(url, {
        timeout: this.timeout,
        headers: { 'User-Agent': this.userAgent }
      });

      const $ = cheerio.load(data);
      return this.parseTable($, tableOptions);

    } catch (error) {
      throw new Error(`Failed to scrape ${url}: ${error.message}`, { cause: error });
    }
  }

  /**
   * Parse every table matching `selector` in a loaded Cheerio document.
   *
   * @param {Function} $ - A loaded Cheerio instance.
   * @param {Object} [options]
   * @param {string} [options.selector='table'] - CSS selector for tables.
   * @param {boolean} [options.includeHeaders=true] - Read header cells and
   *   skip the header row when collecting data rows.
   * @param {boolean} [options.trimWhitespace=true] - Trim each cell's text.
   * @param {boolean} [options.preserveStructure=false] - Return rows as
   *   objects keyed by header name instead of positional arrays.
   * @returns {Object|Object[]} One `{index, headers, rows}` object per table;
   *   unwrapped when exactly one table matched.
   */
  parseTable($, options = {}) {
    const {
      selector = 'table',
      includeHeaders = true,
      trimWhitespace = true,
      preserveStructure = false
    } = options;

    const tables = [];

    $(selector).each((tableIndex, table) => {
      const $table = $(table);
      const tableData = {
        index: tableIndex,
        headers: [],
        rows: []
      };

      // Extract headers if requested
      if (includeHeaders) {
        $table.find('thead th, tr:first-child th').each((i, header) => {
          const text = $(header).text();
          tableData.headers.push(trimWhitespace ? text.trim() : text);
        });
      }

      // When headers were consumed, exclude the header row from the data rows.
      const rowSelector = includeHeaders ? 'tbody tr, tr:not(:first-child)' : 'tr';
      $table.find(rowSelector).each((rowIndex, row) => {
        const rowData = preserveStructure ? {} : [];

        $(row).find('td, th').each((cellIndex, cell) => {
          const text = $(cell).text();
          const cellValue = trimWhitespace ? text.trim() : text;

          if (preserveStructure && tableData.headers[cellIndex]) {
            rowData[tableData.headers[cellIndex]] = cellValue;
          } else if (Array.isArray(rowData)) {
            rowData.push(cellValue);
          } else {
            // preserveStructure row wider than the header list: fall back to
            // a positional key so no cell is silently dropped.
            rowData[cellIndex] = cellValue;
          }
        });

        // Keep only rows that produced at least one cell.
        const hasData = Array.isArray(rowData)
          ? rowData.length > 0
          : Object.keys(rowData).length > 0;
        if (hasData) {
          tableData.rows.push(rowData);
        }
      });

      tables.push(tableData);
    });

    return tables.length === 1 ? tables[0] : tables;
  }
}

// Usage example
async function main() {
  const scraper = new TableScraper();

  try {
    const { headers, rows } = await scraper.scrape('https://example.com/data-table', {
      includeHeaders: true,
      preserveStructure: true
    });

    console.log('Table headers:', headers);
    console.log('First row:', rows[0]);
    console.log('Total rows:', rows.length);
  } catch (error) {
    console.error('Scraping failed:', error.message);
  }
}

main();

Best Practices

  1. Use Modern HTTP Clients: Prefer axios or node-fetch over deprecated request-promise
  2. Handle Errors Gracefully: Always wrap requests in try-catch blocks
  3. Validate Selectors: Check if elements exist before processing
  4. Respect Rate Limits: Add delays between requests when scraping multiple pages
  5. Clean Data: Trim whitespace and handle empty cells appropriately
  6. Preserve Structure: Consider returning objects with named properties instead of arrays

Common Issues and Solutions

  • Empty Results: Check if the table uses thead/tbody or specific classes
  • Missing Data: Verify the selector matches the actual HTML structure
  • Memory Issues: Process large tables in chunks or streams
  • Dynamic Content: Use Puppeteer for JavaScript-rendered tables
  • Encoding Problems: Specify proper encoding in axios requests

Always test your selectors and validate the scraped data structure before using it in production.

Related Questions

Get Started Now

WebScraping.AI provides rotating proxies, Chromium rendering, and a built-in HTML parser for web scraping
Icon