How can I scrape data from a table using Selenium?

Scraping data from HTML tables is a common web scraping task. Selenium provides powerful tools to locate, extract, and process table data across different browsers. This guide covers both Python and JavaScript implementations with practical examples.
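The snippets below assume Selenium 4 or newer (pip install selenium for Python, npm install selenium-webdriver for Node.js); since Selenium 4.6, Selenium Manager downloads a matching browser driver automatically, so a separate ChromeDriver installation is usually not needed.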

Sample HTML Table

Let's work with this example table structure:

<table id="employeeTable" class="data-table">
    <thead>
        <tr>
            <th>Name</th>
            <th>Position</th>
            <th>Salary</th>
            <th>Department</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>John Doe</td>
            <td>Software Engineer</td>
            <td>$75,000</td>
            <td>IT</td>
        </tr>
        <tr>
            <td>Jane Smith</td>
            <td>Marketing Manager</td>
            <td>$65,000</td>
            <td>Marketing</td>
        </tr>
        <tr>
            <td>Bob Johnson</td>
            <td>Data Analyst</td>
            <td>$60,000</td>
            <td>IT</td>
        </tr>
    </tbody>
</table>

Python Implementation

Basic Table Scraping

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Setup WebDriver (Chrome recommended for stability)
driver = webdriver.Chrome()
driver.get('https://example.com/employees')

try:
    # Wait for table to load
    table = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "employeeTable"))
    )

    # Get all rows (excluding header)
    rows = table.find_elements(By.XPATH, ".//tbody/tr")

    # Extract data
    employee_data = []
    for row in rows:
        cols = row.find_elements(By.TAG_NAME, "td")
        if cols:  # Skip empty rows
            employee = {
                'name': cols[0].text,
                'position': cols[1].text,
                'salary': cols[2].text,
                'department': cols[3].text
            }
            employee_data.append(employee)

    # Print results
    for employee in employee_data:
        print(f"{employee['name']} - {employee['position']} - {employee['salary']}")

finally:
    driver.quit()
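
If you also want to persist the results, a minimal follow-up (reusing the employee_data list built above and the standard csv module) could write the rows out like this:

import csv

# Write the scraped rows to a CSV file
with open('employees.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=['name', 'position', 'salary', 'department'])
    writer.writeheader()
    writer.writerows(employee_data)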

Advanced Table Handling with Error Checking

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

def scrape_table_data(url, table_selector, output_file=None):
    """
    Scrape table data with comprehensive error handling
    """
    driver = webdriver.Chrome()

    try:
        driver.get(url)

        # Wait for table to be present
        table = WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, table_selector))
        )

        # Extract headers
        headers = []
        try:
            header_row = table.find_element(By.XPATH, ".//thead/tr")
            header_cells = header_row.find_elements(By.TAG_NAME, "th")
            headers = [cell.text.strip() for cell in header_cells]
        except NoSuchElementException:
            # Fallback: use the first row as headers (th cells, or td if there are none)
            first_row = table.find_element(By.XPATH, ".//tr[1]")
            header_cells = first_row.find_elements(By.TAG_NAME, "th") or first_row.find_elements(By.TAG_NAME, "td")
            headers = [cell.text.strip() for cell in header_cells]

        # Extract data rows
        data_rows = table.find_elements(By.XPATH, ".//tbody/tr")
        if not data_rows:  # Fallback if no tbody
            data_rows = table.find_elements(By.XPATH, ".//tr[position()>1]")

        table_data = []
        for row in data_rows:
            cells = row.find_elements(By.TAG_NAME, "td")
            if cells and len(cells) == len(headers):
                row_data = {}
                for i, cell in enumerate(cells):
                    row_data[headers[i]] = cell.text.strip()
                table_data.append(row_data)

        # Save to CSV if specified
        if output_file and table_data:
            import csv
            with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=headers)
                writer.writeheader()
                writer.writerows(table_data)

        return table_data

    except TimeoutException:
        print("Timeout: table not found within 15 seconds")
        return []
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return []
    finally:
        driver.quit()

# Usage
data = scrape_table_data('https://example.com/employees', '#employeeTable', 'employees.csv')
print(f"Scraped {len(data)} rows of data")
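
As an alternative to walking the rows manually, you can hand the page source that Selenium has already rendered to pandas and let read_html build a DataFrame. This is only a sketch and assumes pandas plus an HTML parser such as lxml are installed; the URL and table id are the example ones from above:

from io import StringIO

import pandas as pd
from selenium import webdriver

driver = webdriver.Chrome()
try:
    driver.get('https://example.com/employees')
    # Parse the rendered HTML and keep only the table with our id
    tables = pd.read_html(StringIO(driver.page_source), attrs={'id': 'employeeTable'})
    print(tables[0])  # a DataFrame with Name, Position, Salary, Department columns
finally:
    driver.quit()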

JavaScript Implementation

Basic Node.js Script

const {Builder, By, until} = require('selenium-webdriver');

async function scrapeTableData() {
    let driver = await new Builder().forBrowser('chrome').build();

    try {
        await driver.get('https://example.com/employees');

        // Wait for table to load
        await driver.wait(until.elementLocated(By.id('employeeTable')), 10000);

        const table = await driver.findElement(By.id('employeeTable'));
        const rows = await table.findElements(By.xpath('.//tbody/tr'));

        const employeeData = [];

        for (let row of rows) {
            const cols = await row.findElements(By.tagName('td'));

            if (cols.length > 0) {
                const employee = {
                    name: await cols[0].getText(),
                    position: await cols[1].getText(),
                    salary: await cols[2].getText(),
                    department: await cols[3].getText()
                };
                employeeData.push(employee);
            }
        }

        // Display results
        employeeData.forEach(employee => {
            console.log(`${employee.name} - ${employee.position} - ${employee.salary}`);
        });

        return employeeData;

    } finally {
        await driver.quit();
    }
}

// Run the scraper
scrapeTableData().catch(console.error);

Advanced JavaScript Implementation

const {Builder, By, until} = require('selenium-webdriver');
const fs = require('fs');

class TableScraper {
    constructor(browserName = 'chrome') {
        this.driver = null;
        this.browserName = browserName;
    }

    async init() {
        this.driver = await new Builder().forBrowser(this.browserName).build();
    }

    async scrapeTable(url, tableSelector, options = {}) {
        if (!this.driver) await this.init();

        try {
            await this.driver.get(url);

            // Wait for table with timeout
            const timeout = options.timeout || 15000;
            await this.driver.wait(until.elementLocated(By.css(tableSelector)), timeout);

            const table = await this.driver.findElement(By.css(tableSelector));

            // Extract headers
            let headers = [];
            try {
                const headerRow = await table.findElement(By.xpath('.//thead/tr'));
                const headerCells = await headerRow.findElements(By.tagName('th'));
                headers = await Promise.all(headerCells.map(cell => cell.getText()));
            } catch (e) {
                // Fallback: use the first row as headers (th cells, or td if there are none)
                const firstRow = await table.findElement(By.xpath('.//tr[1]'));
                let cells = await firstRow.findElements(By.tagName('th'));
                if (cells.length === 0) {
                    cells = await firstRow.findElements(By.tagName('td'));
                }
                headers = await Promise.all(cells.map(cell => cell.getText()));
            }

            // Extract data rows (findElements resolves to an empty array rather
            // than throwing, so check the length instead of catching an error)
            let dataRows = await table.findElements(By.xpath('.//tbody/tr'));
            if (dataRows.length === 0) {
                dataRows = await table.findElements(By.xpath('.//tr[position()>1]'));
            }

            const tableData = [];
            for (let row of dataRows) {
                const cells = await row.findElements(By.tagName('td'));
                if (cells.length === headers.length) {
                    const rowData = {};
                    const cellTexts = await Promise.all(cells.map(cell => cell.getText()));

                    headers.forEach((header, index) => {
                        rowData[header.trim()] = cellTexts[index].trim();
                    });

                    tableData.push(rowData);
                }
            }

            // Save to JSON if specified
            if (options.outputFile) {
                fs.writeFileSync(options.outputFile, JSON.stringify(tableData, null, 2));
            }

            return tableData;

        } catch (error) {
            console.error('Scraping error:', error.message);
            return [];
        }
    }

    async close() {
        if (this.driver) {
            await this.driver.quit();
        }
    }
}

// Usage
async function main() {
    const scraper = new TableScraper();

    try {
        const data = await scraper.scrapeTable(
            'https://example.com/employees',
            '#employeeTable',
            { outputFile: 'employees.json', timeout: 20000 }
        );

        console.log(`Successfully scraped ${data.length} rows`);
    } finally {
        await scraper.close();
    }
}

main().catch(console.error);

Alternative Locator Strategies

When tables don't have IDs, use these alternative selectors:

# By class name
table = driver.find_element(By.CLASS_NAME, "data-table")

# By CSS selector
table = driver.find_element(By.CSS_SELECTOR, "table.employee-list")

# By XPath (most flexible)
table = driver.find_element(By.XPATH, "//table[contains(@class, 'data')]")

# By position (nth table on page)
table = driver.find_element(By.XPATH, "(//table)[2]")  # Second table
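
If none of the table's attributes are distinctive, anchoring on visible header text is another option; the 'Salary' header below is just an illustration taken from the sample table:

# By header text (finds the table containing a <th> that reads "Salary")
table = driver.find_element(By.XPATH, "//table[.//th[normalize-space()='Salary']]")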

Handling Dynamic Tables

For tables that load data via AJAX:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait for specific number of rows
WebDriverWait(driver, 10).until(
    lambda d: len(d.find_elements(By.XPATH, "//table[@id='myTable']//tbody/tr")) >= 5
)

# Wait for loading indicator to disappear
WebDriverWait(driver, 10).until_not(
    EC.presence_of_element_located((By.CLASS_NAME, "loading-spinner"))
)
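
Dynamic tables can also re-render between the moment you locate a row and the moment you read it, which raises StaleElementReferenceException. One hedged way to cope is to re-locate the rows and retry the read; the table id below is the sample one used throughout this guide:

from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException

def read_table_rows(driver, retries=3):
    """Re-locate and re-read the rows if the table re-renders mid-read."""
    for attempt in range(retries):
        try:
            rows = driver.find_elements(By.XPATH, "//table[@id='employeeTable']//tbody/tr")
            return [
                [cell.text.strip() for cell in row.find_elements(By.TAG_NAME, "td")]
                for row in rows
            ]
        except StaleElementReferenceException:
            if attempt == retries - 1:
                raise
    return []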

Best Practices

  1. Always use explicit waits instead of time.sleep() (several of these practices are combined in the sketch after this list)
  2. Handle missing elements gracefully with try-catch blocks
  3. Use specific selectors to avoid selecting wrong tables
  4. Clean extracted text by stripping whitespace
  5. Validate data structure before processing
  6. Implement proper error handling for network issues
  7. Close browser instances to free resources
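
A compact sketch combining several of these practices (an explicit wait, whitespace cleanup, and guaranteed browser cleanup); the URL and table id are the example ones used above:

from contextlib import contextmanager

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

@contextmanager
def chrome_session():
    """Yield a Chrome driver and always quit it, even if scraping fails."""
    driver = webdriver.Chrome()
    try:
        yield driver
    finally:
        driver.quit()

with chrome_session() as driver:
    driver.get('https://example.com/employees')
    table = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "employeeTable"))
    )
    rows = table.find_elements(By.XPATH, ".//tbody/tr")
    data = [
        [cell.text.strip() for cell in row.find_elements(By.TAG_NAME, "td")]
        for row in rows
    ]
    print(data)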

Ethical Considerations

  • Check robots.txt before scraping: https://example.com/robots.txt
  • Respect rate limits and don't overload servers
  • Consider using official APIs when available
  • Be mindful of website terms of service
  • Implement delays between requests for courtesy (a simple approach is sketched below)
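
A small illustration of the last point; the paginated URLs are purely hypothetical, and driver is the WebDriver instance from the earlier examples:

import random
import time

for url in ['https://example.com/employees?page=1', 'https://example.com/employees?page=2']:
    driver.get(url)
    # ... scrape the table on this page ...
    time.sleep(random.uniform(2, 5))  # polite, randomized pause between requests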

Try WebScraping.AI for Your Web Scraping Needs

Looking for a powerful web scraping solution? WebScraping.AI provides an LLM-powered API that combines Chromium JavaScript rendering with rotating proxies for reliable data extraction.

Key Features:

  • AI-powered extraction: Ask questions about web pages or extract structured data fields
  • JavaScript rendering: Full Chromium browser support for dynamic content
  • Rotating proxies: Datacenter and residential proxies from multiple countries
  • Easy integration: Simple REST API with SDKs for Python, Ruby, PHP, and more
  • Reliable & scalable: Built for developers who need consistent results

Getting Started:

Get page content with AI analysis:

curl "https://api.webscraping.ai/ai/question?url=https://example.com&question=What%20is%20the%20main%20topic%3F&api_key=YOUR_API_KEY"

Extract structured data (the -g flag keeps curl from treating the square brackets as a glob pattern):

curl -g "https://api.webscraping.ai/ai/fields?url=https://example.com&fields[title]=Page%20title&fields[price]=Product%20price&api_key=YOUR_API_KEY"
