How can I scrape data from a table using Selenium WebDriver?

Scraping data from HTML tables using Selenium WebDriver is a common web scraping task that involves locating table elements, iterating through rows and columns, and extracting cell data. This guide covers modern approaches with comprehensive examples.

Python Implementation

Setup and Installation

First, install Selenium and set up WebDriver Manager for automatic driver management:

pip install selenium webdriver-manager pandas

Basic Table Scraping

Here's a complete example using modern Selenium 4 syntax:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

# Set up Chrome driver with automatic management
service = Service(ChromeDriverManager().install())
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # Optional: run in background
driver = webdriver.Chrome(service=service, options=options)

try:
    # Navigate to the page
    driver.get('https://example.com/page-with-table')

    # Wait for table to load (optional but recommended)
    wait = WebDriverWait(driver, 10)
    table = wait.until(EC.presence_of_element_located((By.TAG_NAME, 'table')))

    # Extract headers
    headers = []
    header_elements = table.find_elements(By.TAG_NAME, 'th')
    if header_elements:
        headers = [header.text.strip() for header in header_elements]

    # Extract table data
    rows = table.find_elements(By.TAG_NAME, 'tr')
    table_data = []

    for row in rows:
        cells = row.find_elements(By.TAG_NAME, 'td')
        if cells:  # Skip header rows that only contain th elements
            row_data = [cell.text.strip() for cell in cells]
            table_data.append(row_data)

    # Create DataFrame for easy data manipulation
    if headers and table_data:
        df = pd.DataFrame(table_data, columns=headers)
    else:
        df = pd.DataFrame(table_data)

    print(df)

    # Save to CSV
    df.to_csv('scraped_table.csv', index=False)

except Exception as e:
    print(f"Error scraping table: {e}")
finally:
    driver.quit()

Advanced Table Scraping with Multiple Selectors

For more complex tables with specific CSS classes or IDs:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def scrape_table_by_selector(driver, table_selector):
    """Scrape table data using a specific CSS selector"""
    try:
        wait = WebDriverWait(driver, 10)
        table = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, table_selector)))

        # Extract headers from thead section
        headers = []
        thead_elements = table.find_elements(By.TAG_NAME, 'thead')
        if thead_elements:
            header_cells = thead_elements[0].find_elements(By.TAG_NAME, 'th')
            headers = [cell.text.strip() for cell in header_cells]

        # Extract body data from tbody section
        tbody = table.find_element(By.TAG_NAME, 'tbody')
        rows = tbody.find_elements(By.TAG_NAME, 'tr')

        table_data = []
        for row in rows:
            cells = row.find_elements(By.TAG_NAME, 'td')
            row_data = []

            for cell in cells:
                # Handle nested elements like links or spans
                links = cell.find_elements(By.TAG_NAME, 'a')
                if links:
                    row_data.append(links[0].get_attribute('href'))
                else:
                    row_data.append(cell.text.strip())

            table_data.append(row_data)

        return headers, table_data

    except Exception as e:
        print(f"Error scraping table: {e}")
        return [], []

# Usage examples
driver = webdriver.Chrome()
driver.get('https://example.com')

# Scrape table by ID
headers, data = scrape_table_by_selector(driver, '#data-table')

# Scrape table by class
headers, data = scrape_table_by_selector(driver, '.results-table')

# Scrape nested table
headers, data = scrape_table_by_selector(driver, 'div.container table.striped')

driver.quit()
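
As in the basic example, the returned header and row lists can be combined into a pandas DataFrame for further processing; a minimal sketch (the CSV filename is arbitrary):

import pandas as pd

# Combine the lists returned by scrape_table_by_selector into a DataFrame
if data:
    # Only use the headers as column names if they match the row width
    columns = headers if headers and len(headers) == len(data[0]) else None
    df = pd.DataFrame(data, columns=columns)
    df.to_csv('advanced_table.csv', index=False)
    print(df.head())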

Handling Pagination

For tables with pagination:

def scrape_paginated_table(driver, base_url, max_pages=None):
    """Scrape data from paginated tables"""
    all_data = []
    page = 1

    while True:
        try:
            # Navigate to the current page
            if page == 1:
                driver.get(base_url)
            else:
                # Find the next-page link; stop cleanly when it is missing or disabled
                next_buttons = driver.find_elements(By.CSS_SELECTOR, 'a.next')
                if not next_buttons or not next_buttons[0].is_enabled():
                    break
                old_table = driver.find_element(By.TAG_NAME, 'table')
                next_buttons[0].click()
                # Wait for the previous table to go stale so we don't re-read the old page
                # (assumes clicking "next" replaces the table element)
                WebDriverWait(driver, 10).until(EC.staleness_of(old_table))

            # Wait for the table on the current page to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, 'table'))
            )

            # Extract current page data
            table = driver.find_element(By.TAG_NAME, 'table')
            rows = table.find_elements(By.TAG_NAME, 'tr')[1:]  # Skip header

            for row in rows:
                cells = row.find_elements(By.TAG_NAME, 'td')
                row_data = [cell.text.strip() for cell in cells]
                all_data.append(row_data)

            print(f"Scraped page {page}")
            page += 1

            if max_pages and page > max_pages:
                break

        except Exception as e:
            print(f"Error on page {page}: {e}")
            break

    return all_data
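
A usage sketch for the helper above; the URL and the a.next selector used inside the function are placeholders, so adjust them to the target site's pagination markup:

driver = webdriver.Chrome()
try:
    # Placeholder URL; replace with the paginated page you want to scrape
    rows = scrape_paginated_table(driver, 'https://example.com/table-page', max_pages=5)
    print(f"Collected {len(rows)} rows across all pages")
finally:
    driver.quit()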

JavaScript Implementation

Using Selenium WebDriver for Node.js

First, install the package:

npm install selenium-webdriver

Then scrape the table with async/await:

const { Builder, By, until } = require('selenium-webdriver');
const chrome = require('selenium-webdriver/chrome');

async function scrapeTable() {
    // Set up Chrome options
    const options = new chrome.Options();
    options.addArguments('--headless');

    const driver = await new Builder()
        .forBrowser('chrome')
        .setChromeOptions(options)
        .build();

    try {
        await driver.get('https://example.com/page-with-table');

        // Wait for table to be present
        const table = await driver.wait(
            until.elementLocated(By.css('table')), 
            10000
        );

        // Extract headers
        const headerElements = await table.findElements(By.css('th'));
        const headers = await Promise.all(
            headerElements.map(el => el.getText())
        );

        // Extract table rows
        const rows = await table.findElements(By.css('tbody tr'));
        const tableData = [];

        for (const row of rows) {
            const cells = await row.findElements(By.css('td'));
            const rowData = await Promise.all(
                cells.map(cell => cell.getText())
            );
            tableData.push(rowData);
        }

        // Process data
        console.log('Headers:', headers);
        console.log('Data:', tableData);

        // Convert to CSV format
        const csvData = [
            headers.join(','),
            ...tableData.map(row => row.join(','))
        ].join('\n');

        console.log('CSV Data:', csvData);

    } catch (error) {
        console.error('Error scraping table:', error);
    } finally {
        await driver.quit();
    }
}

scrapeTable();

Best Practices and Tips

1. Wait for Dynamic Content

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait for table to load
wait = WebDriverWait(driver, 10)
table = wait.until(EC.presence_of_element_located((By.ID, 'data-table')))

# Wait for specific number of rows
wait.until(lambda driver: len(driver.find_elements(By.CSS_SELECTOR, 'tbody tr')) >= 10)

2. Handle Different Table Structures

def extract_table_data(table):
    """Handle various table structures"""
    data = []

    # Try tbody first, fallback to direct tr elements
    tbody = table.find_elements(By.TAG_NAME, 'tbody')
    if tbody:
        rows = tbody[0].find_elements(By.TAG_NAME, 'tr')
    else:
        rows = table.find_elements(By.TAG_NAME, 'tr')[1:]  # Skip header

    for row in rows:
        # Handle both td and th cells
        cells = row.find_elements(By.TAG_NAME, 'td')
        if not cells:
            cells = row.find_elements(By.TAG_NAME, 'th')

        row_data = [cell.text.strip() for cell in cells]
        if row_data:  # Skip empty rows
            data.append(row_data)

    return data

3. Error Handling and Retries

import time
from selenium.common.exceptions import TimeoutException, NoSuchElementException

def scrape_with_retry(driver, url, max_retries=3):
    """Scrape table with retry logic"""
    for attempt in range(max_retries):
        try:
            driver.get(url)

            # Wait for table
            wait = WebDriverWait(driver, 10)
            table = wait.until(EC.presence_of_element_located((By.TAG_NAME, 'table')))

            # Extract data
            return extract_table_data(table)

        except (TimeoutException, NoSuchElementException) as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(2)  # Wait before retry
            else:
                raise
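
The retry helper can be combined with the extract_table_data function from the previous tip; a minimal usage sketch with a placeholder URL:

driver = webdriver.Chrome()
try:
    # extract_table_data is defined in tip 2; the URL is a placeholder
    data = scrape_with_retry(driver, 'https://example.com/page-with-table')
    print(f"Extracted {len(data)} rows")
finally:
    driver.quit()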

4. Memory Management for Large Tables

def scrape_large_table_chunked(driver, chunk_size=100):
    """Process large tables in chunks to manage memory"""
    table = driver.find_element(By.TAG_NAME, 'table')
    rows = table.find_elements(By.TAG_NAME, 'tr')

    # Process in chunks
    for i in range(0, len(rows), chunk_size):
        chunk = rows[i:i + chunk_size]
        chunk_data = []

        for row in chunk:
            cells = row.find_elements(By.TAG_NAME, 'td')
            row_data = [cell.text.strip() for cell in cells]
            chunk_data.append(row_data)

        # Process chunk (save to file, database, etc.)
        yield chunk_data
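
Because the function is a generator, each chunk can be written out as soon as it is produced instead of being held in memory, for example by appending to a CSV file (the filename is arbitrary):

import csv

with open('large_table.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for chunk in scrape_large_table_chunked(driver, chunk_size=100):
        writer.writerows(chunk)  # Write each chunk as soon as it is scraped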

Common Challenges and Solutions

Handling JavaScript-Rendered Tables

Some tables are populated by JavaScript after page load:

# Wait for specific content to appear
wait.until(EC.text_to_be_present_in_element((By.ID, 'table-status'), 'Loaded'))

# Or wait for minimum number of rows
wait.until(lambda driver: len(driver.find_elements(By.CSS_SELECTOR, 'tbody tr')) > 0)
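
If the page shows a loading indicator while the table is being populated, you can also wait for it to disappear; a sketch assuming a hypothetical .loading-spinner element:

# Wait for a hypothetical loading spinner to disappear before reading the table
wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, '.loading-spinner')))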

Extracting Links and Attributes

for row in rows:
    cells = row.find_elements(By.TAG_NAME, 'td')
    row_data = []

    for cell in cells:
        # Check for links
        links = cell.find_elements(By.TAG_NAME, 'a')
        if links:
            link_data = {
                'text': links[0].text,
                'href': links[0].get_attribute('href')
            }
            row_data.append(link_data)
        else:
            row_data.append(cell.text.strip())
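
The same pattern works for other attributes, for example pulling an image URL or a data-* attribute from a cell; the attribute names below are illustrative and depend on the page's markup:

# Illustrative only: attribute names depend on the page's markup
images = cell.find_elements(By.TAG_NAME, 'img')
if images:
    row_data.append(images[0].get_attribute('src'))

item_id = cell.get_attribute('data-id')  # Returns None if the attribute is absent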

Performance Optimization

# Use headless mode for faster execution
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Disable image loading for faster page loads (the stylesheets preference below is not honored by all recent Chrome versions)
prefs = {
    'profile.managed_default_content_settings.images': 2,
    'profile.managed_default_content_settings.stylesheets': 2
}
options.add_experimental_option('prefs', prefs)
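
Selenium 4 also lets you relax the page load strategy so driver.get() returns once the DOM is ready instead of waiting for every resource; a short sketch:

# 'eager' returns after DOMContentLoaded; combine with explicit waits for the table
options.page_load_strategy = 'eager'
driver = webdriver.Chrome(options=options)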

Remember to always respect website terms of service, implement appropriate delays between requests, and consider using the website's API if available for better performance and reliability.
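
For the delays mentioned above, a small randomized pause between page loads is usually enough; a minimal sketch:

import random
import time

# Sleep 1-3 seconds between requests to avoid hammering the server
time.sleep(random.uniform(1, 3))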

Try WebScraping.AI for Your Web Scraping Needs

Looking for a powerful web scraping solution? WebScraping.AI provides an LLM-powered API that combines Chromium JavaScript rendering with rotating proxies for reliable data extraction.

Key Features:

  • AI-powered extraction: Ask questions about web pages or extract structured data fields
  • JavaScript rendering: Full Chromium browser support for dynamic content
  • Rotating proxies: Datacenter and residential proxies from multiple countries
  • Easy integration: Simple REST API with SDKs for Python, Ruby, PHP, and more
  • Reliable & scalable: Built for developers who need consistent results

Getting Started:

Get page content with AI analysis:

curl "https://api.webscraping.ai/ai/question?url=https://example.com&question=What is the main topic?&api_key=YOUR_API_KEY"

Extract structured data:

curl "https://api.webscraping.ai/ai/fields?url=https://example.com&fields[title]=Page title&fields[price]=Product price&api_key=YOUR_API_KEY"
