Scraping data from HTML tables using Selenium WebDriver is a common web scraping task that involves locating table elements, iterating through rows and columns, and extracting cell data. This guide covers modern approaches with comprehensive examples.
## Python Implementation

### Setup and Installation

First, install Selenium and set up WebDriver Manager for automatic driver management:

```bash
pip install selenium webdriver-manager pandas
```
### Basic Table Scraping

Here's a complete example using modern Selenium 4 syntax:
```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

# Set up Chrome driver with automatic management
service = Service(ChromeDriverManager().install())
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # Optional: run in background
driver = webdriver.Chrome(service=service, options=options)

try:
    # Navigate to the page
    driver.get('https://example.com/page-with-table')

    # Wait for the table to load (optional but recommended)
    wait = WebDriverWait(driver, 10)
    table = wait.until(EC.presence_of_element_located((By.TAG_NAME, 'table')))

    # Extract headers
    headers = []
    header_elements = table.find_elements(By.TAG_NAME, 'th')
    if header_elements:
        headers = [header.text.strip() for header in header_elements]

    # Extract table data
    rows = table.find_elements(By.TAG_NAME, 'tr')
    table_data = []
    for row in rows:
        cells = row.find_elements(By.TAG_NAME, 'td')
        if cells:  # Skip header rows that contain only th elements
            row_data = [cell.text.strip() for cell in cells]
            table_data.append(row_data)

    # Create a DataFrame for easy data manipulation
    if headers and table_data:
        df = pd.DataFrame(table_data, columns=headers)
    else:
        df = pd.DataFrame(table_data)

    print(df)

    # Save to CSV
    df.to_csv('scraped_table.csv', index=False)

except Exception as e:
    print(f"Error scraping table: {e}")
finally:
    driver.quit()
```
### Advanced Table Scraping with Multiple Selectors

For more complex tables with specific CSS classes or IDs:
```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def scrape_table_by_selector(driver, table_selector):
    """Scrape table data using a specific CSS selector."""
    try:
        wait = WebDriverWait(driver, 10)
        table = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, table_selector)))

        # Extract headers from the thead section, if one exists
        # (find_elements returns an empty list instead of raising)
        headers = []
        thead_elements = table.find_elements(By.TAG_NAME, 'thead')
        if thead_elements:
            header_cells = thead_elements[0].find_elements(By.TAG_NAME, 'th')
            headers = [cell.text.strip() for cell in header_cells]

        # Extract body data from the tbody section, falling back to the
        # table itself when no tbody is present
        tbody_elements = table.find_elements(By.TAG_NAME, 'tbody')
        row_container = tbody_elements[0] if tbody_elements else table
        rows = row_container.find_elements(By.TAG_NAME, 'tr')
        table_data = []
        for row in rows:
            cells = row.find_elements(By.TAG_NAME, 'td')
            row_data = []
            for cell in cells:
                # Handle nested elements like links or spans
                links = cell.find_elements(By.TAG_NAME, 'a')
                if links:
                    row_data.append(links[0].get_attribute('href'))
                else:
                    row_data.append(cell.text.strip())
            table_data.append(row_data)

        return headers, table_data
    except Exception as e:
        print(f"Error scraping table: {e}")
        return [], []


# Usage examples
driver = webdriver.Chrome()
driver.get('https://example.com')

# Scrape table by ID
headers, data = scrape_table_by_selector(driver, '#data-table')

# Scrape table by class
headers, data = scrape_table_by_selector(driver, '.results-table')

# Scrape nested table
headers, data = scrape_table_by_selector(driver, 'div.container table.striped')

driver.quit()
```
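Since pandas is already installed from the setup step, the returned `(headers, data)` pair drops straight into a DataFrame. A minimal sketch (the selector and output filename here are placeholders):

```python
import pandas as pd

headers, data = scrape_table_by_selector(driver, '#data-table')
if data:
    # Fall back to default numeric columns when the table had no thead
    df = pd.DataFrame(data, columns=headers or None)
    df.to_csv('selector_table.csv', index=False)
```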
### Handling Pagination

For tables with pagination:
```python
def scrape_paginated_table(driver, base_url, max_pages=None):
    """Scrape data from paginated tables."""
    all_data = []
    page = 1

    while True:
        try:
            # Navigate to the current page
            if page == 1:
                driver.get(base_url)
            else:
                # Click the next button (or navigate to the page URL directly)
                next_button = driver.find_element(By.CSS_SELECTOR, 'a.next')
                # Note: is_enabled() is often True for <a> elements even on
                # the last page; many sites mark it with a 'disabled' class instead
                if not next_button.is_enabled():
                    break
                next_button.click()

            # Wait for the table to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, 'table'))
            )

            # Extract the current page's data
            table = driver.find_element(By.TAG_NAME, 'table')
            rows = table.find_elements(By.TAG_NAME, 'tr')[1:]  # Skip header row
            for row in rows:
                cells = row.find_elements(By.TAG_NAME, 'td')
                row_data = [cell.text.strip() for cell in cells]
                all_data.append(row_data)

            print(f"Scraped page {page}")
            page += 1

            if max_pages and page > max_pages:
                break
        except Exception as e:
            print(f"Error on page {page}: {e}")
            break

    return all_data
```
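A minimal usage sketch for the helper above (the URL and page limit are placeholder values):

```python
driver = webdriver.Chrome()
try:
    rows = scrape_paginated_table(driver, 'https://example.com/results', max_pages=5)
    print(f"Collected {len(rows)} rows across all pages")
finally:
    driver.quit()
```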
## JavaScript Implementation

### Using Selenium WebDriver for Node.js

```bash
npm install selenium-webdriver
```
```javascript
const { Builder, By, until } = require('selenium-webdriver');
const chrome = require('selenium-webdriver/chrome');

async function scrapeTable() {
  // Set up Chrome options
  const options = new chrome.Options();
  options.addArguments('--headless');

  const driver = await new Builder()
    .forBrowser('chrome')
    .setChromeOptions(options)
    .build();

  try {
    await driver.get('https://example.com/page-with-table');

    // Wait for the table to be present
    const table = await driver.wait(
      until.elementLocated(By.css('table')),
      10000
    );

    // Extract headers
    const headerElements = await table.findElements(By.css('th'));
    const headers = await Promise.all(
      headerElements.map(el => el.getText())
    );

    // Extract table rows
    const rows = await table.findElements(By.css('tbody tr'));
    const tableData = [];

    for (const row of rows) {
      const cells = await row.findElements(By.css('td'));
      const rowData = await Promise.all(
        cells.map(cell => cell.getText())
      );
      tableData.push(rowData);
    }

    // Process data
    console.log('Headers:', headers);
    console.log('Data:', tableData);

    // Convert to CSV format (note: this simple join does not escape
    // commas or quotes inside cell text)
    const csvData = [
      headers.join(','),
      ...tableData.map(row => row.join(','))
    ].join('\n');

    console.log('CSV Data:', csvData);
  } catch (error) {
    console.error('Error scraping table:', error);
  } finally {
    await driver.quit();
  }
}

scrapeTable();
```
## Best Practices and Tips

### 1. Wait for Dynamic Content
```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait for the table to load
wait = WebDriverWait(driver, 10)
table = wait.until(EC.presence_of_element_located((By.ID, 'data-table')))

# Wait for a specific number of rows to appear
wait.until(lambda driver: len(driver.find_elements(By.CSS_SELECTOR, 'tbody tr')) >= 10)
```
### 2. Handle Different Table Structures
```python
def extract_table_data(table):
    """Handle various table structures."""
    data = []

    # Try tbody first, fall back to direct tr elements
    tbody = table.find_elements(By.TAG_NAME, 'tbody')
    if tbody:
        rows = tbody[0].find_elements(By.TAG_NAME, 'tr')
    else:
        rows = table.find_elements(By.TAG_NAME, 'tr')[1:]  # Skip header row

    for row in rows:
        # Handle both td and th cells
        cells = row.find_elements(By.TAG_NAME, 'td')
        if not cells:
            cells = row.find_elements(By.TAG_NAME, 'th')

        row_data = [cell.text.strip() for cell in cells]
        if row_data:  # Skip empty rows
            data.append(row_data)

    return data
```
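A quick usage sketch, assuming the driver is already on a page containing the target table:

```python
table = driver.find_element(By.TAG_NAME, 'table')
data = extract_table_data(table)
print(f"Extracted {len(data)} rows")
```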
### 3. Error Handling and Retries
```python
import time
from selenium.common.exceptions import TimeoutException, NoSuchElementException


def scrape_with_retry(driver, url, max_retries=3):
    """Scrape a table with retry logic."""
    for attempt in range(max_retries):
        try:
            driver.get(url)

            # Wait for the table
            wait = WebDriverWait(driver, 10)
            table = wait.until(EC.presence_of_element_located((By.TAG_NAME, 'table')))

            # Extract data
            return extract_table_data(table)
        except (TimeoutException, NoSuchElementException) as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(2)  # Wait before retrying
            else:
                raise
```
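Here's roughly how it might be wired up with the `extract_table_data` helper from the previous tip (the URL is a placeholder):

```python
driver = webdriver.Chrome()
try:
    data = scrape_with_retry(driver, 'https://example.com/page-with-table')
    print(f"Scraped {len(data)} rows")
except (TimeoutException, NoSuchElementException):
    print("Table never appeared after all retries")
finally:
    driver.quit()
```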
### 4. Memory Management for Large Tables
```python
def scrape_large_table_chunked(driver, chunk_size=100):
    """Process large tables in chunks to manage memory."""
    table = driver.find_element(By.TAG_NAME, 'table')
    # Note: this still fetches references to every row up front; chunking
    # limits how much extracted text is held in memory at one time
    rows = table.find_elements(By.TAG_NAME, 'tr')

    # Process in chunks
    for i in range(0, len(rows), chunk_size):
        chunk = rows[i:i + chunk_size]
        chunk_data = []
        for row in chunk:
            cells = row.find_elements(By.TAG_NAME, 'td')
            row_data = [cell.text.strip() for cell in cells]
            chunk_data.append(row_data)
        # Process the chunk (save to file, database, etc.)
        yield chunk_data
```
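Because the function is a generator, each chunk can be written to disk before the next one is extracted, so the full table never sits in memory at once. A sketch using Python's built-in csv module (the filename is a placeholder):

```python
import csv

with open('large_table.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for chunk in scrape_large_table_chunked(driver, chunk_size=100):
        writer.writerows(chunk)
```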
## Common Challenges and Solutions

### Handling JavaScript-Rendered Tables

Some tables are populated by JavaScript after the page loads:
```python
# Wait for specific content to appear
wait.until(EC.text_to_be_present_in_element((By.ID, 'table-status'), 'Loaded'))

# Or wait for a minimum number of rows
wait.until(lambda driver: len(driver.find_elements(By.CSS_SELECTOR, 'tbody tr')) > 0)
```
### Extracting Links and Attributes
```python
for row in rows:
    cells = row.find_elements(By.TAG_NAME, 'td')
    row_data = []
    for cell in cells:
        # Check for links
        links = cell.find_elements(By.TAG_NAME, 'a')
        if links:
            link_data = {
                'text': links[0].text,
                'href': links[0].get_attribute('href')
            }
            row_data.append(link_data)
        else:
            row_data.append(cell.text.strip())
```
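The same `get_attribute` pattern extends to any other attribute. A short sketch pulling image URLs and a custom data attribute (the `data-id` attribute is hypothetical, for illustration only):

```python
for cell in cells:
    # Image URLs from cells containing <img> tags
    images = cell.find_elements(By.TAG_NAME, 'img')
    if images:
        row_data.append(images[0].get_attribute('src'))

    # Custom attributes, e.g. <td data-id="42"> (hypothetical markup)
    cell_id = cell.get_attribute('data-id')
    if cell_id:
        row_data.append(cell_id)
```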
### Performance Optimization
```python
# Use headless mode for faster execution
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Disable images and CSS for faster loading
# (recent Chrome versions may ignore the stylesheet preference)
prefs = {
    'profile.managed_default_content_settings.images': 2,
    'profile.managed_default_content_settings.stylesheets': 2
}
options.add_experimental_option('prefs', prefs)
```
Remember to always respect website terms of service, implement appropriate delays between requests, and consider using the website's API if available for better performance and reliability.
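For the delay part, a small sketch of randomized pauses between requests (the interval values are arbitrary examples):

```python
import random
import time

def polite_pause(min_s=1.0, max_s=3.0):
    """Sleep for a random interval so requests don't hammer the server."""
    time.sleep(random.uniform(min_s, max_s))

for url in urls_to_scrape:  # urls_to_scrape: placeholder list of page URLs
    driver.get(url)
    # ... extract table data here ...
    polite_pause()
```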