Extracting data from HTML tables is a common web scraping task. Selenium provides the tools to locate, extract, and process table data in any major browser. This guide covers both Python and JavaScript implementations with practical examples.
Sample HTML Table
Let's work with this example table structure:
<table id="employeeTable" class="data-table">
  <thead>
    <tr>
      <th>Name</th>
      <th>Position</th>
      <th>Salary</th>
      <th>Department</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>John Doe</td>
      <td>Software Engineer</td>
      <td>$75,000</td>
      <td>IT</td>
    </tr>
    <tr>
      <td>Jane Smith</td>
      <td>Marketing Manager</td>
      <td>$65,000</td>
      <td>Marketing</td>
    </tr>
    <tr>
      <td>Bob Johnson</td>
      <td>Data Analyst</td>
      <td>$60,000</td>
      <td>IT</td>
    </tr>
  </tbody>
</table>
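For reference, the goal throughout is to turn that markup into a list of row records, one dictionary per tbody row (the basic examples hard-code the keys, the advanced ones derive them from the header cells). As a plain Python literal, the header-keyed form looks like this:
# Target structure: one dict per data row, keyed by the header text
expected_rows = [
    {"Name": "John Doe", "Position": "Software Engineer", "Salary": "$75,000", "Department": "IT"},
    {"Name": "Jane Smith", "Position": "Marketing Manager", "Salary": "$65,000", "Department": "Marketing"},
    {"Name": "Bob Johnson", "Position": "Data Analyst", "Salary": "$60,000", "Department": "IT"},
]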
Python Implementation
Basic Table Scraping
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Setup WebDriver (Chrome recommended for stability)
driver = webdriver.Chrome()
driver.get('https://example.com/employees')

try:
    # Wait for the table to load
    table = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "employeeTable"))
    )

    # Get all data rows (excluding the header)
    rows = table.find_elements(By.XPATH, ".//tbody/tr")

    # Extract data
    employee_data = []
    for row in rows:
        cols = row.find_elements(By.TAG_NAME, "td")
        if cols:  # Skip empty rows
            employee = {
                'name': cols[0].text,
                'position': cols[1].text,
                'salary': cols[2].text,
                'department': cols[3].text
            }
            employee_data.append(employee)

    # Print results
    for employee in employee_data:
        print(f"{employee['name']} - {employee['position']} - {employee['salary']}")
finally:
    driver.quit()
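The scraping logic is unchanged if you prefer to run the browser without a visible window. A minimal headless setup sketch, assuming Selenium 4 and a recent Chrome (older Chrome builds spell the flag differently):
from selenium import webdriver

# Headless Chrome: no visible window, same scraping code afterwards
options = webdriver.ChromeOptions()
options.add_argument("--headless=new")   # older Chrome versions use plain "--headless"
options.add_argument("--window-size=1920,1080")
driver = webdriver.Chrome(options=options)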
Advanced Table Handling with Error Checking
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import csv

def scrape_table_data(url, table_selector, output_file=None):
    """
    Scrape table data with comprehensive error handling.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # Wait for the table to be present
        table = WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, table_selector))
        )

        # Extract headers
        headers = []
        try:
            header_row = table.find_element(By.XPATH, ".//thead/tr")
            header_cells = header_row.find_elements(By.TAG_NAME, "th")
            headers = [cell.text.strip() for cell in header_cells]
        except NoSuchElementException:
            # Fallback: use the first row as headers
            first_row = table.find_element(By.XPATH, ".//tr[1]")
            header_cells = first_row.find_elements(By.TAG_NAME, "td")
            headers = [cell.text.strip() for cell in header_cells]

        # Extract data rows
        data_rows = table.find_elements(By.XPATH, ".//tbody/tr")
        if not data_rows:  # Fallback if there is no tbody
            data_rows = table.find_elements(By.XPATH, ".//tr[position()>1]")

        table_data = []
        for row in data_rows:
            cells = row.find_elements(By.TAG_NAME, "td")
            if cells and len(cells) == len(headers):
                row_data = {}
                for i, cell in enumerate(cells):
                    row_data[headers[i]] = cell.text.strip()
                table_data.append(row_data)

        # Save to CSV if an output file was specified
        if output_file and table_data:
            with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=headers)
                writer.writeheader()
                writer.writerows(table_data)

        return table_data

    except TimeoutException:
        print("Timeout: Table not found within 15 seconds")
        return []
    except Exception as e:
        print(f"Error occurred: {e}")
        return []
    finally:
        driver.quit()

# Usage
data = scrape_table_data('https://example.com/employees', '#employeeTable', 'employees.csv')
print(f"Scraped {len(data)} rows of data")
JavaScript Implementation
Basic Node.js Script
const {Builder, By, until} = require('selenium-webdriver');

async function scrapeTableData() {
    let driver = await new Builder().forBrowser('chrome').build();
    try {
        await driver.get('https://example.com/employees');

        // Wait for the table to load
        await driver.wait(until.elementLocated(By.id('employeeTable')), 10000);
        const table = await driver.findElement(By.id('employeeTable'));
        const rows = await table.findElements(By.xpath('.//tbody/tr'));

        const employeeData = [];
        for (let row of rows) {
            const cols = await row.findElements(By.tagName('td'));
            if (cols.length > 0) {
                const employee = {
                    name: await cols[0].getText(),
                    position: await cols[1].getText(),
                    salary: await cols[2].getText(),
                    department: await cols[3].getText()
                };
                employeeData.push(employee);
            }
        }

        // Display results
        employeeData.forEach(employee => {
            console.log(`${employee.name} - ${employee.position} - ${employee.salary}`);
        });

        return employeeData;
    } finally {
        await driver.quit();
    }
}

// Run the scraper
scrapeTableData().catch(console.error);
Advanced JavaScript Implementation
const {Builder, By, until} = require('selenium-webdriver');
const fs = require('fs');

class TableScraper {
    constructor(browserName = 'chrome') {
        this.driver = null;
        this.browserName = browserName;
    }

    async init() {
        this.driver = await new Builder().forBrowser(this.browserName).build();
    }

    async scrapeTable(url, tableSelector, options = {}) {
        if (!this.driver) await this.init();

        try {
            await this.driver.get(url);

            // Wait for the table, with a configurable timeout
            const timeout = options.timeout || 15000;
            await this.driver.wait(until.elementLocated(By.css(tableSelector)), timeout);
            const table = await this.driver.findElement(By.css(tableSelector));

            // Extract headers
            let headers = [];
            try {
                const headerRow = await table.findElement(By.xpath('.//thead/tr'));
                const headerCells = await headerRow.findElements(By.tagName('th'));
                headers = await Promise.all(headerCells.map(cell => cell.getText()));
            } catch (e) {
                // Fallback: use the first row as headers
                const firstRow = await table.findElement(By.xpath('.//tr[1]'));
                const cells = await firstRow.findElements(By.tagName('td'));
                headers = await Promise.all(cells.map(cell => cell.getText()));
            }

            // Extract data rows. findElements resolves to an empty array rather
            // than throwing, so fall back explicitly when there is no tbody.
            let dataRows = await table.findElements(By.xpath('.//tbody/tr'));
            if (dataRows.length === 0) {
                dataRows = await table.findElements(By.xpath('.//tr[position()>1]'));
            }

            const tableData = [];
            for (let row of dataRows) {
                const cells = await row.findElements(By.tagName('td'));
                if (cells.length === headers.length) {
                    const rowData = {};
                    const cellTexts = await Promise.all(cells.map(cell => cell.getText()));
                    headers.forEach((header, index) => {
                        rowData[header.trim()] = cellTexts[index].trim();
                    });
                    tableData.push(rowData);
                }
            }

            // Save to JSON if an output file was specified
            if (options.outputFile) {
                fs.writeFileSync(options.outputFile, JSON.stringify(tableData, null, 2));
            }

            return tableData;
        } catch (error) {
            console.error('Scraping error:', error.message);
            return [];
        }
    }

    async close() {
        if (this.driver) {
            await this.driver.quit();
        }
    }
}

// Usage
async function main() {
    const scraper = new TableScraper();
    try {
        const data = await scraper.scrapeTable(
            'https://example.com/employees',
            '#employeeTable',
            { outputFile: 'employees.json', timeout: 20000 }
        );
        console.log(`Successfully scraped ${data.length} rows`);
    } finally {
        await scraper.close();
    }
}

main().catch(console.error);
Alternative Locator Strategies
When tables don't have IDs, use these alternative selectors:
# By class name
table = driver.find_element(By.CLASS_NAME, "data-table")
# By CSS selector
table = driver.find_element(By.CSS_SELECTOR, "table.employee-list")
# By XPath (most flexible)
table = driver.find_element(By.XPATH, "//table[contains(@class, 'data')]")
# By position (nth table on page)
table = driver.find_element(By.XPATH, "(//table)[2]") # Second table
Handling Dynamic Tables
For tables that load data via AJAX:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait for a specific number of rows
WebDriverWait(driver, 10).until(
    lambda d: len(d.find_elements(By.XPATH, "//table[@id='myTable']//tbody/tr")) >= 5
)

# Wait for the loading indicator to disappear
WebDriverWait(driver, 10).until_not(
    EC.presence_of_element_located((By.CLASS_NAME, "loading-spinner"))
)
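If you don't know the final row count in advance, another option is a custom wait condition that passes once the row count stops changing between polls. A rough sketch (the helper name and settle count are arbitrary):
def rows_stable(locator, settle_polls=3):
    """Condition that passes once the row count is non-zero and unchanged for a few polls."""
    state = {"last": -1, "hits": 0}

    def _check(driver):
        count = len(driver.find_elements(*locator))
        state["hits"] = state["hits"] + 1 if count > 0 and count == state["last"] else 0
        state["last"] = count
        return state["hits"] >= settle_polls

    return _check

WebDriverWait(driver, 20).until(
    rows_stable((By.XPATH, "//table[@id='myTable']//tbody/tr"))
)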
Best Practices
- Always use explicit waits instead of time.sleep()
- Handle missing elements gracefully with try-catch blocks
- Use specific selectors to avoid selecting wrong tables
- Clean extracted text by stripping whitespace
- Validate data structure before processing
- Implement proper error handling for network issues
- Close browser instances to free resources (see the context-manager sketch after this list)
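A simple way to make the wait-and-cleanup points hard to forget is to wrap driver setup and teardown in a small context manager. A minimal sketch:
from contextlib import contextmanager
from selenium import webdriver

@contextmanager
def chrome_session():
    """Yield a Chrome driver and guarantee it is quit, even if scraping raises."""
    driver = webdriver.Chrome()
    try:
        yield driver
    finally:
        driver.quit()

# Usage: the browser is closed automatically when the block exits
with chrome_session() as driver:
    driver.get('https://example.com/employees')
    # ... locate and read the table as shown above ...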
Ethical Considerations
- Check robots.txt before scraping (e.g. https://example.com/robots.txt); a standard-library sketch follows this list
- Respect rate limits and don't overload servers
- Consider using official APIs when available
- Be mindful of website terms of service
- Implement delays between requests for courtesy
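A lightweight way to honour the robots.txt and delay points in code, using only the standard library (the URLs are the placeholders used throughout this guide):
import time
from urllib.robotparser import RobotFileParser

# Fetch and parse the site's robots.txt before scraping
robots = RobotFileParser('https://example.com/robots.txt')
robots.read()

if robots.can_fetch('*', 'https://example.com/employees'):
    # ... run the scraper for this page ...
    time.sleep(2)  # courtesy delay before the next request
else:
    print('Disallowed by robots.txt - skipping')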