Extracting tables from HTML pages is a common web scraping task. The lxml library parses HTML quickly and lets you locate tables with XPath selectors, making it well suited to the job. This guide walks through the whole process, from parsing markup to exporting clean tabular data.
Installation
First, install the required libraries:
pip install lxml requests
Basic Table Extraction
Parsing HTML Content
Start by parsing your HTML content with lxml.html:
from lxml import html
import requests
# Method 1: Parse HTML string
html_content = """
<html>
  <body>
    <table id="products">
      <thead>
        <tr>
          <th>Product</th>
          <th>Price</th>
          <th>Stock</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>Laptop</td>
          <td>$999</td>
          <td>15</td>
        </tr>
        <tr>
          <td>Mouse</td>
          <td>$25</td>
          <td>50</td>
        </tr>
      </tbody>
    </table>
  </body>
</html>
"""
tree = html.fromstring(html_content)
Fetching from URL
# Method 2: Fetch from URL
url = 'https://example.com/page-with-tables.html'
response = requests.get(url)
if response.status_code == 200:
    tree = html.fromstring(response.content)
else:
    raise Exception(f"Failed to retrieve webpage: {response.status_code}")
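Real pages are often less accommodating than this happy path: some servers reject requests without a User-Agent header, and a hung connection will block forever without a timeout. A minimal hardened variant (the header string and ten-second timeout are illustrative choices, not requirements):

request_headers = {'User-Agent': 'Mozilla/5.0 (compatible; table-scraper/1.0)'}  # illustrative value
response = requests.get(url, headers=request_headers, timeout=10)  # seconds
response.raise_for_status()  # raises requests.HTTPError for 4xx/5xx responses
tree = html.fromstring(response.content)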
Extracting Table Data
Simple Table Extraction
# Find all tables
tables = tree.xpath('//table')
for i, table in enumerate(tables):
    print(f"Table {i + 1}:")
    # Extract all rows
    rows = table.xpath('.//tr')
    for row_index, row in enumerate(rows):
        # Get all cell text (both th and td)
        cells = row.xpath('.//th/text() | .//td/text()')
        print(f"Row {row_index + 1}: {cells}")
    print()
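Run against the sample document from the previous section, this prints one list of cell strings per row:

Table 1:
Row 1: ['Product', 'Price', 'Stock']
Row 2: ['Laptop', '$999', '15']
Row 3: ['Mouse', '$25', '50']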
Structured Table Extraction
def extract_table_data(table):
    """Extract table data into a structured format"""
    # Prefer an explicit <thead> row for headers; otherwise use the first row
    header_rows = table.xpath('.//thead/tr')
    if header_rows:
        header_row = header_rows[0]
        data_rows = table.xpath('.//tr[not(ancestor::thead)]')
    else:
        rows = table.xpath('.//tr')
        if not rows:
            return [], []
        header_row = rows[0]
        data_rows = rows[1:]
    # text_content() also picks up text inside nested elements
    headers = [cell.text_content().strip() for cell in header_row.xpath('.//th | .//td')]
    data = []
    for row in data_rows:
        cells = [cell.text_content().strip() for cell in row.xpath('.//td | .//th')]
        if cells:  # Skip empty rows
            data.append(dict(zip(headers, cells)))
    return headers, data
# Usage
tables = tree.xpath('//table')
for i, table in enumerate(tables):
    headers, data = extract_table_data(table)
    print(f"Table {i + 1} Headers: {headers}")
    for row in data:
        print(row)
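Applied to the sample table, this produces one dictionary per data row, keyed by the header text:

Table 1 Headers: ['Product', 'Price', 'Stock']
{'Product': 'Laptop', 'Price': '$999', 'Stock': '15'}
{'Product': 'Mouse', 'Price': '$25', 'Stock': '50'}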
Advanced Table Extraction
Handling Complex Tables with Attributes
def extract_table_with_attributes(table):
"""Extract table data including cell attributes"""
data = []
rows = table.xpath('.//tr')
for row in rows:
row_data = []
cells = row.xpath('.//td | .//th')
for cell in cells:
cell_info = {
'text': cell.text_content().strip(),
'colspan': cell.get('colspan', '1'),
'rowspan': cell.get('rowspan', '1'),
'class': cell.get('class', ''),
}
row_data.append(cell_info)
data.append(row_data)
return data
# Usage for complex tables
complex_tables = tree.xpath('//table')
for table in complex_tables:
    table_data = extract_table_with_attributes(table)
    for row in table_data:
        print(row)
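Note that this only reports colspan and rowspan; it does not resolve them into a rectangular grid. If you need every row to line up column-for-column, spanning cells have to be repeated. A minimal sketch that expands colspan only (rowspan resolution needs a carry-over buffer between rows and is left out here):

def expand_colspans(table):
    """Return rows as flat lists, repeating cells that span columns"""
    grid = []
    for row in table.xpath('.//tr'):
        expanded = []
        for cell in row.xpath('.//td | .//th'):
            text = cell.text_content().strip()
            try:
                span = int(cell.get('colspan', '1'))
            except ValueError:
                span = 1  # tolerate malformed colspan attributes
            expanded.extend([text] * span)
        grid.append(expanded)
    return grid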
Targeting Specific Tables
# Select table by ID
specific_table = tree.xpath('//table[@id="products"]')[0]
# Select table by class
tables_by_class = tree.xpath('//table[@class="data-table"]')
# Select table containing specific text
tables_with_text = tree.xpath('//table[.//th[contains(text(), "Price")]]')
# Select the nth table in the document; note the parentheses:
# //table[2] alone would match any table that is the second table
# child of its own parent, not the document's second table
second_table = tree.xpath('(//table)[2]')
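When a table exposes no usable id or class, anchoring on nearby page structure is a common fallback. A sketch, assuming the page places a heading just before the table (the heading text "Inventory" is hypothetical):

# Select the first table that follows a particular heading
table_after_heading = tree.xpath('//h2[contains(text(), "Inventory")]/following::table[1]')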
Converting to Popular Data Formats
Convert to Pandas DataFrame
import pandas as pd
def table_to_dataframe(table):
    """Convert lxml table to pandas DataFrame"""
    headers, data = extract_table_data(table)
    # Passing columns keeps the header order and works even for empty tables
    return pd.DataFrame(data, columns=headers)
# Usage
tables = tree.xpath('//table')
if tables:
    df = table_to_dataframe(tables[0])
    print(df)
    # Save to CSV
    df.to_csv('extracted_table.csv', index=False)
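For simple pages it is also worth knowing that pandas ships its own table parser, pandas.read_html, which can use lxml under the hood and returns one DataFrame per table found:

from io import StringIO

# read_html parses every <table> in the markup into a DataFrame
dfs = pd.read_html(StringIO(html_content))  # StringIO wraps the literal string
print(dfs[0])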
Convert to CSV
import csv
def table_to_csv(table, filename):
    """Convert table to CSV file"""
    headers, data = extract_table_data(table)
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=headers)
        writer.writeheader()
        writer.writerows(data)
# Usage
tables = tree.xpath('//table')
if tables:
    table_to_csv(tables[0], 'table_data.csv')
Handling Edge Cases
Tables with Missing or Irregular Structure
def robust_table_extraction(table):
"""Handle tables with irregular structure"""
all_data = []
rows = table.xpath('.//tr')
max_cols = 0
# First pass: determine maximum number of columns
for row in rows:
cells = row.xpath('.//td | .//th')
max_cols = max(max_cols, len(cells))
# Second pass: extract data, padding short rows
for row in rows:
cells = row.xpath('.//td | .//th')
row_data = [cell.text_content().strip() for cell in cells]
# Pad short rows with empty strings
while len(row_data) < max_cols:
row_data.append('')
all_data.append(row_data)
return all_data
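Because every row comes back padded to the same width, the result loads straight into a DataFrame even when the source rows were ragged. A usage sketch (treating the first padded row as the header is an assumption that fits simple tables):

# Usage
grid = robust_table_extraction(tables[0])
if grid:
    df = pd.DataFrame(grid[1:], columns=grid[0])
    print(df)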
Handling Nested Elements
def extract_with_nested_elements(table):
"""Extract text from cells containing nested elements"""
data = []
rows = table.xpath('.//tr')
for row in rows:
cells = row.xpath('.//td | .//th')
row_data = []
for cell in cells:
# Get all text content, including from nested elements
text_content = cell.text_content().strip()
# Or get specific nested elements
links = cell.xpath('.//a/@href') # Extract links
images = cell.xpath('.//img/@src') # Extract image sources
cell_data = {
'text': text_content,
'links': links,
'images': images
}
row_data.append(cell_data)
data.append(row_data)
return data
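A usage sketch that pulls out every hyperlink found in a table's cells, applied here to the first table on the page:

# Usage
for row in extract_with_nested_elements(tables[0]):
    for cell in row:
        if cell['links']:
            print(f"{cell['text']} -> {cell['links']}")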
Complete Example
Here's a complete example that combines all the concepts:
from lxml import html
import requests
import pandas as pd
def scrape_tables_from_url(url):
"""Complete table scraping function"""
try:
response = requests.get(url)
response.raise_for_status()
tree = html.fromstring(response.content)
tables = tree.xpath('//table')
extracted_tables = []
for i, table in enumerate(tables):
try:
headers, data = extract_table_data(table)
df = pd.DataFrame(data)
table_info = {
'table_index': i,
'headers': headers,
'data': data,
'dataframe': df,
'shape': df.shape
}
extracted_tables.append(table_info)
except Exception as e:
print(f"Error processing table {i}: {e}")
return extracted_tables
except requests.RequestException as e:
print(f"Error fetching URL: {e}")
return []
# Usage
url = "https://example.com/page-with-tables"
tables = scrape_tables_from_url(url)
for table_info in tables:
print(f"Table {table_info['table_index']} shape: {table_info['shape']}")
print(table_info['dataframe'].head())
print()
Best Practices
- Error Handling: Always wrap table extraction in try-except blocks
- Data Validation: Check for empty cells and handle missing data appropriately
- Performance: For large tables, consider processing data in chunks
- Memory Management: Use iterparse for very large HTML documents (a sketch follows this list)
- Robustness: Handle variations in table structure gracefully
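For the memory point above, a minimal iterparse sketch, assuming a large HTML file on disk (large_page.html is a placeholder name). lxml.etree.iterparse with html=True streams the document and fires an event as each element closes, so each table can be processed and discarded without building the full tree:

from lxml import etree

# Stream <table> elements one at a time instead of parsing the whole document
for event, table in etree.iterparse('large_page.html', events=('end',), tag='table', html=True):
    rows = [
        [''.join(cell.itertext()).strip() for cell in row.xpath('.//td | .//th')]
        for row in table.xpath('.//tr')
    ]
    print(f"Parsed a table with {len(rows)} rows")
    table.clear()  # free the processed subtree to keep memory flat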
Legal and Ethical Considerations
Always ensure you have permission to scrape the target website. Check the site's robots.txt file and terms of service, rate-limit your requests to avoid overwhelming the server, and respect the website's scraping policies.
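A minimal way to rate-limit is a fixed pause between requests; the one-second delay and the URL list below are illustrative:

import time

for page_url in ['https://example.com/page1', 'https://example.com/page2']:
    tables = scrape_tables_from_url(page_url)
    time.sleep(1)  # pause between requests so the server is not hammered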