Extracting tables from HTML pages is a common web scraping task. The lxml library parses HTML quickly and lets you locate tables with XPath selectors, making it well suited to the job. This guide walks through the whole process, from parsing markup to exporting clean tabular data.
Installation
First, install the required libraries:
pip install lxml requests
Basic Table Extraction
Parsing HTML Content
Start by parsing your HTML content with lxml.html:
from lxml import html
import requests
# Method 1: Parse HTML string
html_content = """
<html>
  <body>
    <table id="products">
      <thead>
        <tr>
          <th>Product</th>
          <th>Price</th>
          <th>Stock</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>Laptop</td>
          <td>$999</td>
          <td>15</td>
        </tr>
        <tr>
          <td>Mouse</td>
          <td>$25</td>
          <td>50</td>
        </tr>
      </tbody>
    </table>
  </body>
</html>
"""
tree = html.fromstring(html_content)
Fetching from URL
# Method 2: Fetch from URL
url = 'https://example.com/page-with-tables.html'
response = requests.get(url)
if response.status_code == 200:
    tree = html.fromstring(response.content)
else:
    raise Exception(f"Failed to retrieve webpage: {response.status_code}")
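Real pages are often less accommodating than this happy path: some servers reject requests without a User-Agent header, and a hung connection will block forever without a timeout. A minimal hardened variant (the header string and ten-second timeout are illustrative choices, not requirements):

request_headers = {'User-Agent': 'Mozilla/5.0 (compatible; table-scraper/1.0)'}  # illustrative value
response = requests.get(url, headers=request_headers, timeout=10)  # seconds
response.raise_for_status()  # raises requests.HTTPError for 4xx/5xx responses
tree = html.fromstring(response.content)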
Extracting Table Data
Simple Table Extraction
# Find all tables
tables = tree.xpath('//table')
for i, table in enumerate(tables):
    print(f"Table {i + 1}:")
    # Extract all rows
    rows = table.xpath('.//tr')
    for row_index, row in enumerate(rows):
        # Get all cell text (both th and td)
        cells = row.xpath('.//th/text() | .//td/text()')
        print(f"Row {row_index + 1}: {cells}")
    print()
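Run against the sample document from the previous section, this prints one list of cell strings per row:

Table 1:
Row 1: ['Product', 'Price', 'Stock']
Row 2: ['Laptop', '$999', '15']
Row 3: ['Mouse', '$25', '50']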
Structured Table Extraction
def extract_table_data(table):
    """Extract table data into a structured format"""
    # Prefer an explicit <thead> row for headers; otherwise use the first row
    header_rows = table.xpath('.//thead/tr')
    if header_rows:
        header_row = header_rows[0]
        data_rows = table.xpath('.//tr[not(ancestor::thead)]')
    else:
        rows = table.xpath('.//tr')
        if not rows:
            return [], []
        header_row = rows[0]
        data_rows = rows[1:]
    # text_content() also picks up text inside nested elements
    headers = [cell.text_content().strip() for cell in header_row.xpath('.//th | .//td')]
    data = []
    for row in data_rows:
        cells = [cell.text_content().strip() for cell in row.xpath('.//td | .//th')]
        if cells:  # Skip empty rows
            data.append(dict(zip(headers, cells)))
    return headers, data
# Usage
tables = tree.xpath('//table')
for i, table in enumerate(tables):
    headers, data = extract_table_data(table)
    print(f"Table {i + 1} Headers: {headers}")
    for row in data:
        print(row)
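Applied to the sample table, this produces one dictionary per data row, keyed by the header text:

Table 1 Headers: ['Product', 'Price', 'Stock']
{'Product': 'Laptop', 'Price': '$999', 'Stock': '15'}
{'Product': 'Mouse', 'Price': '$25', 'Stock': '50'}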
Advanced Table Extraction
Handling Complex Tables with Attributes
def extract_table_with_attributes(table):
"""Extract table data including cell attributes"""
data = []
rows = table.xpath('.//tr')
for row in rows:
row_data = []
cells = row.xpath('.//td | .//th')
for cell in cells:
cell_info = {
'text': cell.text_content().strip(),
'colspan': cell.get('colspan', '1'),
'rowspan': cell.get('rowspan', '1'),
'class': cell.get('class', ''),
}
row_data.append(cell_info)
data.append(row_data)
return data
# Usage for complex tables
complex_tables = tree.xpath('//table')
for table in complex_tables:
    table_data = extract_table_with_attributes(table)
    for row in table_data:
        print(row)
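Note that this only reports colspan and rowspan; it does not resolve them into a rectangular grid. If you need every row to line up column-for-column, spanning cells have to be repeated. A minimal sketch that expands colspan only (rowspan resolution needs a carry-over buffer between rows and is left out here):

def expand_colspans(table):
    """Return rows as flat lists, repeating cells that span columns"""
    grid = []
    for row in table.xpath('.//tr'):
        expanded = []
        for cell in row.xpath('.//td | .//th'):
            text = cell.text_content().strip()
            try:
                span = int(cell.get('colspan', '1'))
            except ValueError:
                span = 1  # tolerate malformed colspan attributes
            expanded.extend([text] * span)
        grid.append(expanded)
    return grid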
Targeting Specific Tables
# Select table by ID
specific_table = tree.xpath('//table[@id="products"]')[0]
# Select table by class
tables_by_class = tree.xpath('//table[@class="data-table"]')
# Select table containing specific text
tables_with_text = tree.xpath('//table[.//th[contains(text(), "Price")]]')
# Select the nth table in the document; note the parentheses:
# //table[2] alone would match any table that is the second table
# child of its own parent, not the document's second table
second_table = tree.xpath('(//table)[2]')
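When a table exposes no usable id or class, anchoring on nearby page structure is a common fallback. A sketch, assuming the page places a heading just before the table (the heading text "Inventory" is hypothetical):

# Select the first table that follows a particular heading
table_after_heading = tree.xpath('//h2[contains(text(), "Inventory")]/following::table[1]')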
Converting to Popular Data Formats
Convert to Pandas DataFrame
import pandas as pd
def table_to_dataframe(table):
    """Convert lxml table to pandas DataFrame"""
    headers, data = extract_table_data(table)
    # Passing columns keeps the header order and works even for empty tables
    return pd.DataFrame(data, columns=headers)
# Usage
tables = tree.xpath('//table')
if tables:
    df = table_to_dataframe(tables[0])
    print(df)
    # Save to CSV
    df.to_csv('extracted_table.csv', index=False)
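For simple pages it is also worth knowing that pandas ships its own table parser, pandas.read_html, which can use lxml under the hood and returns one DataFrame per table found:

from io import StringIO

# read_html parses every <table> in the markup into a DataFrame
dfs = pd.read_html(StringIO(html_content))  # StringIO wraps the literal string
print(dfs[0])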
Convert to CSV
import csv
def table_to_csv(table, filename):
    """Convert table to CSV file"""
    headers, data = extract_table_data(table)
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=headers)
        writer.writeheader()
        writer.writerows(data)
# Usage
tables = tree.xpath('//table')
if tables:
    table_to_csv(tables[0], 'table_data.csv')
Handling Edge Cases
Tables with Missing or Irregular Structure
def robust_table_extraction(table):
"""Handle tables with irregular structure"""
all_data = []
rows = table.xpath('.//tr')
max_cols = 0
# First pass: determine maximum number of columns
for row in rows:
cells = row.xpath('.//td | .//th')
max_cols = max(max_cols, len(cells))
# Second pass: extract data, padding short rows
for row in rows:
cells = row.xpath('.//td | .//th')
row_data = [cell.text_content().strip() for cell in cells]
# Pad short rows with empty strings
while len(row_data) < max_cols:
row_data.append('')
all_data.append(row_data)
return all_data
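Because every row comes back padded to the same width, the result loads straight into a DataFrame even when the source rows were ragged. A usage sketch (treating the first padded row as the header is an assumption that fits simple tables):

# Usage
grid = robust_table_extraction(tables[0])
if grid:
    df = pd.DataFrame(grid[1:], columns=grid[0])
    print(df)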
Handling Nested Elements
def extract_with_nested_elements(table):
"""Extract text from cells containing nested elements"""
data = []
rows = table.xpath('.//tr')
for row in rows:
cells = row.xpath('.//td | .//th')
row_data = []
for cell in cells:
# Get all text content, including from nested elements
text_content = cell.text_content().strip()
# Or get specific nested elements
links = cell.xpath('.//a/@href') # Extract links
images = cell.xpath('.//img/@src') # Extract image sources
cell_data = {
'text': text_content,
'links': links,
'images': images
}
row_data.append(cell_data)
data.append(row_data)
return data
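A usage sketch that pulls out every hyperlink found in a table's cells, applied here to the first table on the page:

# Usage
for row in extract_with_nested_elements(tables[0]):
    for cell in row:
        if cell['links']:
            print(f"{cell['text']} -> {cell['links']}")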
Complete Example
Here's a complete example that combines all the concepts:
from lxml import html
import requests
import pandas as pd
def scrape_tables_from_url(url):
"""Complete table scraping function"""
try:
response = requests.get(url)
response.raise_for_status()
tree = html.fromstring(response.content)
tables = tree.xpath('//table')
extracted_tables = []
for i, table in enumerate(tables):
try:
headers, data = extract_table_data(table)
df = pd.DataFrame(data)
table_info = {
'table_index': i,
'headers': headers,
'data': data,
'dataframe': df,
'shape': df.shape
}
extracted_tables.append(table_info)
except Exception as e:
print(f"Error processing table {i}: {e}")
return extracted_tables
except requests.RequestException as e:
print(f"Error fetching URL: {e}")
return []
# Usage
url = "https://example.com/page-with-tables"
tables = scrape_tables_from_url(url)
for table_info in tables:
print(f"Table {table_info['table_index']} shape: {table_info['shape']}")
print(table_info['dataframe'].head())
print()
Best Practices
- Error Handling: Always wrap table extraction in try-except blocks
- Data Validation: Check for empty cells and handle missing data appropriately
- Performance: For large tables, consider processing data in chunks
- Memory Management: Use iterparse for very large HTML documents (a sketch follows this list)
- Robustness: Handle variations in table structure gracefully
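For the memory point above, a minimal iterparse sketch, assuming a large HTML file on disk (large_page.html is a placeholder name). lxml.etree.iterparse with html=True streams the document and fires an event as each element closes, so each table can be processed and discarded without building the full tree:

from lxml import etree

# Stream <table> elements one at a time instead of parsing the whole document
for event, table in etree.iterparse('large_page.html', events=('end',), tag='table', html=True):
    rows = [
        [''.join(cell.itertext()).strip() for cell in row.xpath('.//td | .//th')]
        for row in table.xpath('.//tr')
    ]
    print(f"Parsed a table with {len(rows)} rows")
    table.clear()  # free the processed subtree to keep memory flat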
Legal and Ethical Considerations
Always ensure you have permission to scrape the target website. Check the site's robots.txt file and terms of service, rate-limit your requests to avoid overwhelming the server, and respect the website's scraping policies.
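A minimal way to rate-limit is a fixed pause between requests; the one-second delay and the URL list below are illustrative:

import time

for page_url in ['https://example.com/page1', 'https://example.com/page2']:
    tables = scrape_tables_from_url(page_url)
    time.sleep(1)  # pause between requests so the server is not hammered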