How do I extract data from a table using jsoup?

Jsoup is a powerful Java library for parsing and manipulating HTML documents. Extracting data from HTML tables is one of the most common web scraping tasks. This guide covers everything you need to know about table extraction with Jsoup.

Setup and Installation

Add Jsoup Dependency

Maven (pom.xml):

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.17.2</version>
</dependency>

Gradle (build.gradle):

implementation 'org.jsoup:jsoup:1.17.2'

Required Imports

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

Basic Table Extraction

Simple Table Parsing

public class TableExtractor {
    /**
     * Parses a small inline HTML table and prints every row to standard output.
     *
     * <p>Each line has the form {@code Row N: } (N is the zero-based row index)
     * followed by the text of each header or data cell, each followed by {@code " | "}.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String html = """
            <table id="data-table">
                <thead>
                    <tr>
                        <th>Name</th>
                        <th>Age</th>
                        <th>City</th>
                    </tr>
                </thead>
                <tbody>
                    <tr>
                        <td>John Doe</td>
                        <td>30</td>
                        <td>New York</td>
                    </tr>
                    <tr>
                        <td>Jane Smith</td>
                        <td>25</td>
                        <td>London</td>
                    </tr>
                </tbody>
            </table>
            """;

        // Parse the markup and locate the table by its id attribute
        Element table = Jsoup.parse(html).getElementById("data-table");

        // Walk every <tr>, header and body rows alike, building one output line per row
        int rowNumber = 0;
        for (Element tr : table.select("tr")) {
            StringBuilder line = new StringBuilder(String.format("Row %d: ", rowNumber));
            for (Element cell : tr.select("th, td")) {
                line.append(cell.text()).append(" | ");
            }
            System.out.println(line);
            rowNumber++;
        }
    }
}

Advanced Table Selection

Multiple Selection Methods

public class AdvancedTableSelection {
    /**
     * Demonstrates six different ways of locating table elements in a parsed document.
     * The selected elements are only assigned to locals; nothing is printed or returned.
     *
     * @param doc the parsed document to search
     */
    public static void extractTables(Document doc) {
        // 1. Look the table up by its id attribute
        Element byId = doc.getElementById("specific-table");

        // 2. Every table carrying the "data-table" CSS class
        Elements byClass = doc.select("table.data-table");

        // 3. Every table whose role attribute equals "grid"
        Elements byRoleAttribute = doc.select("table[role=grid]");

        // 4. The first table in the document (null when there is none)
        Element first = doc.select("table").first();

        // 5. The table at a given position; index 2 is the third table,
        //    and the size check avoids an out-of-range access
        Elements everyTable = doc.select("table");
        Element third = everyTable.size() > 2 ? everyTable.get(2) : null;

        // 6. Tables whose text contains the phrase "Total Sales"
        Elements mentioningTotalSales = doc.select("table:contains(Total Sales)");
    }
}

Structured Data Extraction

Creating Data Models

/**
 * Simple data holder for one extracted table row: a person's name, age and city.
 *
 * <p>All fields are assigned once in the constructor and never modified afterwards.
 */
public class Person {
    private final String name;
    private final int age;
    private final String city;

    /**
     * Creates a person from the three column values.
     *
     * @param name the person's name
     * @param age  the person's age
     * @param city the person's city
     */
    public Person(String name, int age, String city) {
        this.name = name;
        this.age = age;
        this.city = city;
    }

    /** @return the person's name */
    public String getName() { return name; }

    /** @return the person's age */
    public int getAge() { return age; }

    /** @return the person's city */
    public String getCity() { return city; }

    /**
     * @return a string of the form {@code Person{name='...', age=..., city='...'}}
     */
    @Override
    public String toString() {
        return String.format("Person{name='%s', age=%d, city='%s'}", name, age, city);
    }
}

public class StructuredTableExtractor {
    /**
     * Fetches the page at {@code url} and converts the body rows of the first
     * {@code table.people-data} table into {@link Person} objects.
     *
     * <p>Each row is read positionally: the first three {@code td} cells are taken as
     * name, age and city. Rows that cannot be converted are skipped rather than
     * aborting the whole extraction: rows with fewer than three cells, and rows whose
     * age cell is not a valid integer (for example blank or "N/A").
     *
     * @param url address of the page containing the table
     * @return the persons parsed from the table, in row order; empty if no row qualifies
     * @throws IOException if the page cannot be fetched
     * @throws IllegalArgumentException if the page has no {@code table.people-data} element
     */
    public static List<Person> extractPersons(String url) throws IOException {
        Document doc = Jsoup.connect(url).get();
        Element table = doc.select("table.people-data").first();

        if (table == null) {
            throw new IllegalArgumentException("Table not found");
        }

        List<Person> persons = new ArrayList<>();
        Elements dataRows = table.select("tbody tr");

        for (Element row : dataRows) {
            Elements cells = row.select("td");

            // Structurally incomplete row: not enough columns to build a Person
            if (cells.size() < 3) {
                continue;
            }

            int age;
            try {
                age = Integer.parseInt(cells.get(1).text());
            } catch (NumberFormatException ignored) {
                // Deliberately skipped: one malformed age cell should not discard
                // the rows that were already parsed successfully.
                continue;
            }

            persons.add(new Person(cells.get(0).text(), age, cells.get(2).text()));
        }

        return persons;
    }
}

Handling Complex Tables

Tables with Headers and Footers

public class ComplexTableHandler {
    /**
     * Prints the body rows of the first table in {@code doc}, labelling every cell
     * with its column header.
     *
     * <p>Header names come from the cells of the first row inside {@code thead}.
     * A cell with no corresponding header is labelled {@code "Column i"}, where
     * {@code i} is the cell's zero-based index. If the document contains no table,
     * a message is printed and the method returns without further output.
     *
     * @param doc the parsed document to read the table from
     */
    public static void extractWithHeaders(Document doc) {
        Element table = doc.select("table").first();

        // first() yields null when nothing matched; bail out instead of failing below
        if (table == null) {
            System.out.println("No tables found on the page");
            return;
        }

        // Extract headers separately
        Elements headerRows = table.select("thead tr");
        List<String> headers = new ArrayList<>();

        if (!headerRows.isEmpty()) {
            Elements headerCells = headerRows.first().select("th, td");
            for (Element header : headerCells) {
                headers.add(header.text());
            }
        }

        // Extract data rows only
        Elements dataRows = table.select("tbody tr");

        for (Element row : dataRows) {
            Elements cells = row.select("td");

            for (int i = 0; i < cells.size(); i++) {
                // Fall back to a positional label when the row is wider than the header
                String columnName = i < headers.size() ? headers.get(i) : "Column " + i;
                String cellValue = cells.get(i).text();
                System.out.printf("%s: %s | ", columnName, cellValue);
            }
            System.out.println();
        }
    }
}

Extracting Cell Attributes and Links

public class AttributeExtraction {
    /**
     * Walks every cell of the first table in {@code doc} and prints the links and
     * images found inside it, followed by a summary line with the cell's text,
     * class and colspan.
     *
     * <p>If the document contains no table, a message is printed and the method
     * returns without further output.
     *
     * @param doc the parsed document to read the table from
     */
    public static void extractCellDetails(Document doc) {
        Element table = doc.select("table").first();

        // first() yields null when nothing matched; bail out instead of failing below
        if (table == null) {
            System.out.println("No tables found on the page");
            return;
        }

        Elements rows = table.select("tr");

        for (Element row : rows) {
            Elements cells = row.select("td, th");

            for (Element cell : cells) {
                // Text content
                String text = cell.text();

                // HTML content (not printed below; shown to illustrate the accessor)
                String html = cell.html();

                // Specific attributes; attr() is read for class, colspan and rowspan
                // (rowspan is not printed below; shown to illustrate the accessor)
                String cssClass = cell.attr("class");
                String colspan = cell.attr("colspan");
                String rowspan = cell.attr("rowspan");

                // Extract links within cells
                Elements links = cell.select("a");
                for (Element link : links) {
                    String linkText = link.text();
                    String href = link.attr("href");
                    System.out.printf("Link: %s -> %s%n", linkText, href);
                }

                // Extract images
                Elements images = cell.select("img");
                for (Element img : images) {
                    String src = img.attr("src");
                    String alt = img.attr("alt");
                    System.out.printf("Image: %s (alt: %s)%n", src, alt);
                }

                System.out.printf("Cell: %s (class: %s, colspan: %s)%n", 
                    text, cssClass, colspan);
            }
        }
    }
}

Error Handling and Best Practices

Robust Table Extraction

public class RobustTableExtractor {
    /**
     * Fetches {@code url} and prints the contents of every table found on the page.
     *
     * <p>Tables and rows are numbered starting at 1. Each cell is printed in square
     * brackets, with {@code EMPTY} standing in for cells that have no text. Failures
     * are reported on standard error instead of being propagated to the caller.
     *
     * @param url address of the page to fetch
     */
    public static void safeExtraction(String url) {
        try {
            Document page = Jsoup.connect(url)
                .userAgent("Mozilla/5.0 (compatible; TableExtractor/1.0)")
                .timeout(10000)
                .get();

            Elements tables = page.select("table");
            if (tables.isEmpty()) {
                System.out.println("No tables found on the page");
                return;
            }

            int tableNumber = 0;
            for (Element table : tables) {
                tableNumber++;
                System.out.printf("Processing table %d%n", tableNumber);

                Elements rows = table.select("tr");
                if (rows.isEmpty()) {
                    System.out.println("Table has no rows");
                    continue;
                }

                printRows(rows);
                System.out.println("---");
            }

        } catch (IOException e) {
            System.err.println("Failed to fetch page: " + e.getMessage());
        } catch (NumberFormatException e) {
            System.err.println("Failed to parse numeric data: " + e.getMessage());
        } catch (Exception e) {
            System.err.println("Unexpected error: " + e.getMessage());
        }
    }

    /** Prints one numbered line per row, with each cell's text wrapped in brackets. */
    private static void printRows(Elements rows) {
        int rowNumber = 0;
        for (Element row : rows) {
            rowNumber++;
            System.out.printf("Row %d: ", rowNumber);

            for (Element cell : row.select("th, td")) {
                String shown = cell.text().trim();
                if (shown.isEmpty()) {
                    shown = "EMPTY";
                }
                System.out.printf("[%s] ", shown);
            }
            System.out.println();
        }
    }
}

Performance Tips

Use specific selectors: table#myTable tr searches only inside one table, so it is faster than a bare tr and avoids matching rows from unrelated tables
Cache parsed documents: Don't re-parse the same HTML multiple times
Handle large tables: Process rows incrementally for memory efficiency
Set connection timeouts: Prevent hanging on slow websites
Validate data: Always check for null values and empty cells

Common Selector Patterns

// A bare "tr:first-child" matches the first row of EVERY row group, including the
// first row of each <tbody>. It is therefore only a safe header fallback when the
// table has no <thead>; otherwise the first data row would be reported as a header.
boolean hasThead = !table.select("thead").isEmpty();

// Header rows only: the <thead> rows, or the first row when there is no <thead>
Elements headers = table.select(hasThead ? "thead tr" : "tr:first-child");

// Data rows excluding headers: the <tbody> rows, or everything after the first row
// when there is no <thead>
Elements dataRows = table.select(hasThead ? "tbody tr" : "tr:not(:first-child)");

// Specific columns (:nth-child is 1-indexed, so 1 selects cells in the first position)
Elements firstColumn = table.select("tr td:nth-child(1)");
Elements lastColumn = table.select("tr td:last-child");

// Rows with specific content
Elements rowsWithTotal = table.select("tr:contains(Total)");

// Cells with specific classes
Elements currencyCells = table.select("td.currency, th.currency");

This comprehensive approach allows you to handle virtually any table structure you encounter while web scraping with Jsoup.

Table of contents