Table of contents

How do I extract data from a table using jsoup?

Jsoup is a powerful Java library for parsing and manipulating HTML documents. Extracting data from HTML tables is one of the most common web scraping tasks. This guide covers everything you need to know about table extraction with Jsoup.

Setup and Installation

Add Jsoup Dependency

Maven (pom.xml):

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.17.2</version>
</dependency>

Gradle (build.gradle):

implementation 'org.jsoup:jsoup:1.17.2'

Required Imports

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

Basic Table Extraction

Simple Table Parsing

/**
 * Minimal walkthrough: parses an inline HTML table and prints each row
 * (header row included) as pipe-separated cell text.
 */
public class TableExtractor {
    public static void main(String[] args) {
        String html = """
            <table id="data-table">
                <thead>
                    <tr>
                        <th>Name</th>
                        <th>Age</th>
                        <th>City</th>
                    </tr>
                </thead>
                <tbody>
                    <tr>
                        <td>John Doe</td>
                        <td>30</td>
                        <td>New York</td>
                    </tr>
                    <tr>
                        <td>Jane Smith</td>
                        <td>25</td>
                        <td>London</td>
                    </tr>
                </tbody>
            </table>
            """;

        // Parse the markup and go straight to the table by its id
        Element table = Jsoup.parse(html).getElementById("data-table");

        // Rows are numbered from zero in document order
        int rowNumber = 0;
        for (Element row : table.select("tr")) {
            // Build the whole output line first, then emit it in one call
            StringBuilder line = new StringBuilder(String.format("Row %d: ", rowNumber));
            for (Element cell : row.select("th, td")) {
                line.append(cell.text()).append(" | ");
            }
            System.out.println(line);
            rowNumber++;
        }
    }
}

Advanced Table Selection

Multiple Selection Methods

/**
 * Catalogue of the different ways to locate a table in a parsed document.
 * Each lookup is independent; the results are only assigned, not processed.
 */
public class AdvancedTableSelection {
    public static void extractTables(Document doc) {
        // Every <table> in the document; reused for the positional lookups below
        Elements everyTable = doc.select("table");

        // 1. Lookup by id attribute
        Element byId = doc.getElementById("specific-table");

        // 2. Lookup by CSS class
        Elements byClass = doc.select("table.data-table");

        // 3. Lookup by attribute value
        Elements byRole = doc.select("table[role=grid]");

        // 4. First table on the page
        Element first = everyTable.first();

        // 5. Nth table: indices start at zero, so index 2 is the third table
        if (everyTable.size() >= 3) {
            Element third = everyTable.get(2);
        }

        // 6. Tables whose text contains a given phrase
        Elements byText = doc.select("table:contains(Total Sales)");
    }
}

Structured Data Extraction

Creating Data Models

/**
 * Immutable data holder for one extracted table row: a name, an age and a city.
 * The fields are assigned once in the constructor and exposed through getters only.
 */
public class Person {
    private final String name;
    private final int age;
    private final String city;

    /**
     * Creates a person from the three extracted column values.
     *
     * @param name value of the name column
     * @param age  value of the age column
     * @param city value of the city column
     */
    public Person(String name, int age, String city) {
        this.name = name;
        this.age = age;
        this.city = city;
    }

    /** @return the name passed to the constructor */
    public String getName() { return name; }

    /** @return the age passed to the constructor */
    public int getAge() { return age; }

    /** @return the city passed to the constructor */
    public String getCity() { return city; }

    /** @return a string of the form {@code Person{name='..', age=.., city='..'}} */
    @Override
    public String toString() {
        return String.format("Person{name='%s', age=%d, city='%s'}", name, age, city);
    }
}

/**
 * Turns the rows of a {@code table.people-data} table into {@link Person} objects.
 */
public class StructuredTableExtractor {
    /**
     * Fetches the page at {@code url} and extracts the people table from it.
     *
     * @param url address of the page to download
     * @return one {@link Person} per body row that has at least three cells
     * @throws IOException              if the page cannot be fetched
     * @throws IllegalArgumentException if the page has no {@code table.people-data}
     * @throws NumberFormatException    if an age cell does not hold a whole number
     */
    public static List<Person> extractPersons(String url) throws IOException {
        return parsePersons(Jsoup.connect(url).get());
    }

    /**
     * Extracts the people table from an already parsed document, so a cached
     * document can be processed without another download.
     *
     * @param doc parsed page to search
     * @return one {@link Person} per body row that has at least three cells
     * @throws IllegalArgumentException if the document has no {@code table.people-data}
     * @throws NumberFormatException    if an age cell does not hold a whole number
     */
    public static List<Person> parsePersons(Document doc) {
        Element table = doc.select("table.people-data").first();

        if (table == null) {
            throw new IllegalArgumentException("Table not found");
        }

        List<Person> persons = new ArrayList<>();
        Elements dataRows = table.select("tbody tr");

        for (int rowIndex = 0; rowIndex < dataRows.size(); rowIndex++) {
            Elements cells = dataRows.get(rowIndex).select("td");

            // Rows with fewer than three data cells cannot be mapped; skip them
            if (cells.size() < 3) {
                continue;
            }

            String name = cells.get(0).text();
            int age = parseAge(cells.get(1).text(), rowIndex);
            String city = cells.get(2).text();

            persons.add(new Person(name, age, city));
        }

        return persons;
    }

    /** Parses an age cell, reporting the zero-based body row index when it is not a whole number. */
    private static int parseAge(String text, int rowIndex) {
        try {
            return Integer.parseInt(text.trim());
        } catch (NumberFormatException e) {
            // Same exception type as before, but the message now says which row was bad
            NumberFormatException detailed = new NumberFormatException(
                "Row " + rowIndex + ": age is not a whole number: \"" + text + "\"");
            detailed.initCause(e);
            throw detailed;
        }
    }
}

Handling Complex Tables

Tables with Headers and Footers

/**
 * Prints the body rows of the first table in a document, labelling every cell
 * with the matching header text from the table's {@code <thead>}.
 */
public class ComplexTableHandler {
    /**
     * Prints each {@code tbody} row as {@code header: value | } pairs on one line.
     *
     * @param doc parsed page to search
     * @throws IllegalArgumentException if the document contains no table
     */
    public static void extractWithHeaders(Document doc) {
        Element table = doc.select("table").first();

        // first() yields null when nothing matched; fail with a clear message instead of an NPE
        if (table == null) {
            throw new IllegalArgumentException("Table not found");
        }

        // Extract headers separately; only the first <thead> row is used
        Elements headerRows = table.select("thead tr");
        List<String> headers = new ArrayList<>();

        if (!headerRows.isEmpty()) {
            Elements headerCells = headerRows.first().select("th, td");
            for (Element header : headerCells) {
                headers.add(header.text());
            }
        }

        // Extract data rows only
        Elements dataRows = table.select("tbody tr");

        for (Element row : dataRows) {
            Elements cells = row.select("td");

            for (int i = 0; i < cells.size(); i++) {
                // Cells beyond the known headers get a generic, zero-based label
                String columnName = i < headers.size() ? headers.get(i) : "Column " + i;
                String cellValue = cells.get(i).text();
                System.out.printf("%s: %s | ", columnName, cellValue);
            }
            System.out.println();
        }
    }
}

Extracting Cell Attributes and Links

/**
 * Shows what can be read from each cell of the first table in a document:
 * text, inner HTML, attributes, and any links or images inside the cell.
 */
public class AttributeExtraction {
    /**
     * Prints the links, images and a one-line summary for every cell of the first table.
     *
     * @param doc parsed page to search
     * @throws IllegalArgumentException if the document contains no table
     */
    public static void extractCellDetails(Document doc) {
        Element table = doc.select("table").first();

        // first() yields null when nothing matched; fail with a clear message instead of an NPE
        if (table == null) {
            throw new IllegalArgumentException("Table not found");
        }

        Elements rows = table.select("tr");

        for (Element row : rows) {
            Elements cells = row.select("td, th");

            for (Element cell : cells) {
                // Text content
                String text = cell.text();

                // HTML content (shown for illustration; not printed below)
                String html = cell.html();

                // Specific attributes (rowspan is shown for illustration; not printed below)
                String cssClass = cell.attr("class");
                String colspan = cell.attr("colspan");
                String rowspan = cell.attr("rowspan");

                // Extract links within cells
                Elements links = cell.select("a");
                for (Element link : links) {
                    String linkText = link.text();
                    String href = link.attr("href");
                    System.out.printf("Link: %s -> %s%n", linkText, href);
                }

                // Extract images
                Elements images = cell.select("img");
                for (Element img : images) {
                    String src = img.attr("src");
                    String alt = img.attr("alt");
                    System.out.printf("Image: %s (alt: %s)%n", src, alt);
                }

                // Per-cell summary, printed after the cell's links and images
                System.out.printf("Cell: %s (class: %s, colspan: %s)%n", 
                    text, cssClass, colspan);
            }
        }
    }
}

Error Handling and Best Practices

Robust Table Extraction

/**
 * Fetches a page and prints every table on it, reporting problems on stderr
 * instead of letting exceptions escape.
 */
public class RobustTableExtractor {
    /**
     * Downloads {@code url} and prints each table's rows, with cells in brackets
     * and empty cells shown as {@code EMPTY}. Failures are reported on stderr.
     *
     * @param url address of the page to download
     */
    public static void safeExtraction(String url) {
        try {
            Document doc = Jsoup.connect(url)
                .userAgent("Mozilla/5.0 (compatible; TableExtractor/1.0)")
                .timeout(10000)
                .get();

            Elements tables = doc.select("table");

            if (tables.isEmpty()) {
                System.out.println("No tables found on the page");
                return;
            }

            for (int tableIndex = 0; tableIndex < tables.size(); tableIndex++) {
                // Tables are numbered from 1 in the output
                System.out.printf("Processing table %d%n", tableIndex + 1);
                printTable(tables.get(tableIndex));
            }

        } catch (IOException e) {
            System.err.println("Failed to fetch page: " + e.getMessage());
        } catch (Exception e) {
            // Last-resort boundary: nothing in the try block parses numbers, so no
            // dedicated NumberFormatException handler is needed here.
            System.err.println("Unexpected error: " + e.getMessage());
        }
    }

    /** Prints the rows of one table followed by a {@code ---} separator; a table without rows gets only a notice. */
    private static void printTable(Element table) {
        Elements rows = table.select("tr");

        if (rows.isEmpty()) {
            System.out.println("Table has no rows");
            return;
        }

        for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
            Elements cells = rows.get(rowIndex).select("th, td");

            // Rows are numbered from 1 in the output
            System.out.printf("Row %d: ", rowIndex + 1);
            for (Element cell : cells) {
                String cellText = cell.text().trim();
                System.out.printf("[%s] ", 
                    cellText.isEmpty() ? "EMPTY" : cellText);
            }
            System.out.println();
        }
        System.out.println("---");
    }
}

Performance Tips

  1. Use specific selectors: table#myTable tr is faster than tr
  2. Cache parsed documents: Don't re-parse the same HTML multiple times
  3. Handle large tables: Process rows incrementally for memory efficiency
  4. Set connection timeouts: Prevent hanging on slow websites
  5. Validate data: Always check for null values and empty cells

Common Selector Patterns

// Header rows only: every row inside <thead>, plus rows made up solely of <th> cells.
// (tr:first-child is not usable here: it matches the first row of EACH section,
// so it would also pick up the first data row of <tbody>.)
Elements headers = table.select("thead tr, tr:has(th):not(:has(td))");

// Data rows excluding headers: body rows that contain at least one <td> cell
Elements dataRows = table.select("tbody tr:has(td)");

// Specific columns (:nth-child is 1-indexed, so 1 selects the first column)
Elements firstColumn = table.select("tr td:nth-child(1)");
Elements lastColumn = table.select("tr td:last-child");

// Rows with specific content
Elements rowsWithTotal = table.select("tr:contains(Total)");

// Cells with specific classes
Elements currencyCells = table.select("td.currency, th.currency");

This comprehensive approach allows you to handle virtually any table structure you encounter while web scraping with Jsoup.

Try WebScraping.AI for Your Web Scraping Needs

Looking for a powerful web scraping solution? WebScraping.AI provides an LLM-powered API that combines Chromium JavaScript rendering with rotating proxies for reliable data extraction.

Key Features:

  • AI-powered extraction: Ask questions about web pages or extract structured data fields
  • JavaScript rendering: Full Chromium browser support for dynamic content
  • Rotating proxies: Datacenter and residential proxies from multiple countries
  • Easy integration: Simple REST API with SDKs for Python, Ruby, PHP, and more
  • Reliable & scalable: Built for developers who need consistent results

Getting Started:

Get page content with AI analysis:

# -G sends the --data-urlencode values as a URL-encoded query string,
# so the spaces and "?" inside the question are transmitted safely.
curl -G "https://api.webscraping.ai/ai/question" \
  --data-urlencode "url=https://example.com" \
  --data-urlencode "question=What is the main topic?" \
  --data-urlencode "api_key=YOUR_API_KEY"

Extract structured data:

# -G sends the --data-urlencode values as a URL-encoded query string. Keeping
# fields[...] out of the URL argument also avoids curl's [] URL globbing.
curl -G "https://api.webscraping.ai/ai/fields" \
  --data-urlencode "url=https://example.com" \
  --data-urlencode "fields[title]=Page title" \
  --data-urlencode "fields[price]=Product price" \
  --data-urlencode "api_key=YOUR_API_KEY"

Try in request builder

Related Questions

Get Started Now

WebScraping.AI provides rotating proxies, Chromium rendering and built-in HTML parser for web scraping
Icon