Jsoup is a powerful Java library for parsing and manipulating HTML documents. Extracting data from HTML tables is one of the most common web scraping tasks. This guide covers everything you need to know about table extraction with Jsoup.
Setup and Installation
Add Jsoup Dependency
Maven (`pom.xml`):
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.17.2</version>
</dependency>
Gradle (`build.gradle`):
implementation 'org.jsoup:jsoup:1.17.2'
Required Imports
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
Basic Table Extraction
Simple Table Parsing
/**
 * Minimal end-to-end example: parses an inline HTML table and prints every
 * row (the header row included), with each cell followed by " | ".
 */
public class TableExtractor {

    public static void main(String[] args) {
        // Content lines and the closing delimiter share one indent, so the
        // resulting string has no leading whitespace on any line.
        String html = """
                <table id="data-table">
                <thead>
                <tr>
                <th>Name</th>
                <th>Age</th>
                <th>City</th>
                </tr>
                </thead>
                <tbody>
                <tr>
                <td>John Doe</td>
                <td>30</td>
                <td>New York</td>
                </tr>
                <tr>
                <td>Jane Smith</td>
                <td>25</td>
                <td>London</td>
                </tr>
                </tbody>
                </table>
                """;

        Document doc = Jsoup.parse(html);
        Element table = doc.getElementById("data-table");

        // Visit every <tr>, regardless of whether it sits in <thead> or <tbody>.
        int rowNumber = 0;
        for (Element row : table.select("tr")) {
            StringBuilder line = new StringBuilder(String.format("Row %d: ", rowNumber));
            for (Element cell : row.select("th, td")) {
                line.append(cell.text()).append(" | ");
            }
            System.out.println(line);
            rowNumber++;
        }
    }
}
Advanced Table Selection
Multiple Selection Methods
/**
 * Demonstrates six ways of locating tables in a parsed document.
 * The results are held in locals purely for illustration; nothing is returned
 * or printed.
 */
public class AdvancedTableSelection {

    public static void extractTables(Document doc) {
        // 1. Look a single table up by its id attribute
        Element byId = doc.getElementById("specific-table");

        // 2. Match tables carrying a CSS class
        Elements byClass = doc.select("table.data-table");

        // 3. Match tables on an attribute value
        Elements byRole = doc.select("table[role=grid]");

        // 4. Take the first table in the document
        Elements firstMatch = doc.select("table");
        Element firstTable = firstMatch.first();

        // 5. Pick the Nth table; indices start at zero, so index 2 is the third
        Elements everyTable = doc.select("table");
        if (everyTable.size() >= 3) {
            Element thirdTable = everyTable.get(2);
        }

        // 6. Tables whose text contains a given phrase
        Elements byText = doc.select("table:contains(Total Sales)");
    }
}
Structured Data Extraction
Creating Data Models
/**
 * Immutable value object holding one row of the people table:
 * a name, an age and a city.
 */
public class Person {
    // Fields are assigned once in the constructor and never modified afterwards.
    private final String name;
    private final int age;
    private final String city;

    /**
     * Creates a person.
     *
     * @param name the person's name
     * @param age  the person's age
     * @param city the person's city
     */
    public Person(String name, int age, String city) {
        this.name = name;
        this.age = age;
        this.city = city;
    }

    /** @return the name passed to the constructor */
    public String getName() {
        return name;
    }

    /** @return the age passed to the constructor */
    public int getAge() {
        return age;
    }

    /** @return the city passed to the constructor */
    public String getCity() {
        return city;
    }

    /** @return a string of the form {@code Person{name='...', age=N, city='...'}} */
    @Override
    public String toString() {
        return String.format("Person{name='%s', age=%d, city='%s'}", name, age, city);
    }
}
/**
 * Fetches a page and maps the rows of its {@code table.people-data} table
 * onto {@link Person} objects.
 */
public class StructuredTableExtractor {

    /**
     * Downloads {@code url} and converts each body row of the first
     * {@code table.people-data} element into a {@link Person}.
     * Rows with fewer than three {@code td} cells are skipped.
     *
     * @param url address of the page to fetch
     * @return the persons read from the table, in row order
     * @throws IOException              if fetching the page fails
     * @throws IllegalArgumentException if the page has no matching table
     * @throws NumberFormatException    if an age cell is not a valid integer
     */
    public static List<Person> extractPersons(String url) throws IOException {
        Document page = Jsoup.connect(url).get();

        Elements matches = page.select("table.people-data");
        Element peopleTable = matches.first();
        if (peopleTable == null) {
            throw new IllegalArgumentException("Table not found");
        }

        List<Person> result = new ArrayList<>();
        for (Element row : peopleTable.select("tbody tr")) {
            Elements cells = row.select("td");
            if (cells.size() < 3) {
                // Not enough columns to build a Person; ignore the row.
                continue;
            }
            String name = cells.get(0).text();
            int age = Integer.parseInt(cells.get(1).text());
            String city = cells.get(2).text();
            result.add(new Person(name, age, city));
        }
        return result;
    }
}
Handling Complex Tables
Tables with Headers and Footers
/**
 * Prints the body rows of the first table in a document, labelling each cell
 * with the matching header text taken from the table's {@code <thead>}.
 */
public class ComplexTableHandler {

    /**
     * Prints every {@code tbody} row of the first table in {@code doc} as
     * {@code "header: value | "} pairs, one row per line. Cells without a
     * corresponding header are labelled {@code "Column N"} (N is the
     * zero-based cell index).
     *
     * <p>If the document contains no table, a message is printed and the
     * method returns without further output.
     *
     * @param doc the parsed document to read the table from
     */
    public static void extractWithHeaders(Document doc) {
        Element table = doc.select("table").first();
        // first() yields null when nothing matched; bail out instead of
        // failing with a NullPointerException on the select() calls below.
        if (table == null) {
            System.out.println("No tables found on the page");
            return;
        }

        // Extract headers separately (only the first <thead> row is used)
        Elements headerRows = table.select("thead tr");
        List<String> headers = new ArrayList<>();
        if (!headerRows.isEmpty()) {
            Elements headerCells = headerRows.first().select("th, td");
            for (Element header : headerCells) {
                headers.add(header.text());
            }
        }

        // Extract data rows only
        Elements dataRows = table.select("tbody tr");
        for (Element row : dataRows) {
            Elements cells = row.select("td");
            for (int i = 0; i < cells.size(); i++) {
                // Fall back to a positional label when the row is wider than the header.
                String columnName = i < headers.size() ? headers.get(i) : "Column " + i;
                String cellValue = cells.get(i).text();
                System.out.printf("%s: %s | ", columnName, cellValue);
            }
            System.out.println();
        }
    }
}
Extracting Cell Attributes and Links
/**
 * Shows how to read more than plain text from table cells: inner HTML,
 * attributes, and nested links and images.
 */
public class AttributeExtraction {

    /**
     * Walks every cell of the first table in {@code doc} and prints, per cell,
     * any links and images it contains followed by a summary line with the
     * cell's text, class and colspan.
     *
     * <p>If the document contains no table, a message is printed and the
     * method returns without further output.
     *
     * @param doc the parsed document to read the table from
     */
    public static void extractCellDetails(Document doc) {
        Element table = doc.select("table").first();
        // first() yields null when nothing matched; guard before dereferencing.
        if (table == null) {
            System.out.println("No tables found on the page");
            return;
        }

        Elements rows = table.select("tr");
        for (Element row : rows) {
            Elements cells = row.select("td, th");
            for (Element cell : cells) {
                // Text content
                String text = cell.text();
                // HTML content (not printed below; kept to illustrate the accessor)
                String html = cell.html();

                // Specific attributes (rowspan is read for illustration but not printed)
                String cssClass = cell.attr("class");
                String colspan = cell.attr("colspan");
                String rowspan = cell.attr("rowspan");

                // Extract links within cells
                Elements links = cell.select("a");
                for (Element link : links) {
                    String linkText = link.text();
                    String href = link.attr("href");
                    System.out.printf("Link: %s -> %s%n", linkText, href);
                }

                // Extract images
                Elements images = cell.select("img");
                for (Element img : images) {
                    String src = img.attr("src");
                    String alt = img.attr("alt");
                    System.out.printf("Image: %s (alt: %s)%n", src, alt);
                }

                System.out.printf("Cell: %s (class: %s, colspan: %s)%n",
                        text, cssClass, colspan);
            }
        }
    }
}
Error Handling and Best Practices
Robust Table Extraction
/**
 * Fetches a page and prints every table on it, reporting problems on
 * standard error instead of propagating exceptions.
 */
public class RobustTableExtractor {

    /**
     * Downloads {@code url} with an explicit user agent and a timeout value of
     * 10000, then prints each table row by row. Empty cells are shown as
     * {@code [EMPTY]}.
     *
     * <p>No exception escapes this method: fetch failures and any other error
     * are reported on {@code System.err}.
     *
     * @param url address of the page to fetch
     */
    public static void safeExtraction(String url) {
        try {
            Document doc = Jsoup.connect(url)
                    .userAgent("Mozilla/5.0 (compatible; TableExtractor/1.0)")
                    .timeout(10000)
                    .get();

            Elements tables = doc.select("table");
            if (tables.isEmpty()) {
                System.out.println("No tables found on the page");
                return;
            }

            for (int tableIndex = 0; tableIndex < tables.size(); tableIndex++) {
                printTable(tables.get(tableIndex), tableIndex + 1);
            }
        } catch (IOException e) {
            System.err.println("Failed to fetch page: " + e.getMessage());
        } catch (Exception e) {
            // Nothing in the try block parses numbers, so a dedicated
            // NumberFormatException handler would never be reached by this
            // code; everything that is not an I/O failure is reported here.
            System.err.println("Unexpected error: " + e.getMessage());
        }
    }

    /** Prints one table's rows under a "Processing table N" heading, followed by a "---" separator. */
    private static void printTable(Element table, int tableNumber) {
        System.out.printf("Processing table %d%n", tableNumber);

        Elements rows = table.select("tr");
        if (rows.isEmpty()) {
            // No separator is printed for a table without rows.
            System.out.println("Table has no rows");
            return;
        }

        for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
            Elements cells = rows.get(rowIndex).select("th, td");
            System.out.printf("Row %d: ", rowIndex + 1);
            for (Element cell : cells) {
                String cellText = cell.text().trim();
                System.out.printf("[%s] ",
                        cellText.isEmpty() ? "EMPTY" : cellText);
            }
            System.out.println();
        }
        System.out.println("---");
    }
}
Performance Tips
- Use specific selectors: `table#myTable tr` is faster than `tr`
- Cache parsed documents: Don't re-parse the same HTML multiple times
- Handle large tables: Process rows incrementally for memory efficiency
- Set connection timeouts: Prevent hanging on slow websites
- Validate data: Always check for null values and empty cells
Common Selector Patterns
// Header rows: every <tr> under <thead>, plus every <tr> that is the first child of its parent
// NOTE(review): tr:first-child presumably also matches the first row of a <tbody>, so for a
// table that has both <thead> and <tbody> this may include one data row — confirm before relying on it.
Elements headers = table.select("thead tr, tr:first-child");
// Data rows: every <tr> under <tbody>, plus every <tr> that is not the first child of its parent
// NOTE(review): if the header row itself sits inside a <tbody> (no <thead> in the markup),
// "tbody tr" would presumably include it — verify against the parsed document.
Elements dataRows = table.select("tbody tr, tr:not(:first-child)");
// Specific columns (:nth-child counts from 1, so nth-child(1) is the first cell of a row)
Elements firstColumn = table.select("tr td:nth-child(1)");
Elements lastColumn = table.select("tr td:last-child");
// Rows with specific content
Elements rowsWithTotal = table.select("tr:contains(Total)");
// Cells with specific classes
Elements currencyCells = table.select("td.currency, th.currency");
This comprehensive approach allows you to handle virtually any table structure you encounter while web scraping with Jsoup.