Extracting data from HTML tables is a common requirement in web scraping. Java offers excellent libraries for this task, with Jsoup being the most popular choice due to its jQuery-like syntax and robust parsing capabilities.
Setup and Dependencies
Maven Dependency
Add Jsoup to your pom.xml:
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.17.2</version>
</dependency>
Gradle Dependency
For Gradle projects, add to your build.gradle:
implementation 'org.jsoup:jsoup:1.17.2'
Basic Table Extraction
Here's a complete example that demonstrates fetching a webpage and extracting table data:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class TableScraper {

    /**
     * Entry point: fetches a webpage and prints the contents of its first table.
     */
    public static void main(String[] args) {
        try {
            // Fetch and parse the webpage; a browser-like User-Agent avoids
            // being rejected by servers that block default Java clients.
            String url = "https://example.com/data-table";
            Document doc = Jsoup.connect(url)
                    .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
                    .timeout(10000)
                    .get();

            // Extract data from the first table, if one exists
            Element table = doc.selectFirst("table");
            if (table != null) {
                extractTableData(table);
            }
        } catch (IOException e) {
            System.err.println("Error fetching webpage: " + e.getMessage());
        }
    }

    /**
     * Prints the header row and all data rows of the given table.
     * Handles tables that omit an explicit {@code <thead>}.
     */
    private static void extractTableData(Element table) {
        // Extract headers. If the table has no <thead>, fall back to <th>
        // cells in the first row — a very common markup pattern that the
        // strict "thead tr th" selector would otherwise miss entirely.
        Elements headerElements = table.select("thead tr th");
        if (headerElements.isEmpty()) {
            headerElements = table.select("tr:first-child th");
        }
        List<String> headers = new ArrayList<>();
        for (Element header : headerElements) {
            headers.add(header.text().trim());
        }
        System.out.println("Headers: " + headers);

        // Extract data rows. jsoup's parser inserts an implicit <tbody>,
        // so "tbody tr" also matches tables whose source markup omits it.
        Elements rows = table.select("tbody tr");
        for (Element row : rows) {
            Elements cells = row.select("td");
            List<String> rowData = new ArrayList<>();
            for (Element cell : cells) {
                rowData.add(cell.text().trim());
            }
            // Skip header-only rows (all <th>, no <td>) so the header row
            // is not echoed as an empty "Row data: []" line.
            if (!rowData.isEmpty()) {
                System.out.println("Row data: " + rowData);
            }
        }
    }
}
Advanced Table Selection
Selecting Specific Tables
// Select all tables carrying class="data-table" (may be empty, never null)
Elements tablesByClass = doc.select("table.data-table");
// Select a single table by ID; selectFirst returns null when nothing matches
Element tableById = doc.selectFirst("table#results");
// Select tables whose visible text contains the string (case-insensitive in jsoup)
Elements tablesWithText = doc.select("table:contains(Total Sales)");
// Select tables nested anywhere inside <div class="content">
Elements nestedTables = doc.select("div.content table");
Handling Different Table Structures
public class AdvancedTableParser {

    /**
     * Walks every row of a table and prints each cell, annotating cells that
     * declare colspan/rowspan attributes. Selecting "th, td" makes this work
     * for tables that mix header and data cells in the same rows.
     */
    public static void parseFlexibleTable(Element table) {
        int rowIndex = 0;
        for (Element row : table.select("tr")) {
            // Build the whole line first, then emit it in one call.
            StringBuilder line = new StringBuilder("Row ").append(rowIndex).append(": ");
            for (Element cell : row.select("th, td")) {
                line.append('[').append(cell.text().trim());
                String colspan = cell.attr("colspan"); // "" when absent
                String rowspan = cell.attr("rowspan"); // "" when absent
                if (!colspan.isEmpty()) {
                    line.append(" (colspan:").append(colspan).append(')');
                }
                if (!rowspan.isEmpty()) {
                    line.append(" (rowspan:").append(rowspan).append(')');
                }
                line.append("] ");
            }
            System.out.println(line);
            rowIndex++;
        }
    }
}
Structured Data Extraction
Creating Data Objects
public class TableRow {
    // Cell values in column order
    private final List<String> cells;
    // Cell values keyed by the header at the same column position
    private final Map<String, String> namedCells;

    /**
     * Builds a row from its cell elements, mapping each cell to the header
     * at the same index when one is available.
     *
     * @param headers      column headers; may be shorter than the cell list
     * @param cellElements the {@code <td>}/{@code <th>} elements of this row
     */
    public TableRow(List<String> headers, Elements cellElements) {
        this.cells = new ArrayList<>();
        this.namedCells = new HashMap<>();
        for (int i = 0; i < cellElements.size(); i++) {
            String cellValue = cellElements.get(i).text().trim();
            cells.add(cellValue);
            // Map to header names if available
            if (i < headers.size()) {
                namedCells.put(headers.get(i), cellValue);
            }
        }
    }

    /**
     * @return the cell at {@code index}, or "" when the index is out of range.
     *         Negative indexes now also return "" instead of throwing
     *         {@link IndexOutOfBoundsException}.
     */
    public String getCell(int index) {
        return index >= 0 && index < cells.size() ? cells.get(index) : "";
    }

    /** @return the cell under {@code headerName}, or "" when unknown. */
    public String getCell(String headerName) {
        return namedCells.getOrDefault(headerName, "");
    }

    // Getters and toString method
}
Complete Table Parser
public class TableDataExtractor {

    /**
     * Fetches {@code url} and extracts the rows of the table matched by
     * {@code tableSelector} into {@link TableRow} objects.
     *
     * @param tableSelector a CSS selector identifying the target table
     * @return the extracted rows; empty (never null) on any failure
     */
    public static List<TableRow> extractTableRows(String url, String tableSelector) {
        List<TableRow> rows = new ArrayList<>();
        try {
            Document doc = Jsoup.connect(url)
                    .userAgent("Mozilla/5.0 (compatible; TableScraper/1.0)")
                    .timeout(15000)
                    .get();

            Element table = doc.selectFirst(tableSelector);
            if (table == null) {
                System.err.println("Table not found with selector: " + tableSelector);
                return rows;
            }

            // Extract headers
            List<String> headers = extractHeaders(table);

            // Extract data rows
            Elements dataRows = table.select("tbody tr");
            if (dataRows.isEmpty()) {
                // Fallback: use all rows except the first (assumed header).
                // BUG FIX: Elements.subList() returns List<Element>, not
                // Elements, so the result must be wrapped back in Elements
                // for this assignment to compile.
                Elements allRows = table.select("tr");
                if (allRows.size() > 1) {
                    dataRows = new Elements(allRows.subList(1, allRows.size()));
                }
            }

            for (Element row : dataRows) {
                Elements cells = row.select("td");
                if (!cells.isEmpty()) {
                    rows.add(new TableRow(headers, cells));
                }
            }
        } catch (IOException e) {
            System.err.println("Error extracting table data: " + e.getMessage());
        }
        return rows;
    }

    /**
     * Extracts header names, trying progressively looser strategies:
     * explicit {@code <thead>} first, then {@code <th>} in the first row,
     * finally {@code <td>} in the first row.
     */
    private static List<String> extractHeaders(Element table) {
        List<String> headers = new ArrayList<>();
        Elements headerCells = table.select("thead tr th");
        if (headerCells.isEmpty()) {
            headerCells = table.select("tr:first-child th");
        }
        if (headerCells.isEmpty()) {
            headerCells = table.select("tr:first-child td");
        }
        for (Element header : headerCells) {
            headers.add(header.text().trim());
        }
        return headers;
    }
}
Handling Complex Scenarios
Tables with Merged Cells
/**
 * Visits every cell of the table (header or data) and reports its text
 * together with the column/row span it declares.
 */
public static void handleMergedCells(Element table) {
    for (Element row : table.select("tr")) {
        for (Element cell : row.select("td, th")) {
            // A missing colspan/rowspan attribute means the cell spans 1.
            int colspan = parseIntAttribute(cell, "colspan", 1);
            int rowspan = parseIntAttribute(cell, "rowspan", 1);
            System.out.printf("Cell: '%s' (spans %d cols, %d rows)%n",
                    cell.text(), colspan, rowspan);
        }
    }
}
/**
 * Reads an integer attribute from an element.
 *
 * @return the parsed value, or {@code defaultValue} when the attribute is
 *         absent, empty, or not a valid integer
 */
private static int parseIntAttribute(Element element, String attribute, int defaultValue) {
    String raw = element.attr(attribute); // jsoup returns "" when absent
    if (raw.isEmpty()) {
        return defaultValue;
    }
    try {
        return Integer.parseInt(raw);
    } catch (NumberFormatException e) {
        return defaultValue;
    }
}
Extracting Links and Images from Tables
/**
 * Prints every hyperlink and image found inside the table's rows.
 * The "abs:" attribute prefix makes jsoup resolve relative URLs against
 * the document's base URI.
 */
public static void extractTableMedia(Element table) {
    for (Element row : table.select("tr")) {
        // Hyperlinks in this row (only anchors that actually have an href)
        for (Element link : row.select("a[href]")) {
            System.out.println("Link: " + link.text() + " -> " + link.attr("abs:href"));
        }
        // Images in this row (only those with a src attribute)
        for (Element img : row.select("img[src]")) {
            System.out.println("Image: " + img.attr("alt") + " -> " + img.attr("abs:src"));
        }
    }
}
Error Handling and Best Practices
Robust Connection Setup
/**
 * Fetches {@code url} with sensible defaults: a browser-like User-Agent,
 * a 15-second timeout, redirect following, and a 1 MB response cap.
 *
 * @throws IOException on network failure or an HTTP error status
 */
public static Document connectToWebpage(String url) throws IOException {
    return Jsoup.connect(url)
            .followRedirects(true)
            .maxBodySize(1024 * 1024) // refuse bodies larger than 1 MB
            .timeout(15000)           // connect/read timeout in milliseconds
            .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
            .get();
}
Comprehensive Error Handling
/**
 * Fetches {@code url}, locates the table matched by {@code tableSelector},
 * and prints its contents. All failures are reported to stderr rather than
 * propagated, so this is safe to call from batch loops.
 */
public static void scrapeTableSafely(String url, String tableSelector) {
    try {
        Document doc = connectToWebpage(url);
        Element table = doc.selectFirst(tableSelector);
        if (table != null) {
            extractTableData(table);
        } else {
            System.err.println("No table found matching selector: " + tableSelector);
        }
    } catch (IOException e) {
        // Connection, timeout, or HTTP-status failures
        System.err.println("Network error: " + e.getMessage());
    } catch (Exception e) {
        // Anything unexpected during parsing/extraction
        System.err.println("Parsing error: " + e.getMessage());
        e.printStackTrace();
    }
}
Alternative Libraries
Using HtmlUnit for JavaScript-Heavy Pages
<dependency>
<groupId>org.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>3.5.0</version>
</dependency>
(Note: HtmlUnit moved to the org.htmlunit group ID as of version 3.0; the old net.sourceforge.htmlunit coordinates stop at 2.70.0.)
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlTable;
public class HtmlUnitTableScraper {
public static void scrapeJavaScriptTable(String url) {
try (WebClient webClient = new WebClient()) {
webClient.getOptions().setJavaScriptEnabled(true);
webClient.getOptions().setCssEnabled(false);
HtmlPage page = webClient.getPage(url);
webClient.waitForBackgroundJavaScript(5000);
HtmlTable table = page.getFirstByXPath("//table[@class='data-table']");
if (table != null) {
System.out.println(table.asNormalizedText());
}
} catch (Exception e) {
e.printStackTrace();
}
}
}
Ethical Considerations
- Check robots.txt: Always verify the target site's robots.txt file (e.g., https://example.com/robots.txt) before scraping
- Rate limiting: Add delays between requests to avoid overwhelming servers
- User-Agent: Use appropriate user-agent strings
- Terms of service: Respect website terms and legal requirements
- Data usage: Only extract necessary data and respect copyright
Performance Tips
- Use connection pooling for multiple requests
- Cache DOM parsing results when possible
- Process tables in batches for large datasets
- Consider using parallel processing for multiple tables
- Implement retry logic for network failures
This comprehensive approach ensures reliable table data extraction while following best practices for web scraping in Java.