Yes, Html Agility Pack is excellent for extracting data from HTML tables. This .NET library provides powerful DOM navigation capabilities and can handle both well-formed and malformed HTML documents. Nodes are selected with XPath; CSS selectors are available through add-on packages.
Installation
Install Html Agility Pack via NuGet Package Manager:
Install-Package HtmlAgilityPack
Or using the .NET CLI:
dotnet add package HtmlAgilityPack
Basic Table Extraction
Here's a comprehensive example that demonstrates table data extraction:
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Linq;
/// <summary>
/// Demonstrates two ways of reading an HTML table with Html Agility Pack:
/// a plain row-by-row dump and a header-aware extraction.
/// </summary>
public class TableExtractor
{
    // XPath that locates the sample table by its id attribute.
    private const string EmployeeTableXPath = "//table[@id='employeeTable']";

    /// <summary>
    /// Parses the embedded sample document and runs both extraction methods on it.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    public static void Main(string[] args)
    {
        string htmlContent = @"
<html>
<body>
<table id='employeeTable' class='data-table'>
<thead>
<tr>
<th>Name</th>
<th>Department</th>
<th>Salary</th>
</tr>
</thead>
<tbody>
<tr>
<td>John Doe</td>
<td>Engineering</td>
<td>$75,000</td>
</tr>
<tr>
<td>Jane Smith</td>
<td>Marketing</td>
<td>$65,000</td>
</tr>
</tbody>
</table>
</body>
</html>";

        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(htmlContent);

        // Method 1: Basic extraction with XPath
        ExtractBasicTable(htmlDoc);

        // Method 2: Structured data extraction
        ExtractStructuredTable(htmlDoc);
    }

    /// <summary>
    /// Prints every row of the employee table as its trimmed cell texts joined by " | ".
    /// Header and data cells are treated alike.
    /// </summary>
    /// <param name="doc">Parsed document to search.</param>
    private static void ExtractBasicTable(HtmlDocument doc)
    {
        Console.WriteLine("=== Basic Table Extraction ===");

        var table = doc.DocumentNode.SelectSingleNode(EmployeeTableXPath);
        if (table == null)
        {
            return;
        }

        // SelectNodes returns null (not an empty collection) when nothing matches,
        // so a table without any <tr> has to be guarded before the loop.
        var rows = table.SelectNodes(".//tr");
        if (rows == null)
        {
            return;
        }

        foreach (var row in rows)
        {
            var cells = row.SelectNodes("td|th");
            if (cells != null)
            {
                var rowData = cells.Select(cell => cell.InnerText.Trim());
                Console.WriteLine(string.Join(" | ", rowData));
            }
        }
    }

    /// <summary>
    /// Prints the header names, then each data row as "header: value" lines
    /// followed by a "---" separator.
    /// </summary>
    /// <param name="doc">Parsed document to search.</param>
    private static void ExtractStructuredTable(HtmlDocument doc)
    {
        Console.WriteLine("\n=== Structured Table Extraction ===");

        var table = doc.DocumentNode.SelectSingleNode(EmployeeTableXPath);
        if (table == null)
        {
            return;
        }

        // Extract headers: prefer the <thead> row, otherwise fall back to a first <tr>.
        var headerRow = table.SelectSingleNode(".//thead/tr") ??
                        table.SelectSingleNode(".//tr[1]");
        var headers = headerRow?.SelectNodes("th|td")
                          ?.Select(h => h.InnerText.Trim())
                          .ToList() ?? new List<string>();
        Console.WriteLine($"Headers: {string.Join(", ", headers)}");

        // Extract data rows: prefer <tbody> rows; without a <tbody>, take every <tr>
        // that is not the first <tr> under its parent.
        var dataRows = table.SelectNodes(".//tbody/tr") ??
                       table.SelectNodes(".//tr[position()>1]");
        if (dataRows == null)
        {
            return;
        }

        foreach (var row in dataRows)
        {
            var cells = row.SelectNodes("td");
            if (cells == null)
            {
                continue;
            }

            // Pair headers and cells only as far as both exist, so a short or
            // over-long row cannot index out of range.
            for (int i = 0; i < Math.Min(headers.Count, cells.Count); i++)
            {
                Console.WriteLine($"{headers[i]}: {cells[i].InnerText.Trim()}");
            }
            Console.WriteLine("---");
        }
    }
}
Advanced Table Extraction Techniques
Converting to Data Objects
using System.Collections.Generic;
/// <summary>
/// One employee record. All values are held as text; nothing is parsed,
/// so a salary stays in its original form (for example "$75,000").
/// </summary>
public class Employee
{
    /// <summary>Employee name. Defaults to an empty string rather than null.</summary>
    public string Name { get; set; } = string.Empty;

    /// <summary>Department name. Defaults to an empty string rather than null.</summary>
    public string Department { get; set; } = string.Empty;

    /// <summary>Salary as text. Defaults to an empty string rather than null.</summary>
    public string Salary { get; set; } = string.Empty;
}
/// <summary>
/// Reads the rows of the table with id <c>employeeTable</c> into <see cref="Employee"/> objects.
/// </summary>
/// <param name="doc">Parsed document to search.</param>
/// <returns>
/// One entry per row that has at least three <c>td</c> cells; an empty list when the
/// table or its rows are missing.
/// </returns>
public static List<Employee> ExtractEmployees(HtmlDocument doc)
{
    var employees = new List<Employee>();

    var table = doc.DocumentNode.SelectSingleNode("//table[@id='employeeTable']");
    if (table == null)
    {
        return employees;
    }

    // Prefer <tbody> rows; without a <tbody>, take every <tr> that is not the
    // first <tr> under its parent.
    var dataRows = table.SelectNodes(".//tbody/tr") ??
                   table.SelectNodes(".//tr[position()>1]");
    if (dataRows == null)
    {
        return employees;
    }

    foreach (var row in dataRows)
    {
        var cells = row.SelectNodes("td");
        if (cells == null || cells.Count < 3)
        {
            // Rows with fewer than three data cells cannot fill an Employee.
            continue;
        }

        employees.Add(new Employee
        {
            Name = CellText(cells[0]),
            Department = CellText(cells[1]),
            Salary = CellText(cells[2])
        });
    }

    return employees;

    // InnerText keeps HTML entities encoded (e.g. "R&amp;D", "&nbsp;"), so decode
    // first and trim afterwards to also drop decoded non-breaking spaces at the edges.
    string CellText(HtmlNode cell) => HtmlEntity.DeEntitize(cell.InnerText).Trim();
}
Handling Complex Tables
/// <summary>
/// Prints every cell of the first table in the document together with its
/// <c>colspan</c> and <c>rowspan</c> values, using 1 as the default for a missing attribute.
/// </summary>
/// <param name="doc">Parsed document to search.</param>
public static void ExtractComplexTable(HtmlDocument doc)
{
    // Handle tables with colspan/rowspan
    var firstTable = doc.DocumentNode.SelectSingleNode("//table");
    if (firstTable == null)
    {
        return;
    }

    var tableRows = firstTable.SelectNodes(".//tr");
    if (tableRows == null)
    {
        return;
    }

    foreach (var tableRow in tableRows)
    {
        var rowCells = tableRow.SelectNodes("td|th");
        if (rowCells == null)
        {
            continue;
        }

        foreach (var rowCell in rowCells)
        {
            int columnSpan = rowCell.GetAttributeValue("colspan", 1);
            int rowSpan = rowCell.GetAttributeValue("rowspan", 1);
            string text = rowCell.InnerText.Trim();
            Console.WriteLine($"Cell: {text} (colspan: {columnSpan}, rowspan: {rowSpan})");
        }
    }
}
Alternative Selection Methods
CSS-Style Selection Using XPath
// SelectNodes accepts XPath expressions only, so the CSS selector "table.data-table"
// is written here as its XPath equivalent. The concat/normalize-space idiom matches
// "data-table" as a whole class token, even when the element carries several classes.
var tables = doc.DocumentNode.SelectNodes(
    "//table[contains(concat(' ', normalize-space(@class), ' '), ' data-table ')]");

// Cells whose class attribute contains the text "highlight".
var specificCells = doc.DocumentNode.SelectNodes("//table//td[contains(@class, 'highlight')]");

// The third <td> of every row, i.e. the 3rd column.
var nthColumn = doc.DocumentNode.SelectNodes("//table//tr/td[3]");
Loading from Web
// One client for the life of the process: creating and disposing an HttpClient per
// call leaves sockets lingering and can exhaust them under load.
private static readonly HttpClient s_httpClient = new HttpClient();

/// <summary>
/// Downloads the page at <paramref name="url"/> and parses it into an <see cref="HtmlDocument"/>.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>The parsed document.</returns>
public static async Task<HtmlDocument> LoadTableFromWeb(string url)
{
    var html = await s_httpClient.GetStringAsync(url);

    var doc = new HtmlDocument();
    doc.LoadHtml(html);
    return doc;
}
Error Handling and Best Practices
/// <summary>
/// Prints the rows of the table with id <c>myTable</c>, one line per row with cells
/// joined by " | ". Blank cells are shown as "[empty]". A missing table, a table
/// without rows, or any exception is reported on the console instead of being thrown.
/// </summary>
/// <param name="doc">Parsed document to search.</param>
public static void SafeTableExtraction(HtmlDocument doc)
{
    try
    {
        var targetTable = doc.DocumentNode.SelectSingleNode("//table[@id='myTable']");
        if (targetTable == null)
        {
            Console.WriteLine("Table not found");
            return;
        }

        var tableRows = targetTable.SelectNodes(".//tr");
        if (tableRows == null || tableRows.Count == 0)
        {
            Console.WriteLine("No rows found in table");
            return;
        }

        foreach (var tableRow in tableRows)
        {
            var rowCells = tableRow.SelectNodes("td|th");
            if (rowCells == null)
            {
                // A row without td/th cells produces no output line.
                continue;
            }

            var displayValues = rowCells.Select(rowCell =>
            {
                var rawText = rowCell.InnerText;
                return string.IsNullOrWhiteSpace(rawText) ? "[empty]" : rawText.Trim();
            });
            Console.WriteLine(string.Join(" | ", displayValues));
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine($"Error extracting table data: {ex.Message}");
    }
}
Key Benefits
- Robust HTML parsing: Handles malformed HTML gracefully
- Flexible selectors: XPath support built in; CSS selectors available through add-on packages
- In-memory DOM: Parses the whole document into a navigable tree, so memory use grows with document size; plan accordingly for very large pages
- No browser dependency: Pure .NET library, no need for browser automation
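- Usable in multi-threaded applications: Parse separate documents on separate threads, and avoid mutating a single HtmlDocument from several threads without synchronization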
Best Practices
- Always check for null values when selecting nodes
- Use specific selectors to avoid selecting unintended elements
- Handle empty cells and missing data gracefully
- Trim whitespace from extracted text
- Respect robots.txt and website terms of service
- Implement rate limiting to avoid overwhelming target servers