Can I use Html Agility Pack to extract data from tables in an HTML document?

Yes, Html Agility Pack is excellent for extracting data from HTML tables. This .NET library provides powerful DOM navigation, handles both well-formed and malformed HTML documents, and supports XPath queries out of the box (CSS selectors are available through add-on packages such as Fizzler).

Installation

Install Html Agility Pack via NuGet Package Manager:

Install-Package HtmlAgilityPack

Or using the .NET CLI:

dotnet add package HtmlAgilityPack

Basic Table Extraction

Here's a comprehensive example that demonstrates table data extraction:

using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Linq;

public class TableExtractor
{
    public static void Main(string[] args)
    {
        string htmlContent = @"
        <html>
            <body>
                <table id='employeeTable' class='data-table'>
                    <thead>
                        <tr>
                            <th>Name</th>
                            <th>Department</th>
                            <th>Salary</th>
                        </tr>
                    </thead>
                    <tbody>
                        <tr>
                            <td>John Doe</td>
                            <td>Engineering</td>
                            <td>$75,000</td>
                        </tr>
                        <tr>
                            <td>Jane Smith</td>
                            <td>Marketing</td>
                            <td>$65,000</td>
                        </tr>
                    </tbody>
                </table>
            </body>
        </html>";

        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(htmlContent);

        // Method 1: Basic extraction with XPath
        ExtractBasicTable(htmlDoc);

        // Method 2: Structured data extraction
        ExtractStructuredTable(htmlDoc);
    }

    static void ExtractBasicTable(HtmlDocument doc)
    {
        Console.WriteLine("=== Basic Table Extraction ===");

        var table = doc.DocumentNode.SelectSingleNode("//table[@id='employeeTable']");

        if (table != null)
        {
            var rows = table.SelectNodes(".//tr");

            foreach (var row in rows)
            {
                var cells = row.SelectNodes("td|th");
                if (cells != null)
                {
                    var rowData = cells.Select(cell => cell.InnerText.Trim());
                    Console.WriteLine(string.Join(" | ", rowData));
                }
            }
        }
    }

    static void ExtractStructuredTable(HtmlDocument doc)
    {
        Console.WriteLine("\n=== Structured Table Extraction ===");

        var table = doc.DocumentNode.SelectSingleNode("//table[@id='employeeTable']");

        if (table != null)
        {
            // Extract headers
            var headerRow = table.SelectSingleNode(".//thead/tr") ?? 
                           table.SelectSingleNode(".//tr[1]");
            var headers = headerRow?.SelectNodes("th|td")
                ?.Select(h => h.InnerText.Trim())
                .ToList() ?? new List<string>();

            Console.WriteLine($"Headers: {string.Join(", ", headers)}");

            // Extract data rows
            var dataRows = table.SelectNodes(".//tbody/tr") ?? 
                          table.SelectNodes(".//tr[position()>1]");

            if (dataRows != null)
            {
                foreach (var row in dataRows)
                {
                    var cells = row.SelectNodes("td");
                    if (cells != null)
                    {
                        for (int i = 0; i < Math.Min(headers.Count, cells.Count); i++)
                        {
                            Console.WriteLine($"{headers[i]}: {cells[i].InnerText.Trim()}");
                        }
                        Console.WriteLine("---");
                    }
                }
            }
        }
    }
}

Advanced Table Extraction Techniques

Converting to Data Objects

using HtmlAgilityPack;
using System.Collections.Generic;
using System.Linq;

public class Employee
{
    public string Name { get; set; }
    public string Department { get; set; }
    public string Salary { get; set; }
}

public static List<Employee> ExtractEmployees(HtmlDocument doc)
{
    var employees = new List<Employee>();
    var table = doc.DocumentNode.SelectSingleNode("//table[@id='employeeTable']");

    if (table != null)
    {
        var dataRows = table.SelectNodes(".//tbody/tr") ?? 
                      table.SelectNodes(".//tr[position()>1]");

        foreach (var row in dataRows ?? Enumerable.Empty<HtmlNode>())
        {
            var cells = row.SelectNodes("td");
            if (cells?.Count >= 3)
            {
                employees.Add(new Employee
                {
                    Name = cells[0].InnerText.Trim(),
                    Department = cells[1].InnerText.Trim(),
                    Salary = cells[2].InnerText.Trim()
                });
            }
        }
    }

    return employees;
}
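
With the helper in place, extraction reduces to a single call (htmlDoc here is the document loaded in the earlier example):

var employees = ExtractEmployees(htmlDoc);

foreach (var employee in employees)
{
    Console.WriteLine($"{employee.Name} ({employee.Department}): {employee.Salary}");
}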

Handling Complex Tables

using HtmlAgilityPack;
using System;
using System.Linq;

public static void ExtractComplexTable(HtmlDocument doc)
{
    // Handle tables with colspan/rowspan
    var table = doc.DocumentNode.SelectSingleNode("//table");

    if (table != null)
    {
        var rows = table.SelectNodes(".//tr");

        foreach (var row in rows ?? Enumerable.Empty<HtmlNode>())
        {
            var cells = row.SelectNodes("td|th");

            if (cells != null)
            {
                foreach (var cell in cells)
                {
                    var colspan = cell.GetAttributeValue("colspan", 1);
                    var rowspan = cell.GetAttributeValue("rowspan", 1);
                    var cellText = cell.InnerText.Trim();

                    Console.WriteLine($"Cell: {cellText} (colspan: {colspan}, rowspan: {rowspan})");
                }
            }
        }
    }
}
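
The snippet above only reports the span attributes. To line cells up by column, you can expand each spanned cell into a flat grid. Below is a minimal sketch, assuming only colspan needs normalizing (rowspan handling would additionally require carrying cells down into later rows):

using HtmlAgilityPack;
using System.Collections.Generic;
using System.Linq;

public static List<List<string>> ExpandColspans(HtmlNode table)
{
    var grid = new List<List<string>>();

    foreach (var row in table.SelectNodes(".//tr") ?? Enumerable.Empty<HtmlNode>())
    {
        var expandedRow = new List<string>();

        foreach (var cell in row.SelectNodes("td|th") ?? Enumerable.Empty<HtmlNode>())
        {
            var colspan = cell.GetAttributeValue("colspan", 1);
            var text = cell.InnerText.Trim();

            // Repeat the cell text once per spanned column so columns stay aligned
            for (int i = 0; i < colspan; i++)
            {
                expandedRow.Add(text);
            }
        }

        grid.Add(expandedRow);
    }

    return grid;
}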

Alternative Selection Methods

Using XPath Selectors

// XPath alternatives to common CSS-style selections
var tables = doc.DocumentNode.SelectNodes("//table[contains(@class, 'data-table')]");
var specificCells = doc.DocumentNode.SelectNodes("//table//td[contains(@class, 'highlight')]");
var nthColumn = doc.DocumentNode.SelectNodes("//table//tr/td[3]"); // 3rd column
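
Using CSS Selectors (with Fizzler)

Html Agility Pack does not support CSS selectors on its own, but the Fizzler.Systems.HtmlAgilityPack package adds QuerySelector/QuerySelectorAll extension methods to HtmlNode. A minimal sketch, assuming that package is installed (the selectors mirror the XPath above and the data-table class comes from the earlier sample):

using Fizzler.Systems.HtmlAgilityPack; // dotnet add package Fizzler.Systems.HtmlAgilityPack
using HtmlAgilityPack;
using System;
using System.Linq;

public static void PrintDataTables(HtmlDocument doc)
{
    // QuerySelectorAll takes a CSS selector instead of an XPath expression
    var tables = doc.DocumentNode.QuerySelectorAll("table.data-table");

    foreach (var table in tables)
    {
        foreach (var row in table.QuerySelectorAll("tr"))
        {
            var cells = row.QuerySelectorAll("td, th")
                .Select(cell => cell.InnerText.Trim());
            Console.WriteLine(string.Join(" | ", cells));
        }
    }
}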

Loading from Web

using HtmlAgilityPack;
using System.Net.Http;
using System.Threading.Tasks;

public static async Task<HtmlDocument> LoadTableFromWeb(string url)
{
    using var httpClient = new HttpClient();
    var html = await httpClient.GetStringAsync(url);

    var doc = new HtmlDocument();
    doc.LoadHtml(html);

    return doc;
}
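
Html Agility Pack also ships its own HtmlWeb helper, which downloads and parses a page in one call and avoids the HttpClient boilerplate (the URL below is a placeholder):

var web = new HtmlWeb();
var doc = web.Load("https://example.com/report"); // placeholder URL
var table = doc.DocumentNode.SelectSingleNode("//table");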

Error Handling and Best Practices

using HtmlAgilityPack;
using System;
using System.Linq;

public static void SafeTableExtraction(HtmlDocument doc)
{
    try
    {
        var table = doc.DocumentNode.SelectSingleNode("//table[@id='myTable']");

        if (table == null)
        {
            Console.WriteLine("Table not found");
            return;
        }

        var rows = table.SelectNodes(".//tr");

        if (rows == null || !rows.Any())
        {
            Console.WriteLine("No rows found in table");
            return;
        }

        foreach (var row in rows)
        {
            var cells = row.SelectNodes("td|th");

            if (cells != null)
            {
                var cellTexts = cells.Select(cell => 
                    string.IsNullOrWhiteSpace(cell.InnerText) ? 
                    "[empty]" : 
                    cell.InnerText.Trim());

                Console.WriteLine(string.Join(" | ", cellTexts));
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine($"Error extracting table data: {ex.Message}");
    }
}

Key Benefits

  • Robust HTML parsing: Handles malformed HTML gracefully
  • Flexible selectors: XPath built in, with CSS selectors available via add-on packages
  • LINQ support: Nodes can also be traversed with LINQ to Objects (Descendants, Elements, and similar methods)
  • No browser dependency: Pure .NET library, no need for browser automation
  • Per-instance concurrency: Each thread can safely parse its own HtmlDocument instance

Best Practices

  1. Always check for null values when selecting nodes
  2. Use specific selectors to avoid selecting unintended elements
  3. Handle empty cells and missing data gracefully
  4. Trim whitespace from extracted text
  5. Respect robots.txt and website terms of service
  6. Implement rate limiting to avoid overwhelming target servers (see the sketch below)
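
For point 6, here is a minimal rate-limiting sketch, assuming you are fetching several pages in a loop (the one-second delay is an arbitrary placeholder; tune it to the target site's policies):

using HtmlAgilityPack;
using System;
using System.Net.Http;
using System.Threading.Tasks;

public static class PoliteScraper
{
    public static async Task ScrapeTablesAsync(string[] urls)
    {
        using var httpClient = new HttpClient();

        foreach (var url in urls)
        {
            var html = await httpClient.GetStringAsync(url);

            var doc = new HtmlDocument();
            doc.LoadHtml(html);

            var rows = doc.DocumentNode.SelectNodes("//table//tr");
            Console.WriteLine($"{url}: {rows?.Count ?? 0} rows");

            // Pause between requests so the target server is not overwhelmed
            await Task.Delay(TimeSpan.FromSeconds(1));
        }
    }
}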
