Can I use Html Agility Pack to extract data from tables in an HTML document?

Yes, Html Agility Pack is excellent for extracting data from HTML tables. This .NET library provides powerful DOM navigation, handles both well-formed and malformed HTML documents, and supports XPath queries out of the box (CSS selectors are available through add-on packages such as Fizzler).

Installation

Install Html Agility Pack via NuGet Package Manager:

Install-Package HtmlAgilityPack

Or using the .NET CLI:

dotnet add package HtmlAgilityPack

Basic Table Extraction

Here's a comprehensive example that demonstrates table data extraction:

using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Linq;

public class TableExtractor
{
    public static void Main(string[] args)
    {
        string htmlContent = @"
        <html>
            <body>
                <table id='employeeTable' class='data-table'>
                    <thead>
                        <tr>
                            <th>Name</th>
                            <th>Department</th>
                            <th>Salary</th>
                        </tr>
                    </thead>
                    <tbody>
                        <tr>
                            <td>John Doe</td>
                            <td>Engineering</td>
                            <td>$75,000</td>
                        </tr>
                        <tr>
                            <td>Jane Smith</td>
                            <td>Marketing</td>
                            <td>$65,000</td>
                        </tr>
                    </tbody>
                </table>
            </body>
        </html>";

        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(htmlContent);

        // Method 1: Basic extraction with XPath
        ExtractBasicTable(htmlDoc);

        // Method 2: Structured data extraction
        ExtractStructuredTable(htmlDoc);
    }

    static void ExtractBasicTable(HtmlDocument doc)
    {
        Console.WriteLine("=== Basic Table Extraction ===");

        var table = doc.DocumentNode.SelectSingleNode("//table[@id='employeeTable']");

        if (table != null)
        {
            var rows = table.SelectNodes(".//tr");

            foreach (var row in rows)
            {
                var cells = row.SelectNodes("td|th");
                if (cells != null)
                {
                    var rowData = cells.Select(cell => cell.InnerText.Trim());
                    Console.WriteLine(string.Join(" | ", rowData));
                }
            }
        }
    }

    static void ExtractStructuredTable(HtmlDocument doc)
    {
        Console.WriteLine("\n=== Structured Table Extraction ===");

        var table = doc.DocumentNode.SelectSingleNode("//table[@id='employeeTable']");

        if (table != null)
        {
            // Extract headers
            var headerRow = table.SelectSingleNode(".//thead/tr") ?? 
                           table.SelectSingleNode(".//tr[1]");
            var headers = headerRow?.SelectNodes("th|td")
                ?.Select(h => h.InnerText.Trim())
                .ToList() ?? new List<string>();

            Console.WriteLine($"Headers: {string.Join(", ", headers)}");

            // Extract data rows
            var dataRows = table.SelectNodes(".//tbody/tr") ?? 
                          table.SelectNodes(".//tr[position()>1]");

            if (dataRows != null)
            {
                foreach (var row in dataRows)
                {
                    var cells = row.SelectNodes("td");
                    if (cells != null)
                    {
                        for (int i = 0; i < Math.Min(headers.Count, cells.Count); i++)
                        {
                            Console.WriteLine($"{headers[i]}: {cells[i].InnerText.Trim()}");
                        }
                        Console.WriteLine("---");
                    }
                }
            }
        }
    }
}

Advanced Table Extraction Techniques

Converting to Data Objects

using HtmlAgilityPack;
using System.Collections.Generic;
using System.Linq;

public class Employee
{
    public string Name { get; set; }
    public string Department { get; set; }
    public string Salary { get; set; }
}

public static List<Employee> ExtractEmployees(HtmlDocument doc)
{
    var employees = new List<Employee>();
    var table = doc.DocumentNode.SelectSingleNode("//table[@id='employeeTable']");

    if (table != null)
    {
        var dataRows = table.SelectNodes(".//tbody/tr") ?? 
                      table.SelectNodes(".//tr[position()>1]");

        foreach (var row in dataRows ?? Enumerable.Empty<HtmlNode>())
        {
            var cells = row.SelectNodes("td");
            if (cells?.Count >= 3)
            {
                employees.Add(new Employee
                {
                    Name = cells[0].InnerText.Trim(),
                    Department = cells[1].InnerText.Trim(),
                    Salary = cells[2].InnerText.Trim()
                });
            }
        }
    }

    return employees;
}
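
With the helper in place, extraction reduces to a single call (htmlDoc here is the document loaded in the earlier example):

var employees = ExtractEmployees(htmlDoc);

foreach (var employee in employees)
{
    Console.WriteLine($"{employee.Name} ({employee.Department}): {employee.Salary}");
}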

Handling Complex Tables

using HtmlAgilityPack;
using System;
using System.Linq;

public static void ExtractComplexTable(HtmlDocument doc)
{
    // Handle tables with colspan/rowspan
    var table = doc.DocumentNode.SelectSingleNode("//table");

    if (table != null)
    {
        var rows = table.SelectNodes(".//tr");

        foreach (var row in rows ?? Enumerable.Empty<HtmlNode>())
        {
            var cells = row.SelectNodes("td|th");

            if (cells != null)
            {
                foreach (var cell in cells)
                {
                    var colspan = cell.GetAttributeValue("colspan", 1);
                    var rowspan = cell.GetAttributeValue("rowspan", 1);
                    var cellText = cell.InnerText.Trim();

                    Console.WriteLine($"Cell: {cellText} (colspan: {colspan}, rowspan: {rowspan})");
                }
            }
        }
    }
}
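
The snippet above only reports the span attributes. To line cells up by column, you can expand each spanned cell into a flat grid. Below is a minimal sketch, assuming only colspan needs normalizing (rowspan handling would additionally require carrying cells down into later rows):

using HtmlAgilityPack;
using System.Collections.Generic;
using System.Linq;

public static List<List<string>> ExpandColspans(HtmlNode table)
{
    var grid = new List<List<string>>();

    foreach (var row in table.SelectNodes(".//tr") ?? Enumerable.Empty<HtmlNode>())
    {
        var expandedRow = new List<string>();

        foreach (var cell in row.SelectNodes("td|th") ?? Enumerable.Empty<HtmlNode>())
        {
            var colspan = cell.GetAttributeValue("colspan", 1);
            var text = cell.InnerText.Trim();

            // Repeat the cell text once per spanned column so columns stay aligned
            for (int i = 0; i < colspan; i++)
            {
                expandedRow.Add(text);
            }
        }

        grid.Add(expandedRow);
    }

    return grid;
}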

Alternative Selection Methods

Using XPath Selectors

// XPath alternatives to common CSS-style selections
var tables = doc.DocumentNode.SelectNodes("//table[contains(@class, 'data-table')]");
var specificCells = doc.DocumentNode.SelectNodes("//table//td[contains(@class, 'highlight')]");
var nthColumn = doc.DocumentNode.SelectNodes("//table//tr/td[3]"); // 3rd column
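
Using CSS Selectors (with Fizzler)

Html Agility Pack does not support CSS selectors on its own, but the Fizzler.Systems.HtmlAgilityPack package adds QuerySelector/QuerySelectorAll extension methods to HtmlNode. A minimal sketch, assuming that package is installed (the selectors mirror the XPath above and the data-table class comes from the earlier sample):

using Fizzler.Systems.HtmlAgilityPack; // dotnet add package Fizzler.Systems.HtmlAgilityPack
using HtmlAgilityPack;
using System;
using System.Linq;

public static void PrintDataTables(HtmlDocument doc)
{
    // QuerySelectorAll takes a CSS selector instead of an XPath expression
    var tables = doc.DocumentNode.QuerySelectorAll("table.data-table");

    foreach (var table in tables)
    {
        foreach (var row in table.QuerySelectorAll("tr"))
        {
            var cells = row.QuerySelectorAll("td, th")
                .Select(cell => cell.InnerText.Trim());
            Console.WriteLine(string.Join(" | ", cells));
        }
    }
}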

Loading from Web

using HtmlAgilityPack;
using System.Net.Http;
using System.Threading.Tasks;

public static async Task<HtmlDocument> LoadTableFromWeb(string url)
{
    using var httpClient = new HttpClient();
    var html = await httpClient.GetStringAsync(url);

    var doc = new HtmlDocument();
    doc.LoadHtml(html);

    return doc;
}
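
Html Agility Pack also ships its own HtmlWeb helper, which downloads and parses a page in one call and avoids the HttpClient boilerplate (the URL below is a placeholder):

var web = new HtmlWeb();
var doc = web.Load("https://example.com/report"); // placeholder URL
var table = doc.DocumentNode.SelectSingleNode("//table");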

Error Handling and Best Practices

using HtmlAgilityPack;
using System;
using System.Linq;

public static void SafeTableExtraction(HtmlDocument doc)
{
    try
    {
        var table = doc.DocumentNode.SelectSingleNode("//table[@id='myTable']");

        if (table == null)
        {
            Console.WriteLine("Table not found");
            return;
        }

        var rows = table.SelectNodes(".//tr");

        if (rows == null || !rows.Any())
        {
            Console.WriteLine("No rows found in table");
            return;
        }

        foreach (var row in rows)
        {
            var cells = row.SelectNodes("td|th");

            if (cells != null)
            {
                var cellTexts = cells.Select(cell => 
                    string.IsNullOrWhiteSpace(cell.InnerText) ? 
                    "[empty]" : 
                    cell.InnerText.Trim());

                Console.WriteLine(string.Join(" | ", cellTexts));
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine($"Error extracting table data: {ex.Message}");
    }
}

Key Benefits

  • Robust HTML parsing: Handles malformed HTML gracefully
  • Flexible selectors: XPath built in, with CSS selectors available via add-on packages
  • LINQ support: Nodes can also be traversed with LINQ to Objects (Descendants, Elements, and similar methods)
  • No browser dependency: Pure .NET library, no need for browser automation
  • Per-instance concurrency: Each thread can safely parse its own HtmlDocument instance

Best Practices

  1. Always check for null values when selecting nodes
  2. Use specific selectors to avoid selecting unintended elements
  3. Handle empty cells and missing data gracefully
  4. Trim whitespace from extracted text
  5. Respect robots.txt and website terms of service
  6. Implement rate limiting to avoid overwhelming target servers (see the sketch below)
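
For point 6, here is a minimal rate-limiting sketch, assuming you are fetching several pages in a loop (the one-second delay is an arbitrary placeholder; tune it to the target site's policies):

using HtmlAgilityPack;
using System;
using System.Net.Http;
using System.Threading.Tasks;

public static class PoliteScraper
{
    public static async Task ScrapeTablesAsync(string[] urls)
    {
        using var httpClient = new HttpClient();

        foreach (var url in urls)
        {
            var html = await httpClient.GetStringAsync(url);

            var doc = new HtmlDocument();
            doc.LoadHtml(html);

            var rows = doc.DocumentNode.SelectNodes("//table//tr");
            Console.WriteLine($"{url}: {rows?.Count ?? 0} rows");

            // Pause between requests so the target server is not overwhelmed
            await Task.Delay(TimeSpan.FromSeconds(1));
        }
    }
}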
