How do I filter nodes by their attributes with Html Agility Pack?

Html Agility Pack provides powerful methods to filter HTML nodes based on their attributes. You can use either XPath expressions or LINQ queries to select elements that match specific attribute criteria.

Installation

First, install Html Agility Pack via NuGet:

Install-Package HtmlAgilityPack

Method 1: Using XPath Expressions

XPath is the most common approach for filtering nodes by attributes in Html Agility Pack.

Basic Attribute Filtering

using HtmlAgilityPack;
using System;

/// <summary>
/// Runs three basic XPath attribute filters against a small in-memory HTML
/// fragment and prints the nodes each one selects.
/// </summary>
class Program
{
    /// <summary>
    /// Parses the sample markup, evaluates the XPath queries and prints each result set.
    /// </summary>
    static void Main()
    {
        var markup = @"
            <div>
                <a href='https://example.com' title='Example Link'>Example</a>
                <a href='https://test.com' class='external'>Test</a>
                <img src='image.jpg' alt='Sample Image' class='responsive'>
                <button type='submit' disabled>Submit</button>
            </div>";

        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(markup);

        // Every query below starts from the document root.
        HtmlNode root = htmlDoc.DocumentNode;

        // [@title]: anchors that carry a title attribute, whatever its value
        HtmlNodeCollection linksWithTitle = root.SelectNodes("//a[@title]");

        // [@class='external']: anchors whose class attribute equals the given value
        HtmlNodeCollection externalLinks = root.SelectNodes("//a[@class='external']");

        // contains(@href, ...): anchors whose href includes the given text
        HtmlNodeCollection exampleLinks = root.SelectNodes("//a[contains(@href, 'example')]");

        PrintResults("Links with title:", linksWithTitle);
        PrintResults("External links:", externalLinks);
        PrintResults("Example links:", exampleLinks);
    }

    /// <summary>
    /// Writes a label, then the outer HTML of every node in the collection
    /// (or a placeholder line when the collection is null), then a blank line.
    /// </summary>
    /// <param name="label">Heading printed before the results.</param>
    /// <param name="nodes">Result of a SelectNodes call; may be null.</param>
    static void PrintResults(string label, HtmlNodeCollection nodes)
    {
        Console.WriteLine(label);

        if (nodes == null)
        {
            // A null collection is reported as "nothing matched".
            Console.WriteLine("  No matching nodes found.");
        }
        else
        {
            foreach (HtmlNode match in nodes)
            {
                Console.WriteLine("  " + match.OuterHtml);
            }
        }

        Console.WriteLine();
    }
}

Advanced XPath Filtering

// Require several attributes at once: anchors that have both href and title
var complexLinks = htmlDoc.DocumentNode
    .SelectNodes("//a[@href and @title]");

// Prefix test on an attribute value
var httpsLinks = htmlDoc.DocumentNode
    .SelectNodes("//a[starts-with(@href, 'https')]");

// Suffix test: XPath 1.0 has no ends-with(), so compare the last four
// characters of src (substring() positions are 1-based) against '.png'
var pngImages = htmlDoc.DocumentNode
    .SelectNodes("//img[substring(@src, string-length(@src) - 3) = '.png']");

// 'and' combines two predicates on the same element
var specificLinks = htmlDoc.DocumentNode
    .SelectNodes("//a[contains(@href, 'example') and starts-with(@title, 'Example')]");

// '|' unions the results of two independent paths
var mediaElements = htmlDoc.DocumentNode
    .SelectNodes("//img[@alt] | //video[@controls]");

// Existence checks on any element type ('*')
var elementsWithClass = htmlDoc.DocumentNode
    .SelectNodes("//*[@class]");
var disabledElements = htmlDoc.DocumentNode
    .SelectNodes("//*[@disabled]");

Common XPath Attribute Patterns

// Each pattern gets its own variable name so the snippet compiles when
// pasted into a single scope.

// Exact match
var mainDivs = htmlDoc.DocumentNode.SelectNodes("//div[@id='main']");

// Contains text (substring match on the raw attribute value)
var buttonLinks = htmlDoc.DocumentNode.SelectNodes("//a[contains(@class, 'btn')]");

// Starts with
var absoluteImages = htmlDoc.DocumentNode.SelectNodes("//img[starts-with(@src, 'http')]");

// Case-insensitive matching: translate() is an XPath 1.0 function, used here
// to lower-case the ASCII letters of href before comparing
var exampleLinksAnyCase = htmlDoc.DocumentNode.SelectNodes("//a[contains(translate(@href, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'example')]");

// Multiple class names: pad the class list with spaces so 'active' only
// matches as a whole token, not as part of another class name
var activeDivs = htmlDoc.DocumentNode.SelectNodes("//div[contains(concat(' ', @class, ' '), ' active ')]");

Method 2: Using LINQ Queries

LINQ provides a more .NET-native approach to filtering nodes:

using HtmlAgilityPack;
using System;
using System.Linq;

/// <summary>
/// Filters the nodes of a small in-memory HTML fragment by attribute using
/// LINQ over <c>Descendants()</c>, then prints how many nodes each filter kept.
/// </summary>
class Program
{
    /// <summary>
    /// Parses the sample markup, runs the LINQ filters and prints the match counts.
    /// </summary>
    static void Main()
    {
        var html = @"
            <div>
                <a href='https://example.com' title='Example Link' class='external'>Example</a>
                <a href='https://test.com' class='internal'>Test</a>
                <img src='image.jpg' alt='Sample Image' width='100'>
                <input type='text' name='username' required>
            </div>";

        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(html);

        // Basic attribute filtering: anchors that carry a title attribute
        var linksWithTitle = htmlDoc.DocumentNode
            .Descendants("a")
            .Where(a => a.Attributes["title"] != null)
            .ToList();

        // Attribute value filtering: class holds a whitespace-separated list of
        // tokens, so compare whole tokens. A plain substring test would also
        // match values such as "external-icon".
        var externalLinks = htmlDoc.DocumentNode
            .Descendants("a")
            .Where(a => a.GetAttributeValue("class", "")
                .Split((char[])null, StringSplitOptions.RemoveEmptyEntries)
                .Contains("external"))
            .ToList();

        // Complex conditions: https href AND a non-empty class attribute.
        // Ordinal comparison keeps the prefix test independent of the current culture.
        var httpsLinksWithClass = htmlDoc.DocumentNode
            .Descendants("a")
            .Where(a => a.GetAttributeValue("href", "").StartsWith("https", StringComparison.Ordinal)
                     && !string.IsNullOrEmpty(a.GetAttributeValue("class", "")))
            .ToList();

        // Multiple element types: any node carrying required or disabled
        var elementsWithRequiredAttr = htmlDoc.DocumentNode
            .Descendants()
            .Where(n => n.Attributes["required"] != null || n.Attributes["disabled"] != null)
            .ToList();

        Console.WriteLine($"Links with title: {linksWithTitle.Count}");
        Console.WriteLine($"External links: {externalLinks.Count}");
        Console.WriteLine($"HTTPS links with class: {httpsLinksWithClass.Count}");
        Console.WriteLine($"Required/disabled elements: {elementsWithRequiredAttr.Count}");
    }
}

Advanced LINQ Filtering

// Custom attribute validation: keep images whose src and alt are both non-empty
var validImages = htmlDoc.DocumentNode
    .Descendants("img")
    .Where(img => !(string.IsNullOrEmpty(img.GetAttributeValue("src", ""))
                    || string.IsNullOrEmpty(img.GetAttributeValue("alt", ""))))
    .ToList();

// Filtering by multiple attributes: anchors with a title or an aria-label
var accessibleLinks = htmlDoc.DocumentNode
    .Descendants("a")
    .Where(a => a.Attributes.Any(attr => attr.Name == "title")
             || a.Attributes.Any(attr => attr.Name == "aria-label"))
    .ToList();

// Numeric attribute comparisons: a missing width falls back to "0", and a
// value that does not parse as an integer excludes the image
var largeImages = htmlDoc.DocumentNode
    .Descendants("img")
    .Where(img => int.TryParse(img.GetAttributeValue("width", "0"), out var pixels)
               && pixels > 200)
    .ToList();

Helper Methods for Cleaner Code

/// <summary>
/// Convenience extension methods for testing the attributes of an <see cref="HtmlNode"/>.
/// </summary>
public static class HtmlNodeExtensions
{
    /// <summary>
    /// Reports whether the node carries the named attribute, regardless of its value.
    /// </summary>
    /// <param name="node">Node to inspect.</param>
    /// <param name="attributeName">Name of the attribute to look up.</param>
    /// <returns><c>true</c> when the attribute is present; otherwise <c>false</c>.</returns>
    public static bool HasAttribute(this HtmlNode node, string attributeName)
    {
        return node.Attributes[attributeName] != null;
    }

    /// <summary>
    /// Reports whether the named attribute is present and its value equals
    /// <paramref name="value"/> exactly (case-sensitive).
    /// </summary>
    /// <param name="node">Node to inspect.</param>
    /// <param name="attributeName">Name of the attribute to look up.</param>
    /// <param name="value">Value the attribute must equal.</param>
    /// <returns><c>true</c> when the attribute exists with that exact value; otherwise <c>false</c>.</returns>
    public static bool HasAttributeValue(this HtmlNode node, string attributeName, string value)
    {
        var attr = node.Attributes[attributeName];
        return attr != null && attr.Value == value;
    }

    /// <summary>
    /// Reports whether the named attribute is present and its value contains
    /// <paramref name="searchText"/>, ignoring case.
    /// </summary>
    /// <param name="node">Node to inspect.</param>
    /// <param name="attributeName">Name of the attribute to look up.</param>
    /// <param name="searchText">Text to search for inside the attribute value.</param>
    /// <returns><c>true</c> when the attribute exists and contains the text; otherwise <c>false</c>.</returns>
    public static bool AttributeContains(this HtmlNode node, string attributeName, string searchText)
    {
        var attr = node.Attributes[attributeName];

        // IndexOf(string, StringComparison) is used instead of
        // Contains(string, StringComparison) because the latter is not
        // available on .NET Framework or .NET Standard 2.0.
        return attr != null && attr.Value.IndexOf(searchText, StringComparison.OrdinalIgnoreCase) >= 0;
    }
}

// Usage: the helpers are called like ordinary HtmlNode members

// Every node that has an id attribute
var nodesWithId =
    (from node in htmlDoc.DocumentNode.Descendants()
     where node.HasAttribute("id")
     select node).ToList();

// <button> elements whose type attribute is exactly "submit"
var submitButtons =
    (from button in htmlDoc.DocumentNode.Descendants("button")
     where button.HasAttributeValue("type", "submit")
     select button).ToList();

Best Practices

  1. Null checking: SelectNodes() returns null (not an empty collection) when nothing matches, so always check the result before iterating
  2. Performance: Use specific selectors instead of broad queries when possible
  3. Case sensitivity: HTML attribute names are case-insensitive, but XPath string comparisons on attribute values are case-sensitive; use translate() when you need a case-insensitive match
  4. Escape special characters: Use proper escaping for attribute values containing quotes
  5. Use GetAttributeValue(): This method provides default values and handles missing attributes gracefully

Common Use Cases

// Extract all external links: absolute URLs that do not mention your own domain
const string externalLinkXPath =
    "//a[starts-with(@href, 'http') and not(contains(@href, 'yourdomain.com'))]";
var externalLinks = htmlDoc.DocumentNode.SelectNodes(externalLinkXPath);

// Accessibility check: images whose alt attribute is missing or empty
const string missingAltXPath = "//img[not(@alt) or @alt='']";
var inaccessibleImages = htmlDoc.DocumentNode.SelectNodes(missingAltXPath);

// Required form fields across input, textarea and select elements
const string requiredFieldXPath =
    "//input[@required] | //textarea[@required] | //select[@required]";
var requiredFields = htmlDoc.DocumentNode.SelectNodes(requiredFieldXPath);

// Elements whose class list contains 'highlight' as a whole token
const string highlightXPath =
    "//*[contains(concat(' ', @class, ' '), ' highlight ')]";
var highlightedElements = htmlDoc.DocumentNode.SelectNodes(highlightXPath);

Both XPath and LINQ approaches are powerful for filtering nodes by attributes. Choose XPath for complex queries and LINQ for better integration with .NET code and when you need additional processing logic.

Related Questions

Get Started Now

WebScraping.AI provides rotating proxies, Chromium rendering and built-in HTML parser for web scraping
Icon