How do I use Html Agility Pack to clean up HTML content?

Html Agility Pack (HAP) is a powerful .NET library for parsing and manipulating HTML documents. While primarily designed for HTML parsing, it excels at cleaning and sanitizing HTML content by removing unwanted elements, fixing malformed markup, and standardizing output.

Installation

Install Html Agility Pack via NuGet Package Manager:

Install-Package HtmlAgilityPack

Or via .NET CLI:

dotnet add package HtmlAgilityPack

Basic HTML Cleaning Setup

using HtmlAgilityPack;
using System;
using System.IO;
using System.Linq;

// Holds an HtmlDocument built from an HTML string and hands back its serialized form.
public class HtmlCleaner
{
    private readonly HtmlDocument _document;

    // Creates the document, applies the three options below, and only then loads the markup.
    public HtmlCleaner(string html)
    {
        _document = new HtmlDocument
        {
            OptionFixNestedTags = true,
            OptionAutoCloseOnEnd = true,
            OptionWriteEmptyNodes = true,
        };

        _document.LoadHtml(html);
    }

    // Serializes the document by returning the outer HTML of its root node.
    public string GetCleanedHtml() => _document.DocumentNode.OuterHtml;
}

Common HTML Cleaning Operations

1. Remove Unwanted Tags

Remove potentially harmful or unwanted elements:

// Removes every script, style, meta, link and title element from the document.
public void RemoveUnwantedTags()
{
    string[] tagsToStrip = { "script", "style", "meta", "link", "title" };

    foreach (var tagName in tagsToStrip)
    {
        // Snapshot the matches first: removing nodes while enumerating
        // Descendants() would modify the tree being walked.
        HtmlNode[] matches = _document.DocumentNode
            .Descendants(tagName)
            .ToArray();

        for (var i = 0; i < matches.Length; i++)
        {
            matches[i].Remove();
        }
    }
}

// Alternative LINQ approach
// Removes the same elements as RemoveUnwantedTags (script, style, meta, link, title),
// expressed as a single LINQ pipeline over all descendants.
public void RemoveUnwantedTagsLinq()
{
    // Built once, outside the predicate, so no array is allocated per visited node.
    // "title" is included so both variants strip the same set of tags.
    var unwantedTags = new[] { "script", "style", "meta", "link", "title" };

    _document.DocumentNode.Descendants()
        .Where(n => unwantedTags.Contains(n.Name))
        .ToList() // materialize before mutating the tree
        .ForEach(n => n.Remove());
}

2. Remove Attributes

Clean up HTML by removing specific attributes:

// Strips the style attribute and a fixed set of event-handler attributes
// (onclick, onload, onmouseover, onfocus) from every node in the document.
public void RemoveAttributes()
{
    // Inline styles go first, then each event handler in turn.
    StripAttribute("style");

    foreach (var handlerName in new[] { "onclick", "onload", "onmouseover", "onfocus" })
    {
        StripAttribute(handlerName);
    }

    // Finds every node carrying the named attribute and removes that attribute.
    void StripAttribute(string attributeName)
    {
        var carriers = _document.DocumentNode
            .Descendants()
            .Where(n => n.Attributes[attributeName] != null)
            .ToList();

        carriers.ForEach(n => n.Attributes[attributeName].Remove());
    }
}

3. Sanitize Content

Remove potentially dangerous content while preserving structure:

// Unwraps every element whose tag is not on the allow-list: the element itself is
// dropped and its child nodes take its place in the parent.
//
// The children are moved, not re-parsed from InnerHtml. Re-parsing creates fresh nodes
// that this pass never inspects, so a disallowed tag nested inside another disallowed
// tag would survive; it also breaks on empty elements and on elements with several
// children. Moving the original nodes keeps them in the snapshot below, so each one
// is still checked against the allow-list.
//
// NOTE(review): text inside an unwrapped element is kept as plain text. That includes
// the source of a script or style element, so remove those tags outright first if
// their content should disappear.
public void SanitizeContent()
{
    // Allow only specific tags
    var allowedTags = new[] { "p", "div", "span", "h1", "h2", "h3", "h4", "h5", "h6",
                             "strong", "em", "ul", "ol", "li", "a", "img", "br" };

    // Snapshot the nodes up front because the loop restructures the tree.
    var allNodes = _document.DocumentNode.Descendants().ToList();

    foreach (var node in allNodes)
    {
        if (node.NodeType != HtmlNodeType.Element ||
            allowedTags.Contains(node.Name, StringComparer.OrdinalIgnoreCase))
        {
            continue;
        }

        // keepGrandChildren: true re-attaches the node's children to its parent
        // before the node itself is removed.
        node.ParentNode?.RemoveChild(node, true);
    }
}

4. Clean Empty Elements

Remove empty tags that serve no purpose:

// Repeatedly removes childless elements with no visible text until a pass finds none.
// Removing an element can leave its parent empty, hence the repeated passes.
public void RemoveEmptyElements()
{
    // Elements that are legitimately empty and must be kept.
    string[] keepEvenIfEmpty = { "br", "hr", "img", "input" };

    bool IsRemovable(HtmlNode n)
    {
        if (n.NodeType != HtmlNodeType.Element)
        {
            return false;
        }

        if (!string.IsNullOrWhiteSpace(n.InnerText) || n.HasChildNodes)
        {
            return false;
        }

        return !keepEvenIfEmpty.Contains(n.Name.ToLower());
    }

    while (true)
    {
        var emptyNodes = _document.DocumentNode.Descendants()
            .Where(IsRemovable)
            .ToList();

        if (emptyNodes.Count == 0)
        {
            break;
        }

        emptyNodes.ForEach(n => n.Remove());
    }
}

5. Fix Malformed HTML

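Configure HAP to automatically fix common HTML issues. Set these options before loading the document, because parsing options only apply to HTML loaded afterwards: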

// Switches on the document's fix-up and syntax-check options and keeps HTML (not XML) output.
// NOTE(review): the options read while parsing presumably only affect documents
// loaded after this call; confirm against the call order.
public void ConfigureFixingOptions()
{
    // Report syntax problems such as unclosed nodes.
    _document.OptionCheckSyntax = true;

    // Repair incorrectly nested tags.
    _document.OptionFixNestedTags = true;

    // Close still-open tags at the end of the document.
    _document.OptionAutoCloseOnEnd = true;

    // Serialize empty nodes in self-closing form.
    _document.OptionWriteEmptyNodes = true;

    // Produce HTML output; XML-conformant output stays off.
    _document.OptionOutputAsXml = false;
}

6. Decode HTML Entities

Convert HTML entities to their character equivalents:

// Decodes HTML entities in text nodes (e.g. "&eacute;" becomes "é") while keeping the
// three markup-significant characters escaped.
//
// Writing fully decoded text back would turn "&lt;script&gt;" into a real "<script>" tag
// in the serialized output, re-introducing markup that earlier cleaning steps removed.
// Re-escaping "&", "<" and ">" after decoding keeps the text inert.
public void DecodeHtmlEntities()
{
    var textNodes = _document.DocumentNode.Descendants()
        .Where(n => n.NodeType == HtmlNodeType.Text)
        .ToList();

    foreach (var textNode in textNodes)
    {
        // script/style bodies are raw text, not entity-encoded content; rewriting
        // them would corrupt the code or CSS they contain.
        var parentName = textNode.ParentNode?.Name;
        if (string.Equals(parentName, "script", StringComparison.OrdinalIgnoreCase) ||
            string.Equals(parentName, "style", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        var decoded = HtmlEntity.DeEntitize(textNode.InnerHtml);

        // "&" must be escaped first so the "&" of the escapes added next is not doubled.
        textNode.InnerHtml = decoded
            .Replace("&", "&amp;")
            .Replace("<", "&lt;")
            .Replace(">", "&gt;");
    }
}

Complete HTML Cleaning Example

Here's a comprehensive example that combines multiple cleaning operations:

// Combines several cleaning passes into one static entry point.
public class ComprehensiveHtmlCleaner
{
    // Parses dirtyHtml, removes dangerous tags and unwanted attributes, drops empty
    // elements, decodes entities, and returns the serialized result.
    public static string CleanHtml(string dirtyHtml)
    {
        var doc = new HtmlDocument();

        // Configure options (before loading, so they apply to this parse)
        doc.OptionFixNestedTags = true;
        doc.OptionAutoCloseOnEnd = true;
        doc.OptionWriteEmptyNodes = true;

        doc.LoadHtml(dirtyHtml);

        // Remove dangerous tags
        RemoveDangerousTags(doc);

        // Remove unwanted attributes
        RemoveUnwantedAttributes(doc);

        // Clean empty elements
        RemoveEmptyElements(doc);

        // Decode entities
        DecodeEntities(doc);

        return doc.DocumentNode.OuterHtml;
    }

    // Removes script, style, embedding and form-related elements together with their content.
    private static void RemoveDangerousTags(HtmlDocument doc)
    {
        var dangerousTags = new[] { "script", "style", "iframe", "object",
                                   "embed", "form", "input", "button" };

        foreach (var tag in dangerousTags)
        {
            // Materialize before removing so the tree is not modified mid-enumeration.
            doc.DocumentNode.Descendants(tag)
                .ToList()
                .ForEach(n => n.Remove());
        }
    }

    // Removes the style attribute and every event-handler attribute (any name starting
    // with "on") from all nodes.
    //
    // A fixed list of handler names misses the many other on* events (onmouseout,
    // onsubmit, ontoggle, ...), so the prefix is matched instead. The attribute
    // collection is walked directly so that repeated attributes with the same name are
    // all removed, not just the one a by-name lookup returns.
    // NOTE(review): this also drops non-standard attributes whose name merely starts
    // with "on"; confirm that is acceptable for your content.
    private static void RemoveUnwantedAttributes(HtmlDocument doc)
    {
        foreach (var node in doc.DocumentNode.Descendants().ToList())
        {
            if (!node.HasAttributes)
            {
                continue;
            }

            var unwanted = node.Attributes
                .Where(a => a.Name.Equals("style", StringComparison.OrdinalIgnoreCase) ||
                            a.Name.StartsWith("on", StringComparison.OrdinalIgnoreCase))
                .ToList();

            foreach (var attribute in unwanted)
            {
                attribute.Remove();
            }
        }
    }

    // ... other helper methods
}

// Usage: a div with an inline handler and style, wrapping a script element and some text
string dirtyHtml = string.Concat(
    "<div onclick='alert(\"xss\")' style='color:red;'>",
    "<script>alert('malicious');</script>",
    "Hello <strong>World</strong>!</div>");

string cleanHtml = ComprehensiveHtmlCleaner.CleanHtml(dirtyHtml);

Console.WriteLine(cleanHtml);
// Output: <div>Hello <strong>World</strong>!</div>

Loading HTML from Different Sources

// From string
var doc = new HtmlDocument();
doc.LoadHtml(htmlString);

// From file
doc.Load(@"C:\path\to\file.html");

// From URL with HtmlWeb
var web = new HtmlWeb();
doc = web.Load("https://example.com");

// From stream
// File.OpenRead requests read-only access. new FileStream(path, FileMode.Open) asks for
// read/write access and therefore fails on read-only files even though nothing is written.
using var stream = File.OpenRead("file.html");
doc.Load(stream);

Error Handling and Validation

// Parses html with syntax checking enabled, prints any parse errors to the console,
// and returns the serialized document through cleanedHtml.
// Returns true on success. If any exception is thrown, its message is printed, false is
// returned, and cleanedHtml is left as the empty string.
public bool ValidateAndCleanHtml(string html, out string cleanedHtml)
{
    cleanedHtml = string.Empty;

    try
    {
        var doc = new HtmlDocument { OptionCheckSyntax = true };
        doc.LoadHtml(html);

        // Parse errors are reported but do not abort the cleaning.
        var parseErrors = doc.ParseErrors.ToList();
        if (parseErrors.Count > 0)
        {
            Console.WriteLine("HTML parsing errors found:");
            parseErrors.ForEach(e => Console.WriteLine($"Line {e.Line}: {e.Reason}"));
        }

        // Perform cleaning operations
        // ... cleaning code here ...

        cleanedHtml = doc.DocumentNode.OuterHtml;
    }
    catch (Exception ex)
    {
        Console.WriteLine($"Error cleaning HTML: {ex.Message}");
        return false;
    }

    return true;
}

Best Practices

  1. Always validate input: Check for null or empty strings before processing
  2. Configure parsing options: Set appropriate options before loading HTML
  3. Handle encoding: Ensure proper character encoding for international content
  4. Test thoroughly: Validate cleaning results with various HTML inputs
  5. Performance considerations: For large documents, consider streaming or chunked processing
  6. Security first: Always sanitize user-generated HTML content

Html Agility Pack provides robust HTML cleaning capabilities while maintaining flexibility for different cleaning requirements. Combine multiple techniques based on your specific use case and security needs.

Related Questions

Get Started Now

WebScraping.AI provides rotating proxies, Chromium rendering, and a built-in HTML parser for web scraping.
Icon