Html Agility Pack (HAP) is a powerful .NET library for parsing and manipulating HTML documents, making it essential for web scraping tasks. To update text within an element, you need to locate the target element and modify its text properties.
Installation
Install Html Agility Pack via NuGet Package Manager:
Install-Package HtmlAgilityPack
Or using .NET CLI:
dotnet add package HtmlAgilityPack
Basic Text Update Methods
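1. Replacing Text Content (InnerText Is Read-Only)
The `InnerText` property is read-only in Html Agility Pack: it returns an element's text content but cannot be assigned. To replace an element's text, HTML-encode the new value and assign it to `InnerHtml`: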
using System;
using HtmlAgilityPack;
/// <summary>
/// Demonstrates replacing the text of elements located by id and by class attribute.
/// </summary>
class Program
{
    /// <summary>
    /// Parses a small HTML document, replaces the text of two elements, and prints the result.
    /// </summary>
    static void Main()
    {
        var html = @"<html>
<body>
<p id='greeting'>Hello World</p>
<div class='content'>Original content</div>
</body>
</html>";

        // Load HTML document
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);

        // Update text by ID
        HtmlNode paragraph = doc.DocumentNode.SelectSingleNode("//p[@id='greeting']");
        if (paragraph != null)
        {
            // HtmlNode.InnerText has no setter, so plain text is written through
            // InnerHtml after HTML-encoding it.
            paragraph.InnerHtml = System.Net.WebUtility.HtmlEncode("Hello Html Agility Pack!");
        }

        // Update text by class
        HtmlNode contentDiv = doc.DocumentNode.SelectSingleNode("//div[@class='content']");
        if (contentDiv != null)
        {
            // Encoding turns "<special>" into "&lt;special&gt;", so it stays literal text
            // instead of being parsed as a tag.
            contentDiv.InnerHtml = System.Net.WebUtility.HtmlEncode("Updated content with <special> characters");
        }

        Console.WriteLine(doc.DocumentNode.OuterHtml);
    }
}
2. Using InnerHtml Property
The `InnerHtml` property gets or sets an element's content as markup; an assigned string is parsed as HTML:
// Replace the children of the first matching div with parsed markup
const string richContent = "<strong>Bold text</strong> with <em>emphasis</em>";
var element = doc.DocumentNode.SelectSingleNode("//div[@class='content']");
if (element != null)
{
    // The string is parsed as HTML, so <strong> and <em> become real child elements
    element.InnerHtml = richContent;
}
Multiple Element Selection Methods
XPath Selectors
// Every query below starts from the document root node
var root = doc.DocumentNode;

// Tag name: every <p> element
var allParagraphs = root.SelectNodes("//p");

// Attribute value: <div> elements whose class attribute is exactly "highlight"
var elementsWithClass = root.SelectNodes("//div[@class='highlight']");

// Text content: <p> elements with a text node equal to "specific text"
var specificText = root.SelectNodes("//p[text()='specific text']");

// Combined: <p> elements that have an id and sit anywhere inside div.container
var nestedElements = root.SelectNodes("//div[@class='container']//p[@id]");
LINQ and DOM-Style Selection (Without XPath)
// Look up a single element by its id attribute, without writing XPath
var elementById = doc.GetElementbyId("greeting");

// Enumerate every <p> descendant of the document root
var paragraphs = doc.DocumentNode.Descendants("p");

// Keep only nodes whose title attribute resolves to a non-empty value
// ("" is the fallback returned when the attribute is missing)
var elementsWithTitle =
    from n in doc.DocumentNode.Descendants()
    where n.GetAttributeValue("title", "") != ""
    select n;
Advanced Text Update Scenarios
Updating Multiple Elements
// Prefix the text of every paragraph with "Updated: "
var paragraphs = doc.DocumentNode.SelectNodes("//p");
if (paragraphs != null)
{
    foreach (HtmlNode p in paragraphs)
    {
        // InnerText is read-only and returns entities undecoded (e.g. "&amp;"), so
        // decode it first; otherwise re-encoding would double-encode existing entities.
        string currentText = System.Net.WebUtility.HtmlDecode(p.InnerText);

        // Write the new plain text back through InnerHtml, HTML-encoded.
        p.InnerHtml = System.Net.WebUtility.HtmlEncode($"Updated: {currentText}");
    }
}
// Number every h1/h2/h3 heading, in the order SelectNodes returns them
var headings = doc.DocumentNode.SelectNodes("//h1 | //h2 | //h3");
if (headings != null)
{
    for (int i = 0; i < headings.Count; i++)
    {
        // InnerText is read-only and returns entities undecoded, so decode before
        // building the new text and encode again when writing it through InnerHtml.
        string headingText = System.Net.WebUtility.HtmlDecode(headings[i].InnerText);
        headings[i].InnerHtml = System.Net.WebUtility.HtmlEncode($"{i + 1}. {headingText}");
    }
}
Conditional Updates
// Rewrite price spans that contain a dollar sign; leave all others untouched
var elements = doc.DocumentNode.SelectNodes("//span[@class='price']");
if (elements != null)
{
    foreach (HtmlNode element in elements)
    {
        // Work on the decoded text so the check and the replacement see real characters.
        string priceText = System.Net.WebUtility.HtmlDecode(element.InnerText);
        if (priceText.Contains("$"))
        {
            // InnerText cannot be assigned; encode the result and set InnerHtml instead.
            element.InnerHtml = System.Net.WebUtility.HtmlEncode(priceText.Replace("$", "USD "));
        }
    }
}
Preserving Partial Content
// Rewrite the container's own text while leaving its child elements in place
var container = doc.DocumentNode.SelectSingleNode("//div[@id='container']");
if (container != null)
{
    // Only direct children are visited; text nested inside child elements is not changed
    foreach (HtmlNode child in container.ChildNodes)
    {
        if (child is HtmlTextNode textNode)
        {
            textNode.Text = textNode.Text.Replace("old", "new");
        }
    }
}
Working with Web Content
From URL
using System.Net.Http;
using HtmlAgilityPack;
// Shared across calls: creating a new HttpClient per request can exhaust sockets under load.
private static readonly HttpClient SharedClient = new HttpClient();

/// <summary>
/// Downloads https://example.com, replaces the text of its title element, and saves
/// the modified document to "modified.html".
/// </summary>
/// <returns>A task that completes once the modified document has been saved.</returns>
public async Task UpdateWebPageText()
{
    var html = await SharedClient.GetStringAsync("https://example.com");

    var doc = new HtmlDocument();
    doc.LoadHtml(html);

    // Update elements
    var title = doc.DocumentNode.SelectSingleNode("//title");
    if (title != null)
    {
        Console.WriteLine($"Original title: {title.InnerText}");

        // InnerText has no setter; encode the plain text and assign it to InnerHtml.
        title.InnerHtml = System.Net.WebUtility.HtmlEncode("Modified Title");
    }

    // Save modified HTML
    doc.Save("modified.html");
}
From File
// Load from file
var doc = new HtmlDocument();
doc.Load("input.html");

// Replace the content of the first <main> element with plain text
var content = doc.DocumentNode.SelectSingleNode("//main");
if (content != null)
{
    // InnerText has no setter; encode the plain text and assign it to InnerHtml.
    content.InnerHtml = System.Net.WebUtility.HtmlEncode("New main content");
}

// Save to file
doc.Save("output.html");
Error Handling and Best Practices
/// <summary>
/// Replaces the content of the first node matching <paramref name="selector"/> with
/// <paramref name="newText"/>, treated as plain text, and reports the outcome on the console.
/// </summary>
/// <param name="doc">Document to search and modify.</param>
/// <param name="selector">XPath expression identifying the target node.</param>
/// <param name="newText">Replacement text; null, empty, or whitespace-only values are rejected.</param>
public void SafeTextUpdate(HtmlDocument doc, string selector, string newText)
{
    try
    {
        var element = doc.DocumentNode.SelectSingleNode(selector);
        if (element == null)
        {
            Console.WriteLine($"Element not found: {selector}");
            return;
        }

        // Validate before updating
        if (string.IsNullOrWhiteSpace(newText))
        {
            Console.WriteLine("Warning: Empty text provided");
            return;
        }

        // InnerText has no setter; HTML-encode the text so it is stored as literal
        // text rather than parsed as markup, then assign it to InnerHtml.
        element.InnerHtml = System.Net.WebUtility.HtmlEncode(newText);
        Console.WriteLine($"Updated element: {selector}");
    }
    catch (Exception ex)
    {
        // Deliberately broad: this helper reports failures (for example an invalid
        // XPath expression) on the console instead of propagating them to the caller.
        Console.WriteLine($"Error updating element {selector}: {ex.Message}");
    }
}
Key Differences: InnerText vs InnerHtml
| Property | Behavior | Use Case |
|----------|----------|----------|
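| `InnerText` | Read-only; returns the element's text content without markup | Reading text from elements |
| `InnerHtml` | Gets or sets markup; assigned strings are parsed as HTML, with no encoding | Rich content updates with markup; HTML-encode plain text before assigning it to prevent XSS |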
Common Pitfalls to Avoid
- Null Reference Exceptions: Always check if elements exist before updating
- XPath Syntax Errors: Test XPath expressions with simple cases first
- Encoding Issues: HTML-encode user input (for example with `WebUtility.HtmlEncode`) before assigning it to `InnerHtml` to prevent HTML injection
- Performance: Cache frequently used selectors when processing large documents
This comprehensive approach ensures safe and effective text manipulation using Html Agility Pack in your .NET web scraping projects.