Yes, Html Agility Pack can effectively convert HTML documents to plain text in C#. This powerful .NET library is specifically designed to parse HTML documents, including malformed or "real-world" HTML, making it an excellent choice for text extraction from web pages.
Simple Text Extraction Using InnerText
The simplest approach uses the `InnerText` property, which automatically strips all HTML tags:
using HtmlAgilityPack;
using System;
class Program
{
    // Entry point: parses a small in-memory HTML page and prints its text content.
    static void Main(string[] args)
    {
        const string markup = @"
<html>
<body>
<h1>Welcome to My Homepage</h1>
<p>This is a paragraph with <a href='https://example.com'>a link</a>.</p>
<div>Some more text here in a div.</div>
</body>
</html>";

        var document = new HtmlDocument();
        document.LoadHtml(markup);

        // Simple extraction: InnerText on the root node returns the text with every tag stripped.
        // No separators are inserted between elements, so adjacent blocks are divided only by
        // whatever whitespace the markup itself contains.
        string extracted = document.DocumentNode.InnerText.Trim();
        Console.WriteLine(extracted);
    }
}
Advanced Text Extraction with Formatting
For cleaner output that drops script and style content, decodes HTML entities, and collapses all whitespace (including line breaks) into single spaces:
using HtmlAgilityPack;
using System;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
/// <summary>
/// Converts a parsed HTML document into a single line of plain text.
/// </summary>
public static class HtmlToTextConverter
{
    /// <summary>
    /// Extracts the text of <paramref name="doc"/> with script/style content removed,
    /// HTML entities decoded, and every run of whitespace collapsed to one space.
    /// </summary>
    /// <param name="doc">
    /// The document to convert. Its <c>script</c> and <c>style</c> elements are removed
    /// from the document in place, so the caller's instance is modified.
    /// </param>
    /// <returns>The trimmed plain text; line breaks are not preserved.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="doc"/> is null.</exception>
    public static string ConvertToPlainText(HtmlDocument doc)
    {
        if (doc == null)
        {
            throw new ArgumentNullException(nameof(doc));
        }

        // Snapshot the matches first: removing nodes while Descendants() is still
        // being enumerated would modify the tree under the iterator.
        var nonContentNodes = doc.DocumentNode.Descendants()
            .Where(n => n.Name == "script" || n.Name == "style")
            .ToList();
        foreach (var nonContentNode in nonContentNodes)
        {
            nonContentNode.Remove();
        }

        // Convert to text, then decode HTML entities
        string text = doc.DocumentNode.InnerText;
        text = System.Web.HttpUtility.HtmlDecode(text);

        // Collapse every whitespace run (newlines included) into a single space.
        // After this no line breaks remain, so a per-line trim would be a no-op;
        // the final Trim() handles the leading/trailing space.
        text = Regex.Replace(text, @"\s+", " ");
        return text.Trim();
    }
}
Custom Text Extraction with Formatting Control
For complete control over the conversion process, including line breaks and spacing:
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Text;
/// <summary>
/// Walks an HTML node tree and produces plain text, emitting line breaks after
/// block-level elements and a bullet in front of list items.
/// </summary>
public static class AdvancedHtmlToText
{
    /// <summary>
    /// Converts <paramref name="node"/> and all of its descendants to plain text.
    /// </summary>
    /// <param name="node">Root of the subtree to convert (for example <c>doc.DocumentNode</c>).</param>
    /// <returns>
    /// The extracted text: one trimmed, non-empty line per entry, joined with '\n'.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="node"/> is null.</exception>
    public static string ConvertWithFormatting(HtmlNode node)
    {
        if (node == null)
        {
            throw new ArgumentNullException(nameof(node));
        }

        var sb = new StringBuilder();
        ConvertNode(node, sb);
        return CleanupText(sb.ToString());
    }

    /// <summary>
    /// Appends the text representation of a single node (and, recursively, its children)
    /// to <paramref name="sb"/>. Node types other than Document, Element and Text
    /// fall through the switch and contribute nothing.
    /// </summary>
    private static void ConvertNode(HtmlNode node, StringBuilder sb)
    {
        switch (node.NodeType)
        {
            case HtmlNodeType.Document:
                ConvertChildren(node, sb);
                break;

            case HtmlNodeType.Element:
                // Invariant casing: a culture-sensitive ToLower() can mis-map tag
                // names (e.g. "LI" under a Turkish culture) and miss the cases below.
                switch (node.Name.ToLowerInvariant())
                {
                    case "script":
                    case "style":
                        // Skip these elements entirely
                        break;

                    case "p":
                    case "div":
                    case "h1":
                    case "h2":
                    case "h3":
                    case "h4":
                    case "h5":
                    case "h6":
                        // Block-level element: its content ends with a line break.
                        ConvertChildren(node, sb);
                        sb.AppendLine();
                        break;

                    case "br":
                        sb.AppendLine();
                        break;

                    case "li":
                        sb.Append("• ");
                        ConvertChildren(node, sb);
                        sb.AppendLine();
                        break;

                    default:
                        // Inline or unrecognized element: emit its content without extra breaks.
                        ConvertChildren(node, sb);
                        break;
                }
                break;

            case HtmlNodeType.Text:
                // Text nodes hold raw markup text, so entities still need decoding.
                var text = System.Web.HttpUtility.HtmlDecode(node.InnerText);
                sb.Append(text);
                break;
        }
    }

    /// <summary>
    /// Converts each direct child of <paramref name="node"/> in document order.
    /// </summary>
    private static void ConvertChildren(HtmlNode node, StringBuilder sb)
    {
        foreach (var child in node.ChildNodes)
        {
            ConvertNode(child, sb);
        }
    }

    /// <summary>
    /// Trims every line and drops the ones that end up empty, keeping the remaining
    /// line breaks. Whitespace inside a line is left untouched.
    /// </summary>
    private static string CleanupText(string text)
    {
        // Splitting on '\n' alone is enough: Trim() strips any '\r' left behind
        // when AppendLine() wrote "\r\n".
        var lines = text.Split('\n');
        var cleanedLines = new List<string>();
        foreach (var line in lines)
        {
            var trimmed = line.Trim();
            if (!string.IsNullOrEmpty(trimmed))
            {
                cleanedLines.Add(trimmed);
            }
        }
        return string.Join("\n", cleanedLines);
    }
}
// Usage example
class Program
{
    // Demo entry point: runs the formatting-aware converter over a sample page
    // (head with style/script, heading, paragraph, list, div) and prints the result.
    static void Main()
    {
        const string sampleHtml = @"
<html>
<head>
<style>body { color: blue; }</style>
<script>console.log('test');</script>
</head>
<body>
<h1>Main Title</h1>
<p>This is a paragraph with <strong>bold text</strong> and <a href='#'>a link</a>.</p>
<ul>
<li>First item</li>
<li>Second item</li>
</ul>
<div>Footer content</div>
</body>
</html>";

        var parsed = new HtmlDocument();
        parsed.LoadHtml(sampleHtml);

        // Convert starting from the document root so the whole page is covered.
        Console.WriteLine(AdvancedHtmlToText.ConvertWithFormatting(parsed.DocumentNode));
    }
}
Loading HTML from Different Sources
Html Agility Pack supports loading HTML from various sources:
// Each source gets its own variable so the three alternatives can sit in one scope
// without redeclaring the same local.

// From string (htmlString is assumed to hold the markup to parse)
var docFromString = new HtmlDocument();
docFromString.LoadHtml(htmlString);
// From file
var docFromFile = new HtmlDocument();
docFromFile.Load("path/to/file.html");
// From web URL
var web = new HtmlWeb();
var docFromWeb = web.Load("https://example.com");
// Extract text from any source - the call is the same whichever document was loaded
string plainText = docFromWeb.DocumentNode.InnerText;
Best Practices
- Remove unwanted elements: Always remove `<script>` and `<style>` tags before text extraction
- Handle HTML entities: Use `HttpUtility.HtmlDecode()` to properly decode HTML entities like `&amp;`, `&lt;`, etc.
- Preserve formatting: Consider block-level elements when maintaining readability
- Clean whitespace: Remove excessive whitespace while preserving intentional spacing
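- Error handling: Html Agility Pack tolerates malformed HTML and records problems in `HtmlDocument.ParseErrors` rather than throwing, so reserve try-catch blocks for file and network loading (`Load`, `HtmlWeb.Load`)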
Common Pitfalls
- The basic `InnerText` property may not preserve formatting and can result in text running together
- Script and style content will be included unless explicitly removed
- HTML entities need manual decoding for proper text representation
- Whitespace handling requires attention to maintain readability
Html Agility Pack provides flexible options for converting HTML to plain text, from simple one-line solutions to sophisticated formatting-aware converters depending on your specific needs.