Html Agility Pack provides powerful capabilities for parsing and manipulating HTML forms in C#. This guide covers extracting form data, handling different input types, and programmatic form submission.
Prerequisites
Install Html Agility Pack via NuGet Package Manager:
Install-Package HtmlAgilityPack
Loading HTML Documents
Html Agility Pack supports multiple methods for loading HTML:
using HtmlAgilityPack;
// The three options below are alternative ways of filling the same document.
var htmlDoc = new HtmlDocument();

// Option 1: parse markup that is already held in a string
htmlDoc.LoadHtml(htmlString);

// Option 2: read and parse a file from disk
htmlDoc.Load("path/to/file.html");

// Option 3: download the markup with HttpClient, then parse it
using var httpClient = new HttpClient();
var pageHtml = await httpClient.GetStringAsync("https://example.com");
htmlDoc.LoadHtml(pageHtml);
Locating Forms and Input Elements
Finding Forms
// Each lookup gets its own variable so the snippet compiles as a whole
// (declaring "var form" several times in one scope is a compile error).

// Find form by ID
var loginForm = htmlDoc.DocumentNode.SelectSingleNode("//form[@id='loginForm']");

// Find form by class. @class='contact-form' only matches when that is the entire
// attribute value, so test for the class as a whitespace-delimited token instead;
// this also matches class="contact-form wide".
var contactForm = htmlDoc.DocumentNode.SelectSingleNode(
    "//form[contains(concat(' ', normalize-space(@class), ' '), ' contact-form ')]");

// Find first form on page
var form = htmlDoc.DocumentNode.SelectSingleNode("//form");

// Find all forms
var forms = htmlDoc.DocumentNode.SelectNodes("//form");
Extracting Form Attributes
if (form != null)
{
    // Read the three submission-related attributes, passing the fallback to use
    // when the form does not declare them.
    var formAction = form.GetAttributeValue("action", "");
    var httpMethod = form.GetAttributeValue("method", "GET");
    var encoding = form.GetAttributeValue("enctype", "application/x-www-form-urlencoded");

    // The method is reported in upper case regardless of how the page wrote it.
    httpMethod = httpMethod.ToUpperInvariant();

    Console.WriteLine("Form Action: " + formAction);
    Console.WriteLine("Method: " + httpMethod);
    Console.WriteLine("Encoding: " + encoding);
}
Handling Different Input Types
Text Inputs and Basic Fields
// Enumerate every <input> element below the form
var inputs = form.SelectNodes(".//input");
if (inputs != null)
{
    foreach (var field in inputs)
    {
        // An input without a type attribute is reported as "text"
        var fieldType = field.GetAttributeValue("type", "text");
        var fieldName = field.GetAttributeValue("name", "");
        var fieldValue = field.GetAttributeValue("value", "");

        Console.WriteLine($"Type: {fieldType}, Name: {fieldName}, Value: {fieldValue}");
    }
}
Checkboxes and Radio Buttons
// Handle checkboxes
var checkboxes = form.SelectNodes(".//input[@type='checkbox']");
foreach (var checkbox in checkboxes ?? new HtmlNodeCollection(null))
{
    string name = checkbox.GetAttributeValue("name", "");
    // A checkbox without a value attribute is submitted by browsers as "on",
    // so report that instead of an empty string.
    string value = checkbox.GetAttributeValue("value", "on");
    // "checked" is a boolean attribute: its presence alone marks the box as ticked.
    bool isChecked = checkbox.Attributes["checked"] != null;
    Console.WriteLine($"Checkbox {name}: {value} (Checked: {isChecked})");
}

// Handle radio buttons
var radioButtons = form.SelectNodes(".//input[@type='radio']");
foreach (var radio in radioButtons ?? new HtmlNodeCollection(null))
{
    string name = radio.GetAttributeValue("name", "");
    // Same default as checkboxes: a radio button without a value submits "on".
    string value = radio.GetAttributeValue("value", "on");
    bool isSelected = radio.Attributes["checked"] != null;
    Console.WriteLine($"Radio {name}: {value} (Selected: {isSelected})");
}
Select Dropdowns
// Handle select elements
var selects = form.SelectNodes(".//select");
foreach (var select in selects ?? new HtmlNodeCollection(null))
{
    string name = select.GetAttributeValue("name", "");

    // Get all options
    var options = select.SelectNodes(".//option");
    foreach (var option in options ?? new HtmlNodeCollection(null))
    {
        // InnerText is the raw markup text, so decode entities such as &amp; for display.
        string text = HtmlEntity.DeEntitize(option.InnerText).Trim();
        // An option without a value attribute submits its text, so use the text as the fallback.
        string value = option.GetAttributeValue("value", text);
        // "selected" is a boolean attribute: presence alone marks the option as chosen.
        bool isSelected = option.Attributes["selected"] != null;
        Console.WriteLine($"Select {name} - Option: {text} ({value}) Selected: {isSelected}");
    }
}
Textarea Elements
// Handle textarea elements
var textareas = form.SelectNodes(".//textarea");
foreach (var textarea in textareas ?? new HtmlNodeCollection(null))
{
    string name = textarea.GetAttributeValue("name", "");
    // InnerText returns the text exactly as written in the markup; decode entity
    // references (&amp;, &lt;, ...) to get the text a browser would show and submit.
    string content = HtmlEntity.DeEntitize(textarea.InnerText);
    Console.WriteLine($"Textarea {name}: {content}");
}
Manipulating Form Data
Setting Input Values
// Set text input value
var usernameInput = form.SelectSingleNode(".//input[@name='username']");
usernameInput?.SetAttributeValue("value", "john_doe");

// Set password
var passwordInput = form.SelectSingleNode(".//input[@name='password']");
passwordInput?.SetAttributeValue("value", "secure_password");

// Check a checkbox
var agreeCheckbox = form.SelectSingleNode(".//input[@name='agree'][@type='checkbox']");
agreeCheckbox?.SetAttributeValue("checked", "checked");

// Select radio button. Only one radio in a group may be checked, so clear any
// existing choice in the "gender" group before marking the new one.
var checkedGenderRadios = form.SelectNodes(".//input[@name='gender'][@type='radio'][@checked]");
foreach (var checkedRadio in checkedGenderRadios ?? new HtmlNodeCollection(null))
{
    checkedRadio.Attributes.Remove("checked");
}
var genderRadio = form.SelectSingleNode(".//input[@name='gender'][@value='male']");
genderRadio?.SetAttributeValue("checked", "checked");
Setting Select Options
// Choose one option in the country dropdown
var countrySelect = form.SelectSingleNode(".//select[@name='country']");
if (countrySelect != null)
{
    // Drop the selected attribute from every option that currently carries it
    var selectedOptions = countrySelect.SelectNodes(".//option[@selected]");
    if (selectedOptions != null)
    {
        foreach (var selected in selectedOptions)
        {
            selected.Attributes.Remove("selected");
        }
    }

    // Then flag the option whose value is US, when the dropdown has one
    var usOption = countrySelect.SelectSingleNode(".//option[@value='US']");
    if (usOption != null)
    {
        usOption.SetAttributeValue("selected", "selected");
    }
}
Complete Form Processing Example
/// <summary>
/// Downloads a page and collects the name/value pairs held by its first form.
/// </summary>
public class FormProcessor
{
    // One client for every call: creating an HttpClient per request can exhaust sockets under load.
    private static readonly HttpClient SharedClient = new HttpClient();

    /// <summary>
    /// Fetches <paramref name="url"/>, parses it, and extracts the field values of the first
    /// <c>form</c> element on the page.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>
    /// Field names mapped to their current values. Fields without a name, unchecked
    /// checkboxes/radio buttons, and fields whose value is empty are left out. The dictionary
    /// is empty when the page has no form. When a name occurs more than once, the last
    /// non-empty value wins.
    /// </returns>
    public async Task<Dictionary<string, string>> ExtractFormDataAsync(string url)
    {
        var formData = new Dictionary<string, string>();

        // Load page
        var html = await SharedClient.GetStringAsync(url);
        var doc = new HtmlDocument();
        doc.LoadHtml(html);

        // Find form
        var form = doc.DocumentNode.SelectSingleNode("//form");
        if (form == null)
        {
            return formData;
        }

        // Extract all form fields; SelectNodes yields null when nothing matches
        var fields = form.SelectNodes(".//input | .//select | .//textarea");
        if (fields == null)
        {
            return formData;
        }

        foreach (var element in fields)
        {
            string name = element.GetAttributeValue("name", "");
            if (string.IsNullOrEmpty(name))
            {
                continue;
            }

            string value = ReadFieldValue(element);
            if (!string.IsNullOrEmpty(value))
            {
                formData[name] = value;
            }
        }

        return formData;
    }

    // Returns the value a single input/select/textarea currently holds, or "" when it holds none.
    private static string ReadFieldValue(HtmlNode element)
    {
        if (IsToken(element.Name, "input"))
        {
            string type = element.GetAttributeValue("type", "text");
            if (IsToken(type, "checkbox") || IsToken(type, "radio"))
            {
                // Only a checked box/radio contributes a value; browsers send "on"
                // when the element has no value attribute.
                return element.Attributes["checked"] != null
                    ? element.GetAttributeValue("value", "on")
                    : "";
            }

            return element.GetAttributeValue("value", "");
        }

        if (IsToken(element.Name, "select"))
        {
            var chosen = element.SelectSingleNode(".//option[@selected]");

            // A single-choice dropdown with no explicitly selected option shows (and
            // submits) its first option; a multiple-choice list submits nothing.
            if (chosen == null && element.Attributes["multiple"] == null)
            {
                chosen = element.SelectSingleNode(".//option");
            }

            if (chosen == null)
            {
                return "";
            }

            // An option without a value attribute submits its text instead.
            string optionText = HtmlEntity.DeEntitize(chosen.InnerText).Trim();
            return chosen.GetAttributeValue("value", optionText);
        }

        if (IsToken(element.Name, "textarea"))
        {
            // InnerText is the raw markup text; decode entity references such as &amp;.
            return HtmlEntity.DeEntitize(element.InnerText);
        }

        return "";
    }

    // Case-insensitive, culture-independent comparison for tag names and attribute keywords.
    // ToLower() would break under cultures such as tr-TR, where "RADIO" lowercases to "radıo".
    private static bool IsToken(string actual, string expected)
    {
        return string.Equals(actual, expected, StringComparison.OrdinalIgnoreCase);
    }
}
Form Submission
Html Agility Pack handles parsing only. For form submission, use HttpClient:
// Shared by every submission: creating an HttpClient per request can exhaust sockets under load.
private static readonly HttpClient FormSubmissionClient = new HttpClient();

/// <summary>
/// Submits form data to <paramref name="actionUrl"/> and returns the response body as text.
/// </summary>
/// <param name="actionUrl">Absolute URL taken from the form's action attribute.</param>
/// <param name="formData">Field names and values to send; a null value is sent as an empty string.</param>
/// <param name="method">
/// "GET" (any casing) sends the data in the query string; any other value sends a
/// URL-encoded POST body.
/// </param>
/// <returns>The response body. The HTTP status code is not checked.</returns>
public async Task<string> SubmitFormAsync(string actionUrl, Dictionary<string, string> formData, string method = "POST")
{
    // Ordinal comparison: HTTP method names are not linguistic text.
    if (string.Equals(method, "GET", StringComparison.OrdinalIgnoreCase))
    {
        var queryString = string.Join("&",
            formData.Select(kvp => $"{Uri.EscapeDataString(kvp.Key)}={Uri.EscapeDataString(kvp.Value ?? "")}"));

        // Set the query through UriBuilder instead of appending "?...": the action URL may
        // already contain a query or a fragment. As in browsers, a GET submission replaces
        // any query the action URL already had.
        var getUrl = new UriBuilder(actionUrl) { Query = queryString }.Uri;

        using var getResponse = await FormSubmissionClient.GetAsync(getUrl);
        return await getResponse.Content.ReadAsStringAsync();
    }

    using var content = new FormUrlEncodedContent(formData);
    using var postResponse = await FormSubmissionClient.PostAsync(actionUrl, content);
    return await postResponse.Content.ReadAsStringAsync();
}
Handling Special Cases
CSRF Tokens
// Look up the input named _token and copy its value into the submission data
var tokenField = form.SelectSingleNode(".//input[@name='_token']");
var csrfToken = tokenField == null ? null : tokenField.GetAttributeValue("value", "");

// Nothing is added when the input is missing or its value is empty
if (csrfToken != null && csrfToken.Length > 0)
{
    formData["_token"] = csrfToken;
}
Hidden Fields
// Copy every named hidden input into the submission data
var hiddenInputs = form.SelectNodes(".//input[@type='hidden']");
if (hiddenInputs != null)
{
    foreach (var hiddenField in hiddenInputs)
    {
        var fieldName = hiddenField.GetAttributeValue("name", "");

        // Inputs without a name are skipped
        if (string.IsNullOrEmpty(fieldName))
        {
            continue;
        }

        // Empty values are kept: the field is stored even when value is ""
        formData[fieldName] = hiddenField.GetAttributeValue("value", "");
    }
}
Best Practices
- Always check for null: `SelectSingleNode` and `SelectNodes` return null when nothing matches, so use null-conditional operators or explicit null checks when working with form elements
- Handle encoding properly: Use `Uri.EscapeDataString()` for URL encoding
- Respect robots.txt: Check website policies before scraping
- Handle errors gracefully: Implement proper exception handling
- Consider rate limiting: Add delays between requests to avoid being blocked
- Maintain sessions: Use `CookieContainer` with HttpClient for session-based forms
Security Considerations
- Always validate and sanitize extracted data
- Be aware of CSRF protection mechanisms
- Respect website terms of service
- Consider using headless browsers for JavaScript-heavy forms
- Implement proper error handling for failed requests