How do I handle forms and input elements with Html Agility Pack?

Html Agility Pack provides powerful capabilities for parsing and manipulating HTML forms in C#. This guide covers extracting form data, handling different input types, and programmatic form submission.

Prerequisites

Install Html Agility Pack via NuGet Package Manager:

Install-Package HtmlAgilityPack

Loading HTML Documents

Html Agility Pack supports multiple methods for loading HTML:

using HtmlAgilityPack;

// Parser instance that each of the loading options below fills
var htmlDoc = new HtmlDocument();

// Option 1: parse markup that is already held in a string
htmlDoc.LoadHtml(htmlString);

// Option 2: read and parse a file from disk
htmlDoc.Load("path/to/file.html");

// Option 3: download the page with HttpClient, then parse the response body
using var httpClient = new HttpClient();
var pageMarkup = await httpClient.GetStringAsync("https://example.com");
htmlDoc.LoadHtml(pageMarkup);

Locating Forms and Input Elements

Finding Forms

// The lookups below are independent alternatives; each result gets its own
// variable so the snippet compiles as a whole. SelectSingleNode and SelectNodes
// return null when nothing matches.

// Find form by ID
var loginForm = htmlDoc.DocumentNode.SelectSingleNode("//form[@id='loginForm']");

// Find form by class. Comparing @class directly only matches when the attribute
// is exactly "contact-form"; the token test below also matches
// class="contact-form wide" and similar multi-class values.
var contactForm = htmlDoc.DocumentNode.SelectSingleNode(
    "//form[contains(concat(' ', normalize-space(@class), ' '), ' contact-form ')]");

// Find first form on page (this is the form the following examples work with)
var form = htmlDoc.DocumentNode.SelectSingleNode("//form");

// Find all forms
var forms = htmlDoc.DocumentNode.SelectNodes("//form");

Extracting Form Attributes

// Print the form's submission attributes; the second argument of each
// GetAttributeValue call is the value used when the attribute is absent.
if (!(form is null))
{
    var formAction = form.GetAttributeValue("action", "");
    var httpMethod = form.GetAttributeValue("method", "GET").ToUpperInvariant();
    var encodingType = form.GetAttributeValue("enctype", "application/x-www-form-urlencoded");

    Console.WriteLine($"Form Action: {formAction}");
    Console.WriteLine($"Method: {httpMethod}");
    Console.WriteLine($"Encoding: {encodingType}");
}

Handling Different Input Types

Text Inputs and Basic Fields

// Collect every <input> nested anywhere inside the form
var inputs = form.SelectNodes(".//input");

// SelectNodes returns null when nothing matches, so only loop over a real collection
if (inputs != null)
{
    foreach (var field in inputs)
    {
        var fieldType = field.GetAttributeValue("type", "text");
        var fieldName = field.GetAttributeValue("name", "");
        var fieldValue = field.GetAttributeValue("value", "");

        Console.WriteLine($"Type: {fieldType}, Name: {fieldName}, Value: {fieldValue}");
    }
}

Checkboxes and Radio Buttons

// Checkboxes: report each one together with whether it carries a "checked" attribute
var checkboxes = form.SelectNodes(".//input[@type='checkbox']");
if (checkboxes != null)
{
    foreach (var box in checkboxes)
    {
        var boxName = box.GetAttributeValue("name", "");
        var boxValue = box.GetAttributeValue("value", "");
        var boxChecked = !(box.Attributes["checked"] is null);

        Console.WriteLine($"Checkbox {boxName}: {boxValue} (Checked: {boxChecked})");
    }
}

// Radio buttons: same idea, the presence of "checked" marks the selected one
var radioButtons = form.SelectNodes(".//input[@type='radio']");
if (radioButtons != null)
{
    foreach (var button in radioButtons)
    {
        var groupName = button.GetAttributeValue("name", "");
        var buttonValue = button.GetAttributeValue("value", "");
        var buttonSelected = !(button.Attributes["checked"] is null);

        Console.WriteLine($"Radio {groupName}: {buttonValue} (Selected: {buttonSelected})");
    }
}

Select Dropdowns

// Handle select elements
var selects = form.SelectNodes(".//select");
foreach (var select in selects ?? new HtmlNodeCollection(null))
{
    string name = select.GetAttributeValue("name", "");

    // Get all options
    var options = select.SelectNodes(".//option");
    foreach (var option in options ?? new HtmlNodeCollection(null))
    {
        // InnerText is the raw markup text; decode entities so "&amp;" is shown as "&"
        string text = HtmlEntity.DeEntitize(option.InnerText).Trim();

        // An <option> without a value attribute submits its text, so use the
        // text as the fallback instead of an empty string
        string value = option.GetAttributeValue("value", text);
        bool isSelected = option.Attributes["selected"] != null;

        Console.WriteLine($"Select {name} - Option: {text} ({value}) Selected: {isSelected}");
    }
}

Textarea Elements

// Handle textarea elements
var textareas = form.SelectNodes(".//textarea");
foreach (var textarea in textareas ?? new HtmlNodeCollection(null))
{
    string name = textarea.GetAttributeValue("name", "");

    // InnerText returns the text exactly as written in the markup; decode HTML
    // entities so content such as "a &lt; b" is reported as "a < b"
    string content = HtmlEntity.DeEntitize(textarea.InnerText);

    Console.WriteLine($"Textarea {name}: {content}");
}

Manipulating Form Data

Setting Input Values

// Set text input value (the null-conditional call is a no-op when the field is missing)
var usernameInput = form.SelectSingleNode(".//input[@name='username']");
usernameInput?.SetAttributeValue("value", "john_doe");

// Set password
var passwordInput = form.SelectSingleNode(".//input[@name='password']");
passwordInput?.SetAttributeValue("value", "secure_password");

// Check a checkbox
var agreeCheckbox = form.SelectSingleNode(".//input[@name='agree'][@type='checkbox']");
agreeCheckbox?.SetAttributeValue("checked", "checked");

// Select radio button
var genderRadio = form.SelectSingleNode(".//input[@name='gender'][@value='male']");
if (genderRadio != null)
{
    // Only one radio per group may be checked, so clear whatever was checked
    // before marking the new choice
    var checkedRadios = form.SelectNodes(".//input[@name='gender'][@type='radio'][@checked]");
    foreach (var previous in checkedRadios ?? new HtmlNodeCollection(null))
    {
        previous.Attributes.Remove("checked");
    }

    genderRadio.SetAttributeValue("checked", "checked");
}

Setting Select Options

// Switch the "country" dropdown to the option whose value is US
var countrySelect = form.SelectSingleNode(".//select[@name='country']");
if (!(countrySelect is null))
{
    // Step 1: drop the "selected" marker from every option that currently has it
    var previouslySelected = countrySelect.SelectNodes(".//option[@selected]");
    if (previouslySelected != null)
    {
        foreach (var selectedOption in previouslySelected)
        {
            selectedOption.Attributes.Remove("selected");
        }
    }

    // Step 2: mark the wanted option, if the dropdown offers it
    var usOption = countrySelect.SelectSingleNode(".//option[@value='US']");
    if (usOption != null)
    {
        usOption.SetAttributeValue("selected", "selected");
    }
}

Complete Form Processing Example

/// <summary>
/// Downloads a page and collects the current values of the fields in its first form.
/// </summary>
public class FormProcessor
{
    // Shared across calls: creating and disposing an HttpClient per request
    // can exhaust sockets when the method is called repeatedly.
    private static readonly HttpClient SharedClient = new HttpClient();

    /// <summary>
    /// Fetches <paramref name="url"/>, locates the first &lt;form&gt; and returns
    /// its named fields with their current values.
    /// </summary>
    /// <param name="url">Address of the page that contains the form.</param>
    /// <returns>
    /// Field name to value. Fields without a name and fields whose value is empty
    /// (including unchecked checkboxes and radios) are left out. When several
    /// fields share a name, the last non-empty one wins. The dictionary is empty
    /// when the page has no form.
    /// </returns>
    public async Task<Dictionary<string, string>> ExtractFormDataAsync(string url)
    {
        var formData = new Dictionary<string, string>();

        // Load page
        var html = await SharedClient.GetStringAsync(url);
        var doc = new HtmlDocument();
        doc.LoadHtml(html);

        // Find form
        var form = doc.DocumentNode.SelectSingleNode("//form");
        if (form == null)
        {
            return formData;
        }

        // Extract all form fields (SelectNodes returns null when there are none)
        var fields = form.SelectNodes(".//input | .//select | .//textarea");
        if (fields == null)
        {
            return formData;
        }

        foreach (var element in fields)
        {
            string name = ReadAttribute(element, "name", "");
            if (string.IsNullOrEmpty(name))
            {
                continue;
            }

            string value = ReadFieldValue(element);
            if (!string.IsNullOrEmpty(value))
            {
                formData[name] = value;
            }
        }

        return formData;
    }

    // Returns the value a single input/select/textarea currently holds, or "" when it holds none.
    private static string ReadFieldValue(HtmlNode element)
    {
        // Invariant casing: culture-sensitive ToLower() maps "I" to a dotless i
        // under Turkish cultures, which would make "INPUT" miss the "input" case.
        switch (element.Name.ToLowerInvariant())
        {
            case "input":
                string type = element.GetAttributeValue("type", "text").ToLowerInvariant();
                if (type == "checkbox" || type == "radio")
                {
                    // Only a checked box/radio contributes a value; "on" is used
                    // when the element has no value attribute.
                    return element.Attributes["checked"] != null
                        ? ReadAttribute(element, "value", "on")
                        : "";
                }

                return ReadAttribute(element, "value", "");

            case "select":
                return ReadSelectValue(element);

            case "textarea":
                // InnerText is the raw markup text, so decode entities such as &amp;
                return HtmlEntity.DeEntitize(element.InnerText);

            default:
                return "";
        }
    }

    // Returns the value of the chosen option of a select, or "" when there is none.
    private static string ReadSelectValue(HtmlNode select)
    {
        var chosen = select.SelectSingleNode(".//option[@selected]");

        // A single-choice dropdown with no explicit selection shows (and submits)
        // its first option; a multiple select submits nothing in that case.
        if (chosen == null && select.Attributes["multiple"] == null)
        {
            chosen = select.SelectSingleNode(".//option");
        }

        if (chosen == null)
        {
            return "";
        }

        // An option without a value attribute submits its text.
        string optionText = HtmlEntity.DeEntitize(chosen.InnerText).Trim();
        return ReadAttribute(chosen, "value", optionText);
    }

    // Reads an attribute and decodes HTML entities in it (value="a &amp; b" becomes "a & b").
    private static string ReadAttribute(HtmlNode node, string attributeName, string fallback)
    {
        var attribute = node.Attributes[attributeName];
        return attribute == null ? fallback : HtmlEntity.DeEntitize(attribute.Value);
    }
}

Form Submission

Html Agility Pack handles parsing only. For form submission, use HttpClient:

/// <summary>
/// Sends the given form fields to <paramref name="actionUrl"/> and returns the response body as text.
/// </summary>
/// <param name="actionUrl">Absolute URL taken from the form's action attribute.</param>
/// <param name="formData">Field names and values to submit.</param>
/// <param name="method">
/// "GET" (any casing) sends the fields in the query string; every other value,
/// including null, sends them as a URL-encoded POST body.
/// </param>
/// <returns>The response body, whatever the HTTP status code was.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="actionUrl"/> or <paramref name="formData"/> is null.
/// </exception>
public async Task<string> SubmitFormAsync(string actionUrl, Dictionary<string, string> formData, string method = "POST")
{
    if (actionUrl == null)
    {
        throw new ArgumentNullException(nameof(actionUrl));
    }

    if (formData == null)
    {
        throw new ArgumentNullException(nameof(formData));
    }

    // Created per call to keep the example self-contained; reuse one client
    // (or IHttpClientFactory) when submitting many forms.
    using var client = new HttpClient();

    // Ordinal, case-insensitive comparison avoids culture-dependent upper-casing
    // and tolerates a null method.
    if (string.Equals(method, "GET", StringComparison.OrdinalIgnoreCase))
    {
        // Null values are sent as empty strings, matching what FormUrlEncodedContent does for POST.
        var queryString = string.Join("&",
            formData.Select(kvp => $"{Uri.EscapeDataString(kvp.Key)}={Uri.EscapeDataString(kvp.Value ?? "")}"));

        // Appending "?query" by hand breaks when the action already has a query
        // ("a?x=1?y=2") or a fragment ("a#top?y=2"). UriBuilder replaces the
        // existing query, as browsers do for GET forms, and keeps the fragment last.
        var getUrl = new UriBuilder(actionUrl) { Query = queryString }.Uri;

        using var getResponse = await client.GetAsync(getUrl);
        return await getResponse.Content.ReadAsStringAsync();
    }

    using var content = new FormUrlEncodedContent(formData);
    using var postResponse = await client.PostAsync(actionUrl, content);
    return await postResponse.Content.ReadAsStringAsync();
}

Handling Special Cases

CSRF Tokens

// Copy the value of the "_token" input, when the form has one, into the data to submit
var tokenInput = form.SelectSingleNode(".//input[@name='_token']");
var csrfToken = tokenInput?.GetAttributeValue("value", "");

// Null (no such input) and "" (input without a value) are both skipped
if (string.IsNullOrEmpty(csrfToken) == false)
{
    formData["_token"] = csrfToken;
}

Hidden Fields

// Carry every named hidden input over into the data to submit
var hiddenInputs = form.SelectNodes(".//input[@type='hidden']");
if (hiddenInputs != null)
{
    foreach (var hiddenField in hiddenInputs)
    {
        var fieldName = hiddenField.GetAttributeValue("name", "");
        if (string.IsNullOrEmpty(fieldName))
        {
            // A field without a name has no key to store it under
            continue;
        }

        // The value is stored even when it is empty
        formData[fieldName] = hiddenField.GetAttributeValue("value", "");
    }
}

Best Practices

  1. Always check for null: SelectSingleNode and SelectNodes return null when nothing matches, so use null checks or null-conditional operators when working with form elements
  2. Handle encoding properly: Use Uri.EscapeDataString() for URL encoding
  3. Respect robots.txt: Check website policies before scraping
  4. Handle errors gracefully: Implement proper exception handling
  5. Consider rate limiting: Add delays between requests to avoid being blocked
  6. Maintain sessions: Use CookieContainer with HttpClient for session-based forms

Security Considerations

  • Always validate and sanitize extracted data
  • Be aware of CSRF protection mechanisms
  • Respect website terms of service
  • Consider using headless browsers for JavaScript-heavy forms
  • Implement proper error handling for failed requests

Related Questions

Get Started Now

WebScraping.AI provides rotating proxies, Chromium rendering, and a built-in HTML parser for web scraping
Icon