Table of contents

What is the best way to parse HTML in Go?

The best way to parse HTML in Go is using the golang.org/x/net/html package, which is the official HTML parser from the Go team. This package provides a robust and efficient tokenizer and tree constructor for HTML5 documents.

Installation

First, install the package using Go modules:

go get golang.org/x/net/html

Basic HTML Parsing

Here's a complete example that demonstrates basic HTML parsing:

package main

import (
    "fmt"
    "golang.org/x/net/html"
    "log"
    "strings"
)

func main() {
    // A small fixed document to parse; in real code this would come
    // from a file or an HTTP response body.
    page := `
<!DOCTYPE html>
<html>
<head>
    <title>Sample Page</title>
</head>
<body>
    <h1 id="main-title">Welcome to Go HTML Parsing</h1>
    <div class="content">
        <p>This is a paragraph with <a href="https://golang.org">a link</a>.</p>
        <ul>
            <li>Item 1</li>
            <li>Item 2</li>
        </ul>
    </div>
</body>
</html>`

    // html.Parse consumes any io.Reader and returns the document root.
    root, err := html.Parse(strings.NewReader(page))
    if err != nil {
        log.Fatal(err)
    }

    // Walk the tree, printing every element node.
    traverse(root, 0)
}

// traverse prints an opening-tag line (name plus attributes) for every
// element node in the subtree rooted at n, indented two spaces per level
// of recursion depth.
func traverse(n *html.Node, depth int) {
    if n.Type == html.ElementNode {
        var tag strings.Builder
        tag.WriteString(strings.Repeat("  ", depth))
        tag.WriteString("<")
        tag.WriteString(n.Data)

        // Append each attribute as key="value".
        for _, a := range n.Attr {
            tag.WriteString(fmt.Sprintf(" %s=\"%s\"", a.Key, a.Val))
        }
        fmt.Println(tag.String() + ">")
    }

    // Descend into every child, one level deeper.
    for child := n.FirstChild; child != nil; child = child.NextSibling {
        traverse(child, depth+1)
    }
}

Extracting Specific Data

Here's a more practical example that extracts specific data from HTML:

package main

import (
    "fmt"
    "golang.org/x/net/html"
    "log"
    "strings"
)

func main() {
    // Sample catalog markup to extract structured data from.
    src := `
<html>
<body>
    <h1>Product Catalog</h1>
    <div class="product" data-id="1">
        <h2>Laptop</h2>
        <span class="price">$999</span>
    </div>
    <div class="product" data-id="2">
        <h2>Mouse</h2>
        <span class="price">$29</span>
    </div>
</body>
</html>`

    root, err := html.Parse(strings.NewReader(src))
    if err != nil {
        log.Fatal(err)
    }

    // Pull out every product and print its fields.
    for _, p := range extractProducts(root) {
        fmt.Printf("ID: %s, Name: %s, Price: %s\n", p.ID, p.Name, p.Price)
    }
}

// Product holds the fields scraped from one <div class="product"> element.
type Product struct {
    ID    string // value of the div's data-id attribute
    Name  string // text content of the nested <h2>
    Price string // text content of the nested <span class="price">
}

// extractProducts walks the tree rooted at n and returns one Product for
// every <div class="product"> it encounters, filling Name from a nested
// <h2> and Price from a nested <span class="price">.
func extractProducts(n *html.Node) []Product {
    var products []Product

    var walk func(*html.Node)
    walk = func(node *html.Node) {
        if node.Type == html.ElementNode && node.Data == "div" && hasClass(node, "product") {
            p := Product{ID: getAttr(node, "data-id")}

            // fill scans the product subtree for the name and price elements.
            var fill func(*html.Node)
            fill = func(cur *html.Node) {
                if cur.Type == html.ElementNode {
                    if cur.Data == "h2" {
                        p.Name = getTextContent(cur)
                    } else if cur.Data == "span" && hasClass(cur, "price") {
                        p.Price = getTextContent(cur)
                    }
                }
                for c := cur.FirstChild; c != nil; c = c.NextSibling {
                    fill(c)
                }
            }
            fill(node)

            products = append(products, p)
        }

        // Keep searching the rest of the document for more products.
        for c := node.FirstChild; c != nil; c = c.NextSibling {
            walk(c)
        }
    }

    walk(n)
    return products
}

// Helper functions
func hasClass(n *html.Node, className string) bool {
    for _, attr := range n.Attr {
        if attr.Key == "class" {
            classes := strings.Fields(attr.Val)
            for _, class := range classes {
                if class == className {
                    return true
                }
            }
        }
    }
    return false
}

// getAttr returns the value of the attribute named key on n,
// or the empty string when no such attribute exists.
func getAttr(n *html.Node, key string) string {
    for i := range n.Attr {
        if n.Attr[i].Key == key {
            return n.Attr[i].Val
        }
    }
    return ""
}

// getTextContent returns the concatenated text of every text node in the
// subtree rooted at n. Each text node is trimmed of surrounding whitespace
// and non-empty pieces are joined with a single space, so text split across
// inline elements (e.g. "with <a>a link</a>") keeps its word boundaries
// instead of being fused together.
func getTextContent(n *html.Node) string {
    var parts []string
    var walk func(*html.Node)
    walk = func(node *html.Node) {
        if node.Type == html.TextNode {
            // Skip whitespace-only nodes (indentation between tags).
            if s := strings.TrimSpace(node.Data); s != "" {
                parts = append(parts, s)
            }
        }
        for c := node.FirstChild; c != nil; c = c.NextSibling {
            walk(c)
        }
    }
    walk(n)
    return strings.Join(parts, " ")
}

Parsing HTML from Web Requests

When parsing HTML from web pages, combine the parser with HTTP requests:

package main

import (
    "fmt"
    "golang.org/x/net/html"
    "log"
    "net/http"
    "time"
)

func main() {
    // Always give the client a timeout so a stalled server cannot
    // hang the program indefinitely.
    client := &http.Client{
        Timeout: 10 * time.Second,
    }

    // Fetch the page.
    resp, err := client.Get("https://example.com")
    if err != nil {
        log.Fatal(err)
    }
    defer resp.Body.Close()

    // resp.Status already includes the numeric code (e.g. "404 Not Found"),
    // so log it alone rather than duplicating the number with %d.
    if resp.StatusCode != http.StatusOK {
        log.Fatalf("HTTP error: %s", resp.Status)
    }

    // html.Parse streams directly from the response body; no need to
    // buffer the whole document first.
    doc, err := html.Parse(resp.Body)
    if err != nil {
        log.Fatal(err)
    }

    // Extract and print the page title.
    fmt.Printf("Page title: %s\n", findTitle(doc))
}

// findTitle returns the text inside the first <title> element found in a
// depth-first walk of the tree rooted at n, or "" if there is none.
//
// The text is gathered inline here (a <title> holds its content as direct
// child text nodes) so this example is self-contained — the original
// version called a getTextContent helper that is not defined in this
// program and would not compile.
func findTitle(n *html.Node) string {
    var title string
    var find func(*html.Node)
    find = func(node *html.Node) {
        if title != "" {
            return // already found; skip the rest of the tree
        }
        if node.Type == html.ElementNode && node.Data == "title" {
            for c := node.FirstChild; c != nil; c = c.NextSibling {
                if c.Type == html.TextNode {
                    title += c.Data
                }
            }
            return
        }
        for c := node.FirstChild; c != nil; c = c.NextSibling {
            find(c)
        }
    }
    find(n)
    return title
}

Understanding Node Types

Each html.Node has a Type field that distinguishes several kinds of nodes:

// Inspect n.Type before using n.Data — its meaning depends on the node kind.
switch n.Type {
case html.ErrorNode:
    // Parse error occurred
case html.TextNode:
    // Text content; n.Data is the raw text
    fmt.Println("Text:", n.Data)
case html.DocumentNode:
    // Root document node (what html.Parse returns)
case html.ElementNode:
    // HTML element (tag); n.Data is the tag name
    fmt.Println("Element:", n.Data)
case html.CommentNode:
    // HTML comment; n.Data is the comment text
    fmt.Println("Comment:", n.Data)
case html.DoctypeNode:
    // DOCTYPE declaration
}

Alternative Libraries

While golang.org/x/net/html is the standard choice, you might also consider:

  • goquery: jQuery-like DOM manipulation (built on top of net/html)
  • colly: Full-featured web scraping framework

Install goquery for CSS selector support:

go get github.com/PuerkitoBio/goquery
import "github.com/PuerkitoBio/goquery"

// goquery wraps net/html and exposes jQuery-style selectors.
doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
if err != nil {
    log.Fatal(err)
}

// Use CSS selectors
// Find matches ".product h2" anywhere in the document; Each visits
// every match in document order.
doc.Find(".product h2").Each(func(i int, s *goquery.Selection) {
    fmt.Println("Product:", s.Text())
})

Best Practices

  1. Always handle errors when parsing HTML
  2. Use timeouts for HTTP requests
  3. Validate node types before accessing node data
  4. Consider using goquery for complex CSS selector needs
  5. Implement proper error recovery for malformed HTML
  6. Be respectful when scraping websites (add delays, respect robots.txt)

Try WebScraping.AI for Your Web Scraping Needs

Looking for a powerful web scraping solution? WebScraping.AI provides an LLM-powered API that combines Chromium JavaScript rendering with rotating proxies for reliable data extraction.

Key Features:

  • AI-powered extraction: Ask questions about web pages or extract structured data fields
  • JavaScript rendering: Full Chromium browser support for dynamic content
  • Rotating proxies: Datacenter and residential proxies from multiple countries
  • Easy integration: Simple REST API with SDKs for Python, Ruby, PHP, and more
  • Reliable & scalable: Built for developers who need consistent results

Getting Started:

Get page content with AI analysis:

curl "https://api.webscraping.ai/ai/question?url=https://example.com&question=What is the main topic?&api_key=YOUR_API_KEY"

Extract structured data:

curl "https://api.webscraping.ai/ai/fields?url=https://example.com&fields[title]=Page title&fields[price]=Product price&api_key=YOUR_API_KEY"

Try in request builder

Related Questions

Get Started Now

WebScraping.AI provides rotating proxies, Chromium rendering, and a built-in HTML parser for web scraping.
Icon