The best way to parse HTML in Go is to use the golang.org/x/net/html package, the HTML parser maintained by the Go team. It provides a robust and efficient tokenizer and tree constructor for HTML5 documents.
Installation
First, install the package using Go modules:
```
go get golang.org/x/net/html
```
Basic HTML Parsing
Here's a complete example that demonstrates basic HTML parsing:
```go
package main

import (
	"fmt"
	"log"
	"strings"

	"golang.org/x/net/html"
)

func main() {
	htmlContent := `
<!DOCTYPE html>
<html>
<head>
	<title>Sample Page</title>
</head>
<body>
	<h1 id="main-title">Welcome to Go HTML Parsing</h1>
	<div class="content">
		<p>This is a paragraph with <a href="https://golang.org">a link</a>.</p>
		<ul>
			<li>Item 1</li>
			<li>Item 2</li>
		</ul>
	</div>
</body>
</html>`

	// Parse the HTML into a node tree.
	doc, err := html.Parse(strings.NewReader(htmlContent))
	if err != nil {
		log.Fatal(err)
	}

	// Traverse and print all element nodes.
	traverse(doc, 0)
}

func traverse(n *html.Node, depth int) {
	if n.Type == html.ElementNode {
		indent := strings.Repeat(" ", depth)
		fmt.Printf("%s<%s", indent, n.Data)
		// Print attributes.
		for _, attr := range n.Attr {
			fmt.Printf(" %s=\"%s\"", attr.Key, attr.Val)
		}
		fmt.Println(">")
	}
	// Recursively traverse child nodes.
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		traverse(c, depth+1)
	}
}
```
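The tree-based html.Parse API above is usually what you want, but the streaming tokenizer mentioned in the introduction is also available directly, which helps when you don't need the full tree. A minimal sketch (the sample markup here is just an illustration):

```go
package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

func main() {
	z := html.NewTokenizer(strings.NewReader(`<p>Hello <b>world</b></p>`))
	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			// io.EOF marks the end of input; any other error is a real failure.
			return
		}
		if tt == html.StartTagToken {
			name, _ := z.TagName()
			fmt.Println("start tag:", string(name))
		}
	}
}
```

Because the tokenizer never materializes the whole tree, it is a good fit for very large documents.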
Extracting Specific Data
Here's a more practical example that extracts specific data from HTML:
```go
package main

import (
	"fmt"
	"log"
	"strings"

	"golang.org/x/net/html"
)

type Product struct {
	ID    string
	Name  string
	Price string
}

func main() {
	htmlContent := `
<html>
<body>
	<h1>Product Catalog</h1>
	<div class="product" data-id="1">
		<h2>Laptop</h2>
		<span class="price">$999</span>
	</div>
	<div class="product" data-id="2">
		<h2>Mouse</h2>
		<span class="price">$29</span>
	</div>
</body>
</html>`

	doc, err := html.Parse(strings.NewReader(htmlContent))
	if err != nil {
		log.Fatal(err)
	}

	// Extract all product information.
	products := extractProducts(doc)
	for _, product := range products {
		fmt.Printf("ID: %s, Name: %s, Price: %s\n",
			product.ID, product.Name, product.Price)
	}
}

func extractProducts(n *html.Node) []Product {
	var products []Product
	var findProducts func(*html.Node)
	findProducts = func(node *html.Node) {
		if node.Type == html.ElementNode && node.Data == "div" && hasClass(node, "product") {
			product := Product{
				ID: getAttr(node, "data-id"),
			}
			// Walk the product subtree for the name and price.
			var extractData func(*html.Node)
			extractData = func(child *html.Node) {
				if child.Type == html.ElementNode {
					switch child.Data {
					case "h2":
						product.Name = getTextContent(child)
					case "span":
						if hasClass(child, "price") {
							product.Price = getTextContent(child)
						}
					}
				}
				for c := child.FirstChild; c != nil; c = c.NextSibling {
					extractData(c)
				}
			}
			extractData(node)
			products = append(products, product)
		}
		for c := node.FirstChild; c != nil; c = c.NextSibling {
			findProducts(c)
		}
	}
	findProducts(n)
	return products
}

// hasClass reports whether n's class attribute contains className.
func hasClass(n *html.Node, className string) bool {
	for _, attr := range n.Attr {
		if attr.Key == "class" {
			for _, class := range strings.Fields(attr.Val) {
				if class == className {
					return true
				}
			}
		}
	}
	return false
}

// getAttr returns the value of the named attribute, or "" if absent.
func getAttr(n *html.Node, key string) string {
	for _, attr := range n.Attr {
		if attr.Key == key {
			return attr.Val
		}
	}
	return ""
}

// getTextContent concatenates the trimmed text of all text nodes under n.
func getTextContent(n *html.Node) string {
	var result strings.Builder
	var extract func(*html.Node)
	extract = func(node *html.Node) {
		if node.Type == html.TextNode {
			result.WriteString(strings.TrimSpace(node.Data))
		}
		for c := node.FirstChild; c != nil; c = c.NextSibling {
			extract(c)
		}
	}
	extract(n)
	return result.String()
}
```
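Running this prints one line per product: `ID: 1, Name: Laptop, Price: $999` and `ID: 2, Name: Mouse, Price: $29`.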
Parsing HTML from Web Requests
When parsing HTML from web pages, combine the parser with HTTP requests:
```go
package main

import (
	"fmt"
	"log"
	"net/http"
	"strings"
	"time"

	"golang.org/x/net/html"
)

func main() {
	// Create an HTTP client with a timeout.
	client := &http.Client{
		Timeout: 10 * time.Second,
	}

	// Fetch HTML from a URL.
	resp, err := client.Get("https://example.com")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	// Check the response status (resp.Status already includes the code).
	if resp.StatusCode != http.StatusOK {
		log.Fatalf("HTTP error: %s", resp.Status)
	}

	// Parse HTML directly from the response body.
	doc, err := html.Parse(resp.Body)
	if err != nil {
		log.Fatal(err)
	}

	// Extract the page title.
	title := findTitle(doc)
	fmt.Printf("Page title: %s\n", title)
}

func findTitle(n *html.Node) string {
	var title string
	var find func(*html.Node)
	find = func(node *html.Node) {
		if title != "" {
			return // stop once the first <title> has been found
		}
		if node.Type == html.ElementNode && node.Data == "title" {
			title = getTextContent(node)
			return
		}
		for c := node.FirstChild; c != nil; c = c.NextSibling {
			find(c)
		}
	}
	find(n)
	return title
}

// getTextContent is the same helper as in the previous example.
func getTextContent(n *html.Node) string {
	var result strings.Builder
	var extract func(*html.Node)
	extract = func(node *html.Node) {
		if node.Type == html.TextNode {
			result.WriteString(strings.TrimSpace(node.Data))
		}
		for c := node.FirstChild; c != nil; c = c.NextSibling {
			extract(c)
		}
	}
	extract(n)
	return result.String()
}
```
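Note that html.Parse expects UTF-8 input. For pages served in other encodings, the companion golang.org/x/net/html/charset package can convert the body before parsing. A sketch of the extra step, assuming the resp value from the example above:

```go
import "golang.org/x/net/html/charset"

// Convert the body to UTF-8, using the Content-Type header
// (plus byte sniffing) to detect the source encoding.
reader, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type"))
if err != nil {
	log.Fatal(err)
}
doc, err := html.Parse(reader)
```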
Understanding Node Types
Every html.Node carries a Type field that identifies what kind of node it is:
```go
switch n.Type {
case html.ErrorNode:
	// A parse error occurred.
case html.TextNode:
	// Text content.
	fmt.Println("Text:", n.Data)
case html.DocumentNode:
	// The root document node.
case html.ElementNode:
	// An HTML element (tag).
	fmt.Println("Element:", n.Data)
case html.CommentNode:
	// An HTML comment.
	fmt.Println("Comment:", n.Data)
case html.DoctypeNode:
	// The DOCTYPE declaration.
}
```
Alternative Libraries
While golang.org/x/net/html is the standard choice, you might also consider:
- goquery: jQuery-like DOM manipulation, built on top of net/html
- colly: a full-featured web scraping framework
Install goquery for CSS selector support:
```
go get github.com/PuerkitoBio/goquery
```
import "github.com/PuerkitoBio/goquery"
doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
if err != nil {
log.Fatal(err)
}
// Use CSS selectors
doc.Find(".product h2").Each(func(i int, s *goquery.Selection) {
fmt.Println("Product:", s.Text())
})
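Because goquery wraps the same *html.Node tree (each Selection exposes the underlying nodes via its Nodes field), you can mix CSS selectors with the manual traversal shown earlier when a selector alone isn't enough.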
Best Practices
- Always handle errors when parsing HTML
- Use timeouts for HTTP requests
- Validate node types before accessing node data
- Consider using goquery for complex CSS selector needs
- Remember that html.Parse error-corrects malformed markup the way browsers do and only fails on read errors, so inspect the resulting tree rather than relying on parse errors
- Be respectful when scraping websites: add delays between requests and honor robots.txt (see the sketch below)
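As one way to honor the last point, requests can be spaced out with a ticker. This is a minimal sketch; the one-second interval and the URL list are assumptions for the example:

```go
package main

import (
	"fmt"
	"net/http"
	"time"
)

func main() {
	urls := []string{"https://example.com/a", "https://example.com/b"}
	ticker := time.NewTicker(1 * time.Second) // at most one request per second
	defer ticker.Stop()

	client := &http.Client{Timeout: 10 * time.Second}
	for _, u := range urls {
		<-ticker.C // wait for the next tick before each request
		resp, err := client.Get(u)
		if err != nil {
			fmt.Println("fetch error:", err)
			continue
		}
		resp.Body.Close()
		fmt.Println(u, resp.Status)
	}
}
```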