How do I extract data from specific HTML elements using Colly?
Colly is a powerful Go framework for web scraping that provides elegant methods for extracting data from HTML elements. This guide covers the essential techniques for targeting and extracting data from specific HTML elements using Colly's built-in selection capabilities.
Basic Element Selection with Colly
Colly uses the OnHTML callback function to select and process HTML elements. The callback accepts CSS selectors similar to jQuery, making it familiar for developers with front-end experience.
Simple Element Selection
package main

import (
    "fmt"

    "github.com/gocolly/colly/v2"
)

func main() {
    c := colly.NewCollector()

    // Extract text from all h1 elements
    c.OnHTML("h1", func(e *colly.HTMLElement) {
        fmt.Println("Title:", e.Text)
    })

    // Extract href attributes from all links
    c.OnHTML("a[href]", func(e *colly.HTMLElement) {
        link := e.Attr("href")
        fmt.Println("Link:", link)
    })

    // Visit the target URL
    c.Visit("https://example.com")
}
Advanced CSS Selectors
Colly supports complex CSS selectors for precise element targeting:
c := colly.NewCollector()

// Select elements by class
c.OnHTML(".product-title", func(e *colly.HTMLElement) {
    fmt.Println("Product:", e.Text)
})

// Select elements by ID
c.OnHTML("#main-content", func(e *colly.HTMLElement) {
    fmt.Println("Main content:", e.Text)
})

// Select direct children (the > combinator matches only immediate children)
c.OnHTML("div.container > p", func(e *colly.HTMLElement) {
    fmt.Println("Paragraph:", e.Text)
})

// Select elements with specific attributes
c.OnHTML("img[alt]", func(e *colly.HTMLElement) {
    alt := e.Attr("alt")
    src := e.Attr("src")
    fmt.Printf("Image: %s (source: %s)\n", alt, src)
})

// Select elements carrying both classes
c.OnHTML(".item.featured", func(e *colly.HTMLElement) {
    fmt.Println("Featured item:", e.Text)
})
Extracting Different Types of Data
Text Content Extraction
c := colly.NewCollector()

// Extract plain text content
c.OnHTML("p", func(e *colly.HTMLElement) {
    text := e.Text
    fmt.Println("Paragraph text:", text)
})

// Extract and clean text content (requires importing "strings")
c.OnHTML(".description", func(e *colly.HTMLElement) {
    text := strings.TrimSpace(e.Text)
    if text != "" {
        fmt.Println("Description:", text)
    }
})
Attribute Extraction
c := colly.NewCollector()

// Extract multiple attributes from the same element
c.OnHTML("a", func(e *colly.HTMLElement) {
    href := e.Attr("href")
    title := e.Attr("title")
    text := e.Text
    fmt.Printf("Link: %s, Title: %s, Text: %s\n", href, title, text)
})

// Extract data attributes
c.OnHTML("[data-price]", func(e *colly.HTMLElement) {
    price := e.Attr("data-price")
    currency := e.Attr("data-currency")
    fmt.Printf("Price: %s %s\n", price, currency)
})
HTML Content Extraction
c := colly.NewCollector()

// Extract inner HTML via the underlying goquery selection
c.OnHTML(".content-block", func(e *colly.HTMLElement) {
    innerHTML, err := e.DOM.Html()
    if err == nil {
        fmt.Println("Inner HTML:", innerHTML)
    }
})

// Extract outer HTML (requires importing "github.com/PuerkitoBio/goquery")
c.OnHTML("article", func(e *colly.HTMLElement) {
    outerHTML, err := goquery.OuterHtml(e.DOM)
    if err == nil {
        fmt.Println("Outer HTML:", outerHTML)
    }
})
Working with Complex HTML Structures
Extracting Data from Tables
package main

import (
    "fmt"

    "github.com/gocolly/colly/v2"
)

type TableRow struct {
    Name  string
    Value string
    Type  string
}

func main() {
    c := colly.NewCollector()
    var tableData []TableRow

    // Extract data from table rows
    c.OnHTML("table tbody tr", func(e *colly.HTMLElement) {
        row := TableRow{
            Name:  e.ChildText("td:nth-child(1)"),
            Value: e.ChildText("td:nth-child(2)"),
            Type:  e.ChildText("td:nth-child(3)"),
        }
        tableData = append(tableData, row)
    })

    c.OnScraped(func(r *colly.Response) {
        for _, row := range tableData {
            fmt.Printf("Name: %s, Value: %s, Type: %s\n",
                row.Name, row.Value, row.Type)
        }
    })

    c.Visit("https://example.com/data-table")
}
Extracting Structured Data from Lists
c := colly.NewCollector()

type Product struct {
    Title       string
    Price       string
    Description string
    ImageURL    string
}

var products []Product

// Extract product information from list items
c.OnHTML(".product-list .product-item", func(e *colly.HTMLElement) {
    product := Product{
        Title:       e.ChildText(".product-title"),
        Price:       e.ChildText(".price"),
        Description: e.ChildText(".description"),
        ImageURL:    e.ChildAttr("img", "src"),
    }
    products = append(products, product)
})
Handling Nested Elements
c := colly.NewCollector()

// Extract data from nested structures
c.OnHTML(".article", func(e *colly.HTMLElement) {
    title := e.ChildText("h2.title")
    author := e.ChildText(".meta .author")
    date := e.ChildText(".meta .date")
    content := e.ChildText(".content p")

    // Extract all tags
    var tags []string
    e.ForEach(".tags .tag", func(i int, el *colly.HTMLElement) {
        tags = append(tags, el.Text)
    })

    fmt.Printf("Article: %s by %s on %s\n", title, author, date)
    fmt.Printf("Tags: %v\n", tags)
    fmt.Printf("Content: %s\n", content)
})
Advanced Selection Techniques
Using ForEach for Multiple Elements
c := colly.NewCollector()

// Process multiple similar elements
c.OnHTML(".container", func(e *colly.HTMLElement) {
    // Extract all product cards within the container
    e.ForEach(".product-card", func(i int, el *colly.HTMLElement) {
        name := el.ChildText(".product-name")
        price := el.ChildText(".price")
        fmt.Printf("Product %d: %s - %s\n", i+1, name, price)
    })
})
Conditional Data Extraction
c := colly.NewCollector()

c.OnHTML(".item", func(e *colly.HTMLElement) {
    // Check for a specific class before processing; HasClass is
    // order-independent, unlike comparing the raw class attribute string
    if e.DOM.HasClass("featured") {
        title := e.ChildText(".title")
        fmt.Println("Featured item:", title)
    }

    // Check for presence of child elements
    if e.DOM.Find(".price").Length() > 0 {
        price := e.ChildText(".price")
        fmt.Println("Price available:", price)
    }
})
Working with Dynamic Content
Colly excels at static HTML extraction but does not execute JavaScript, so for content rendered client-side you may need a browser automation tool such as Puppeteer.
c := colly.NewCollector()

// Set up a request delay for rate limiting (requires importing "time";
// Limit returns an error that should be checked)
if err := c.Limit(&colly.LimitRule{
    DomainGlob:  "*",
    Parallelism: 2,
    Delay:       1 * time.Second,
}); err != nil {
    fmt.Println("limit rule error:", err)
}

// Extract content marked as loaded; note that Colly only sees the
// server-rendered HTML, so this attribute must be present in the
// initial response, not added later by JavaScript
c.OnHTML("[data-loaded='true']", func(e *colly.HTMLElement) {
    content := e.Text
    fmt.Println("Dynamic content:", content)
})
Error Handling and Validation
Robust Data Extraction
c := colly.NewCollector()

c.OnHTML(".product", func(e *colly.HTMLElement) {
    // Safe extraction with validation (requires importing "strings")
    title := strings.TrimSpace(e.ChildText(".title"))
    if title == "" {
        fmt.Println("Warning: Empty title found")
        return
    }

    priceText := e.ChildText(".price")
    if priceText == "" {
        fmt.Println("Warning: Price not found for", title)
    }

    // Convert relative image URLs to absolute before use
    imageURL := e.ChildAttr("img", "src")
    if imageURL != "" && !strings.HasPrefix(imageURL, "http") {
        imageURL = e.Request.AbsoluteURL(imageURL)
    }

    fmt.Printf("Product: %s, Price: %s, Image: %s\n",
        title, priceText, imageURL)
})

// Handle errors
c.OnError(func(r *colly.Response, err error) {
    fmt.Printf("Error: %s\n", err.Error())
})
Performance Optimization
Efficient Element Selection
c := colly.NewCollector()

// Use specific selectors to reduce processing overhead;
// "div.content article.post" is cheaper than a broad "div article"
c.OnHTML("div.content article.post", func(e *colly.HTMLElement) {
    title := e.ChildText("h1")
    content := e.ChildText(".post-content")
    fmt.Printf("Post: %s\nContent: %s\n", title, content)
})

// Limit processing to necessary elements only
c.OnHTML(".main-content", func(e *colly.HTMLElement) {
    // Process only the main content area instead of the entire page
    e.ForEach(".item", func(i int, el *colly.HTMLElement) {
        // Extract only the required data
        name := el.ChildText(".name")
        if name != "" {
            fmt.Println("Item:", name)
        }
    })
})
Integration with Data Storage
Structured Data Extraction
package main

import (
    "encoding/json"
    "fmt"
    "os"

    "github.com/gocolly/colly/v2"
)

type ScrapedData struct {
    Articles []Article `json:"articles"`
}

type Article struct {
    Title   string   `json:"title"`
    Author  string   `json:"author"`
    Date    string   `json:"date"`
    Content string   `json:"content"`
    Tags    []string `json:"tags"`
    URL     string   `json:"url"`
}

func main() {
    c := colly.NewCollector()
    var data ScrapedData

    c.OnHTML("article", func(e *colly.HTMLElement) {
        var tags []string
        e.ForEach(".tag", func(i int, el *colly.HTMLElement) {
            tags = append(tags, el.Text)
        })

        article := Article{
            Title:   e.ChildText("h1"),
            Author:  e.ChildText(".author"),
            Date:    e.ChildText(".date"),
            Content: e.ChildText(".content"),
            Tags:    tags,
            URL:     e.Request.URL.String(),
        }
        data.Articles = append(data.Articles, article)
    })

    c.OnScraped(func(r *colly.Response) {
        // Save extracted data to a JSON file, checking for errors
        jsonData, err := json.MarshalIndent(data, "", "  ")
        if err != nil {
            fmt.Println("marshal error:", err)
            return
        }
        if err := os.WriteFile("scraped_data.json", jsonData, 0644); err != nil {
            fmt.Println("write error:", err)
            return
        }
        fmt.Printf("Extracted %d articles\n", len(data.Articles))
    })

    c.Visit("https://example.com/articles")
}
Best Practices for Element Selection
- Use Specific Selectors: Choose the most specific CSS selector that reliably targets your desired elements
- Validate Data: Always check for empty or missing data before processing
- Handle Errors Gracefully: Implement proper error handling for missing elements
- Optimize Performance: Use targeted selectors to minimize unnecessary processing
- Test Selectors: Validate your CSS selectors in browser developer tools before implementation
Conclusion
Colly provides powerful and flexible methods for extracting data from HTML elements using familiar CSS selectors. By combining proper element selection techniques with robust error handling and data validation, you can build efficient and reliable web scrapers. For more complex scenarios involving dynamic content, consider integrating Colly with browser automation tools or exploring advanced request handling techniques.
The key to successful data extraction with Colly lies in understanding your target website's HTML structure and choosing the appropriate selection methods for your specific use case. Start with simple selectors and gradually build complexity as needed, always prioritizing code maintainability and error resilience.