Can I use Colly to scrape XML sitemaps?

Yes, Colly is an excellent choice for scraping XML sitemaps in Go. Colly provides robust XML parsing capabilities through its OnXML callback, making it straightforward to extract URLs, metadata, and other information from sitemap files. This approach is particularly useful for SEO analysis, website crawling, and automated content discovery.

Understanding XML Sitemaps

XML sitemaps are structured files that help search engines discover and index website content. They typically contain:

  • URL locations (<loc> elements)
  • Last modification dates (<lastmod> elements)
  • Change frequencies (<changefreq> elements)
  • Priority values (<priority> elements)
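
For reference, a simplified sitemap file looks like the following; the XPath queries used in the examples below (such as //url and //sitemap/loc) target these elements:

<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://example.com/blog/post-1</loc>
    <lastmod>2024-01-15</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.8</priority>
  </url>
</urlset>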

Basic Sitemap Scraping with Colly

Here's a fundamental example of how to scrape an XML sitemap using Colly:

package main

import (
    "fmt"
    "log"
    "time"

    "github.com/gocolly/colly/v2"
)

type SitemapURL struct {
    Location    string
    LastMod     string
    ChangeFreq  string
    Priority    string
}

func main() {
    c := colly.NewCollector()

    var urls []SitemapURL

    // Set up XML callback for URL elements
    c.OnXML("//url", func(e *colly.XMLElement) {
        url := SitemapURL{
            Location:   e.ChildText("loc"),
            LastMod:    e.ChildText("lastmod"),
            ChangeFreq: e.ChildText("changefreq"),
            Priority:   e.ChildText("priority"),
        }
        urls = append(urls, url)
        fmt.Printf("Found URL: %s\n", url.Location)
    })

    // Handle errors
    c.OnError(func(r *colly.Response, err error) {
        log.Printf("Error occurred: %v", err)
    })

    // Visit the sitemap
    err := c.Visit("https://example.com/sitemap.xml")
    if err != nil {
        log.Fatal(err)
    }

    fmt.Printf("Total URLs found: %d\n", len(urls))
}

Advanced Sitemap Processing

For more sophisticated sitemap processing, you can implement filtering, validation, and data processing:

package main

import (
    "fmt"
    "log"
    "regexp"
    "strings"
    "time"

    "github.com/gocolly/colly/v2"
    "github.com/gocolly/colly/v2/debug"
)

// SitemapURL is the same struct used in the basic example above.
type SitemapURL struct {
    Location   string
    LastMod    string
    ChangeFreq string
    Priority   string
}

type SitemapProcessor struct {
    collector *colly.Collector
    urls      []SitemapURL
    filters   []URLFilter
}

type URLFilter func(SitemapURL) bool

func NewSitemapProcessor() *SitemapProcessor {
    c := colly.NewCollector(
        colly.Debugger(&debug.LogDebugger{}),
    )

    // Set user agent
    c.UserAgent = "SitemapScraper/1.0"

    // Set timeout
    c.SetRequestTimeout(30 * time.Second)

    return &SitemapProcessor{
        collector: c,
        urls:      make([]SitemapURL, 0),
        filters:   make([]URLFilter, 0),
    }
}

func (sp *SitemapProcessor) AddFilter(filter URLFilter) {
    sp.filters = append(sp.filters, filter)
}

func (sp *SitemapProcessor) ProcessSitemap(sitemapURL string) error {
    sp.collector.OnXML("//url", func(e *colly.XMLElement) {
        url := SitemapURL{
            Location:   strings.TrimSpace(e.ChildText("loc")),
            LastMod:    strings.TrimSpace(e.ChildText("lastmod")),
            ChangeFreq: strings.TrimSpace(e.ChildText("changefreq")),
            Priority:   strings.TrimSpace(e.ChildText("priority")),
        }

        // Apply filters
        for _, filter := range sp.filters {
            if !filter(url) {
                return // Skip this URL
            }
        }

        sp.urls = append(sp.urls, url)
    })

    return sp.collector.Visit(sitemapURL)
}

func (sp *SitemapProcessor) GetURLs() []SitemapURL {
    return sp.urls
}

// Example filters
func BlogPostFilter() URLFilter {
    return func(url SitemapURL) bool {
        matched, _ := regexp.MatchString(`/blog/`, url.Location)
        return matched
    }
}

func RecentlyModifiedFilter(days int) URLFilter {
    cutoff := time.Now().AddDate(0, 0, -days)
    return func(url SitemapURL) bool {
        if url.LastMod == "" {
            return true // Include URLs without lastmod
        }

        // lastmod uses the W3C Datetime format: a bare date or a full timestamp
        lastMod, err := time.Parse("2006-01-02", url.LastMod)
        if err != nil {
            lastMod, err = time.Parse(time.RFC3339, url.LastMod)
        }
        if err != nil {
            return true // Include URLs with unparseable dates
        }

        return lastMod.After(cutoff)
    }
}

func main() {
    processor := NewSitemapProcessor()

    // Add filters
    processor.AddFilter(BlogPostFilter())
    processor.AddFilter(RecentlyModifiedFilter(30)) // Last 30 days

    err := processor.ProcessSitemap("https://example.com/sitemap.xml")
    if err != nil {
        log.Fatal(err)
    }

    urls := processor.GetURLs()
    fmt.Printf("Filtered URLs: %d\n", len(urls))

    for _, url := range urls {
        fmt.Printf("URL: %s, LastMod: %s, Priority: %s\n", 
            url.Location, url.LastMod, url.Priority)
    }
}

Handling Sitemap Index Files

Many websites use sitemap index files that reference multiple sitemap files. Here's how to handle them:

func ProcessSitemapIndex(indexURL string) error {
    c := colly.NewCollector()

    var sitemapURLs []string

    // Extract sitemap URLs from index
    c.OnXML("//sitemap/loc", func(e *colly.XMLElement) {
        sitemapURL := strings.TrimSpace(e.Text)
        sitemapURLs = append(sitemapURLs, sitemapURL)
        fmt.Printf("Found sitemap: %s\n", sitemapURL)
    })

    err := c.Visit(indexURL)
    if err != nil {
        return err
    }

    // Process each individual sitemap
    for _, sitemapURL := range sitemapURLs {
        fmt.Printf("Processing sitemap: %s\n", sitemapURL)

        processor := NewSitemapProcessor()
        err := processor.ProcessSitemap(sitemapURL)
        if err != nil {
            log.Printf("Error processing %s: %v", sitemapURL, err)
            continue
        }

        urls := processor.GetURLs()
        fmt.Printf("Found %d URLs in %s\n", len(urls), sitemapURL)
    }

    return nil
}

Error Handling and Rate Limiting

Proper error handling and rate limiting are crucial for reliable sitemap scraping:

func NewRobustSitemapProcessor() *SitemapProcessor {
    c := colly.NewCollector()

    // Add rate limiting
    limit := &colly.LimitRule{
        DomainGlob:  "*",
        Parallelism: 2,
        Delay:       1 * time.Second,
    }
    if err := c.Limit(limit); err != nil {
        log.Printf("Failed to apply limit rule: %v", err)
    }

    // Handle HTTP errors
    c.OnResponse(func(r *colly.Response) {
        if r.StatusCode != 200 {
            log.Printf("Non-200 status code: %d for URL: %s", 
                r.StatusCode, r.Request.URL)
        }
    })

    c.OnError(func(r *colly.Response, err error) {
        log.Printf("Request failed: %v", err)
        if r != nil {
            log.Printf("Status Code: %d", r.StatusCode)
        }
    })

    return &SitemapProcessor{
        collector: c,
        urls:      make([]SitemapURL, 0),
        filters:   make([]URLFilter, 0),
    }
}
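
Note that a LimitRule's Parallelism setting only produces concurrent requests when the collector runs in asynchronous mode. A minimal sketch of enabling that (the rest of the setup stays the same):

    c := colly.NewCollector(colly.Async(true))

    // Pace and cap concurrent requests per domain
    if err := c.Limit(&colly.LimitRule{
        DomainGlob:  "*",
        Parallelism: 2,
        Delay:       1 * time.Second,
    }); err != nil {
        log.Printf("Failed to apply limit rule: %v", err)
    }

    // With Async(true), Visit returns immediately; Wait blocks until
    // all in-flight requests have completed.
    if err := c.Visit("https://example.com/sitemap.xml"); err != nil {
        log.Fatal(err)
    }
    c.Wait()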

Validating Sitemap Data

Implement validation to ensure data quality:

func ValidateURL(url SitemapURL) bool {
    // Check if URL is valid
    if url.Location == "" {
        return false
    }

    // Validate URL format
    urlRegex := regexp.MustCompile(`^https?://[^\s/$.?#].[^\s]*$`)
    if !urlRegex.MatchString(url.Location) {
        return false
    }

    // Validate change frequency
    validFreqs := []string{"always", "hourly", "daily", "weekly", 
                          "monthly", "yearly", "never", ""}
    freqValid := false
    for _, freq := range validFreqs {
        if url.ChangeFreq == freq {
            freqValid = true
            break
        }
    }

    return freqValid
}
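
Because ValidateURL has the same signature as the URLFilter type defined earlier, it can be plugged straight into the processor:

    processor := NewSitemapProcessor()
    processor.AddFilter(ValidateURL) // drop entries that fail validation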

Comparing with JavaScript Alternatives

While Colly excels at XML sitemap processing, scenarios that require JavaScript execution are better served by browser automation tools. For complex dynamic content that needs JavaScript rendering, you might want to explore how to handle AJAX requests using Puppeteer for JavaScript-heavy websites.

Command Line Usage

Create a command-line tool for sitemap analysis:

# Build the sitemap scraper
go build -o sitemap-scraper main.go

# Run with different options
./sitemap-scraper -url="https://example.com/sitemap.xml" -filter="blog"
./sitemap-scraper -url="https://example.com/sitemap.xml" -days=7 -output="urls.json"
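
The flags shown above are illustrative and not implemented in the earlier snippets. A minimal sketch of wiring them up with the standard flag package might look like this (the -url, -filter, -days, and -output names are assumptions, and the flag, encoding/json, and os imports are required in addition to those already shown):

func main() {
    sitemapURL := flag.String("url", "", "sitemap URL to process")
    filterTerm := flag.String("filter", "", "only keep URLs containing this substring")
    days := flag.Int("days", 0, "only keep URLs modified within the last N days (0 = no limit)")
    output := flag.String("output", "", "optional JSON file to write the results to")
    flag.Parse()

    if *sitemapURL == "" {
        log.Fatal("the -url flag is required")
    }

    processor := NewSitemapProcessor()
    if *filterTerm != "" {
        term := *filterTerm
        processor.AddFilter(func(u SitemapURL) bool {
            return strings.Contains(u.Location, term)
        })
    }
    if *days > 0 {
        processor.AddFilter(RecentlyModifiedFilter(*days))
    }

    if err := processor.ProcessSitemap(*sitemapURL); err != nil {
        log.Fatal(err)
    }

    urls := processor.GetURLs()
    fmt.Printf("Filtered URLs: %d\n", len(urls))

    if *output != "" {
        data, err := json.MarshalIndent(urls, "", "  ")
        if err != nil {
            log.Fatal(err)
        }
        if err := os.WriteFile(*output, data, 0o644); err != nil {
            log.Fatal(err)
        }
    }
}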

Performance Considerations

For large sitemaps, consider these optimizations:

  1. Memory Management: Process sitemaps in chunks for very large files
  2. Concurrent Processing: Use goroutines for processing multiple sitemaps
  3. Caching: Implement caching to avoid re-processing unchanged sitemaps
  4. Streaming: For extremely large sitemaps, consider streaming XML parsing (a sketch follows the concurrency example below)

// Example of concurrent sitemap processing (uses the standard "sync" package)
func ProcessSitemapsConcurrently(sitemapURLs []string) {
    var wg sync.WaitGroup
    results := make(chan []SitemapURL, len(sitemapURLs))

    for _, url := range sitemapURLs {
        wg.Add(1)
        go func(sitemapURL string) {
            defer wg.Done()

            processor := NewSitemapProcessor()
            err := processor.ProcessSitemap(sitemapURL)
            if err != nil {
                log.Printf("Error processing %s: %v", sitemapURL, err)
                return
            }

            results <- processor.GetURLs()
        }(url)
    }

    wg.Wait()
    close(results)

    // Collect all results
    var allURLs []SitemapURL
    for urls := range results {
        allURLs = append(allURLs, urls...)
    }

    fmt.Printf("Total URLs collected: %d\n", len(allURLs))
}
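
For the streaming approach mentioned in the list above, Colly is not strictly required: the standard library's encoding/xml Decoder can walk a very large sitemap element by element without buffering the whole document. A minimal sketch, assuming the standard urlset/url layout (requires the encoding/xml, io, and net/http imports):

func StreamSitemapLocations(sitemapURL string, handle func(loc, lastmod string)) error {
    resp, err := http.Get(sitemapURL)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    dec := xml.NewDecoder(resp.Body)
    for {
        tok, err := dec.Token()
        if err == io.EOF {
            return nil
        }
        if err != nil {
            return err
        }

        // Decode each <url> element as it is encountered instead of
        // loading the entire sitemap into memory.
        if se, ok := tok.(xml.StartElement); ok && se.Name.Local == "url" {
            var entry struct {
                Loc     string `xml:"loc"`
                LastMod string `xml:"lastmod"`
            }
            if err := dec.DecodeElement(&entry, &se); err != nil {
                return err
            }
            handle(entry.Loc, entry.LastMod)
        }
    }
}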

Integration with Other Tools

Colly's sitemap scraping capabilities integrate well with other web scraping workflows. After extracting URLs from sitemaps, you can use the same URLs for comprehensive content analysis or combine sitemap data with browser automation tools for handling authentication in Puppeteer when dealing with protected content.

Conclusion

Colly provides robust capabilities for XML sitemap scraping in Go, offering excellent performance, built-in XML parsing, and flexible filtering options. Its OnXML callback system makes it straightforward to extract and process sitemap data, while its rate limiting and error handling features ensure reliable operation. Whether you're building SEO tools, content discovery systems, or website monitoring applications, Colly's sitemap scraping capabilities provide a solid foundation for your Go-based web scraping projects.

The combination of Colly's efficiency and Go's concurrency features makes it particularly well-suited for processing large-scale sitemap operations, ensuring your applications can handle enterprise-level website analysis tasks effectively.

Try WebScraping.AI for Your Web Scraping Needs

Looking for a powerful web scraping solution? WebScraping.AI provides an LLM-powered API that combines Chromium JavaScript rendering with rotating proxies for reliable data extraction.

Key Features:

  • AI-powered extraction: Ask questions about web pages or extract structured data fields
  • JavaScript rendering: Full Chromium browser support for dynamic content
  • Rotating proxies: Datacenter and residential proxies from multiple countries
  • Easy integration: Simple REST API with SDKs for Python, Ruby, PHP, and more
  • Reliable & scalable: Built for developers who need consistent results

Getting Started:

Get page content with AI analysis:

curl "https://api.webscraping.ai/ai/question?url=https://example.com&question=What is the main topic?&api_key=YOUR_API_KEY"

Extract structured data:

curl "https://api.webscraping.ai/ai/fields?url=https://example.com&fields[title]=Page title&fields[price]=Product price&api_key=YOUR_API_KEY"
