How can I scrape Google Search results using Go and Colly?

Scraping Google Search results with Go and the Colly framework is an effective way to gather search data programmatically. Colly is a fast, elegant scraping framework for Go whose built-in support for rate limiting, request callbacks, and proxy switching is useful against a heavily defended target like Google Search. Keep in mind that Google changes its result markup frequently, so the CSS selectors in this guide may need periodic updates.

Setting Up Colly for Google Search Scraping

First, create a module and install Colly (the debug and proxy helpers used below ship as subpackages of the same module, so one go get is enough):

go mod init google-scraper
go get github.com/gocolly/colly/v2

Basic Google Search Scraper Implementation

Here's a complete implementation for scraping Google Search results:

package main

import (
    "fmt"
    "log"
    "net/url"
    "strings"
    "time"

    "github.com/gocolly/colly/v2"
    "github.com/gocolly/colly/v2/debug"
)

type SearchResult struct {
    Title       string
    URL         string
    Description string
    Position    int
}

func main() {
    // Create a new collector with configuration
    c := colly.NewCollector(
        colly.Debugger(&debug.LogDebugger{}),
        colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"),
    )

    // Configure request delays and limits (Limit returns an error
    // if the rule is invalid, so check it)
    if err := c.Limit(&colly.LimitRule{
        DomainGlob:  "*google.*",
        Parallelism: 1,
        Delay:       2 * time.Second,
    }); err != nil {
        log.Fatal(err)
    }

    var results []SearchResult
    position := 1

    // Set up callbacks for search results
    c.OnHTML("div.g", func(e *colly.HTMLElement) {
        result := SearchResult{
            Position: position,
        }

        // Extract title and URL
        titleElement := e.DOM.Find("h3").First()
        if titleElement.Length() > 0 {
            result.Title = strings.TrimSpace(titleElement.Text())

            // Get the URL from the parent link
            linkElement := titleElement.ParentsFiltered("a").First()
            if href, exists := linkElement.Attr("href"); exists {
                if cleanURL := cleanGoogleURL(href); cleanURL != "" {
                    result.URL = cleanURL
                }
            }
        }

        // Extract description
        descElement := e.DOM.Find("span[style*='-webkit-line-clamp:2']").First()
        if descElement.Length() == 0 {
            descElement = e.DOM.Find("div[style*='-webkit-line-clamp:2']").First()
        }
        if descElement.Length() > 0 {
            result.Description = strings.TrimSpace(descElement.Text())
        }

        // Only add valid results
        if result.Title != "" && result.URL != "" {
            results = append(results, result)
            position++
        }
    })

    // Handle errors
    c.OnError(func(r *colly.Response, err error) {
        log.Printf("Error scraping %s: %v", r.Request.URL, err)
    })

    // Log requests
    c.OnRequest(func(r *colly.Request) {
        log.Printf("Visiting: %s", r.URL.String())
    })

    // Scrape search results
    query := "golang web scraping"
    searchURL := buildGoogleSearchURL(query, 0)

    err := c.Visit(searchURL)
    if err != nil {
        log.Fatal(err)
    }

    // Print results
    fmt.Printf("Found %d results for query: %s\n\n", len(results), query)
    for _, result := range results {
        fmt.Printf("Position: %d\n", result.Position)
        fmt.Printf("Title: %s\n", result.Title)
        fmt.Printf("URL: %s\n", result.URL)
        fmt.Printf("Description: %s\n", result.Description)
        fmt.Println("---")
    }
}

func buildGoogleSearchURL(query string, start int) string {
    baseURL := "https://www.google.com/search"
    params := url.Values{}
    params.Add("q", query)
    params.Add("start", fmt.Sprintf("%d", start))
    params.Add("num", "10") // Number of results per page

    return fmt.Sprintf("%s?%s", baseURL, params.Encode())
}

func cleanGoogleURL(rawURL string) string {
    // Remove Google's redirect parameters
    if strings.HasPrefix(rawURL, "/url?q=") {
        parsed, err := url.Parse(rawURL)
        if err != nil {
            return ""
        }
        return parsed.Query().Get("q")
    }
    return rawURL
}
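Google sometimes wraps organic links in /url?q=... redirects, which the helper above strips. The logic can be checked standalone; the sample URLs below are made up for illustration:

```go
package main

import (
	"fmt"
	"net/url"
	"strings"
)

// cleanGoogleURL strips Google's /url?q=... redirect wrapper,
// returning the target URL unchanged when no wrapper is present.
func cleanGoogleURL(rawURL string) string {
	if strings.HasPrefix(rawURL, "/url?q=") {
		parsed, err := url.Parse(rawURL)
		if err != nil {
			return ""
		}
		return parsed.Query().Get("q")
	}
	return rawURL
}

func main() {
	// Wrapped redirect: only the q parameter survives
	fmt.Println(cleanGoogleURL("/url?q=https://example.com/page&sa=U&ved=abc"))
	// Direct link: passed through untouched
	fmt.Println(cleanGoogleURL("https://example.com/direct"))
}
```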

Advanced Features and Best Practices

1. Implementing Pagination Support

func scrapeMultiplePages(c *colly.Collector, query string, maxPages int) []SearchResult {
    var allResults []SearchResult

    for page := 0; page < maxPages; page++ {
        start := page * 10
        searchURL := buildGoogleSearchURL(query, start)

        // Clone the collector for each page so the callback is not
        // registered again on every iteration (OnHTML handlers
        // accumulate on a shared collector)
        pageCollector := c.Clone()

        var pageResults []SearchResult
        position := start + 1

        pageCollector.OnHTML("div.g", func(e *colly.HTMLElement) {
            result := extractSearchResult(e, position)
            if result.Title != "" && result.URL != "" {
                pageResults = append(pageResults, result)
                position++
            }
        })

        if err := pageCollector.Visit(searchURL); err != nil {
            log.Printf("Error visiting page %d: %v", page+1, err)
            break
        }

        allResults = append(allResults, pageResults...)

        // Add delay between pages
        time.Sleep(3 * time.Second)
    }

    return allResults
}

// extractSearchResult factors the title/URL/description extraction
// from the basic example into a reusable helper
func extractSearchResult(e *colly.HTMLElement, position int) SearchResult {
    result := SearchResult{Position: position}

    titleElement := e.DOM.Find("h3").First()
    if titleElement.Length() > 0 {
        result.Title = strings.TrimSpace(titleElement.Text())
        linkElement := titleElement.ParentsFiltered("a").First()
        if href, exists := linkElement.Attr("href"); exists {
            result.URL = cleanGoogleURL(href)
        }
    }

    descElement := e.DOM.Find("span[style*='-webkit-line-clamp:2']").First()
    if descElement.Length() == 0 {
        descElement = e.DOM.Find("div[style*='-webkit-line-clamp:2']").First()
    }
    if descElement.Length() > 0 {
        result.Description = strings.TrimSpace(descElement.Text())
    }

    return result
}

2. Enhanced Anti-Detection Measures

func setupAdvancedCollector() *colly.Collector {
    c := colly.NewCollector(
        colly.UserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"),
    )

    // Set up realistic headers. Note: do not set Accept-Encoding
    // manually -- Go's HTTP transport only decompresses gzip
    // transparently when it adds that header itself
    c.OnRequest(func(r *colly.Request) {
        r.Headers.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
        r.Headers.Set("Accept-Language", "en-US,en;q=0.5")
        r.Headers.Set("DNT", "1")
        r.Headers.Set("Connection", "keep-alive")
        r.Headers.Set("Upgrade-Insecure-Requests", "1")
        r.Headers.Set("Cache-Control", "max-age=0")
    })

    // Configure rate limiting
    c.Limit(&colly.LimitRule{
        DomainGlob:  "*google.*",
        Parallelism: 1,
        RandomDelay: 5 * time.Second,
    })

    return c
}
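Rotating the User-Agent across requests is another common evasion step. A minimal sketch follows; the pool below holds example strings, and in practice you would maintain a larger, regularly refreshed list of real browser User-Agents:

```go
package main

import (
	"fmt"
	"math/rand"
)

// userAgents is a small example pool of browser identification strings
var userAgents = []string{
	"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
	"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
	"Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
}

// randomUserAgent picks one entry at random from the pool
func randomUserAgent() string {
	return userAgents[rand.Intn(len(userAgents))]
}

func main() {
	fmt.Println(randomUserAgent())
}
```

To wire this into Colly, set the header per request: c.OnRequest(func(r *colly.Request) { r.Headers.Set("User-Agent", randomUserAgent()) }).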

3. Proxy Rotation Implementation

import (
    "github.com/gocolly/colly/v2/proxy"
)

func setupProxyRotation(c *colly.Collector) {
    // List of proxy servers
    proxies := []string{
        "http://proxy1.example.com:8080",
        "http://proxy2.example.com:8080",
        "http://proxy3.example.com:8080",
    }

    // Create proxy switcher
    rp, err := proxy.RoundRobinProxySwitcher(proxies...)
    if err != nil {
        log.Fatal(err)
    }

    c.SetProxyFunc(rp)
}

Handling Different Google Search Elements

Extracting Featured Snippets

c.OnHTML("div[data-attrid]", func(e *colly.HTMLElement) {
    // Extract featured snippets
    if strings.Contains(e.AttrOr("data-attrid", ""), "kc:/") {
        snippet := e.DOM.Find("span").Text()
        fmt.Printf("Featured Snippet: %s\n", snippet)
    }
})

Extracting Knowledge Panel Information

c.OnHTML("div[data-async-context*='kp']", func(e *colly.HTMLElement) {
    // Extract knowledge panel data
    title := e.DOM.Find("h2").First().Text()
    description := e.DOM.Find("div[data-attrid='description'] span").Text()

    fmt.Printf("Knowledge Panel - Title: %s\n", title)
    fmt.Printf("Knowledge Panel - Description: %s\n", description)
})

Error Handling and Robustness

func createRobustScraper() *colly.Collector {
    c := colly.NewCollector()

    retries := map[string]int{}
    const maxRetries = 3

    // Colly routes failed requests and non-success status codes
    // (including 429) to OnError, so back-off and retry logic
    // belongs here. Cap the retries to avoid an infinite loop.
    c.OnError(func(r *colly.Response, err error) {
        u := r.Request.URL.String()
        if retries[u] >= maxRetries {
            log.Printf("Giving up on %s after %d retries: %v", u, maxRetries, err)
            return
        }
        retries[u]++

        delay := 5 * time.Second
        if r.StatusCode == 429 {
            log.Println("Rate limited, waiting longer...")
            delay = 30 * time.Second
        }
        time.Sleep(delay)

        if err := r.Request.Retry(); err != nil {
            log.Printf("Retry failed: %v", err)
        }
    })

    return c
}

Complete Working Example with JSON Output

package main

import (
    "encoding/json"
    "fmt"
    "log"
    "os"
    "time"

    "github.com/gocolly/colly/v2"
)

type GoogleSearchResults struct {
    Query     string         `json:"query"`
    Results   []SearchResult `json:"results"`
    Timestamp time.Time      `json:"timestamp"`
}

func main() {
    if len(os.Args) < 2 {
        log.Fatal("Usage: go run main.go 'search query'")
    }

    query := os.Args[1]
    c := setupAdvancedCollector()

    var results []SearchResult
    position := 1

    c.OnHTML("div.g", func(e *colly.HTMLElement) {
        result := extractSearchResult(e, position)
        if result.Title != "" && result.URL != "" {
            results = append(results, result)
            position++
        }
    })

    searchURL := buildGoogleSearchURL(query, 0)
    err := c.Visit(searchURL)
    if err != nil {
        log.Fatal(err)
    }

    // Create final results structure
    searchResults := GoogleSearchResults{
        Query:     query,
        Results:   results,
        Timestamp: time.Now(),
    }

    // Output as JSON
    jsonData, err := json.MarshalIndent(searchResults, "", "  ")
    if err != nil {
        log.Fatal(err)
    }

    fmt.Println(string(jsonData))
}

JavaScript-Rendered Content Handling

Colly fetches raw HTML and does not execute JavaScript. Google's standard result pages are still largely server-rendered, so Colly can handle most organic results, but JavaScript-dependent features (continuous scroll, some rich results) require pairing Colly with a headless browser library such as chromedp or rod.

// Colly itself cannot run JavaScript. For pages that need it, fetch
// the rendered HTML with a headless browser (chromedp, rod) and feed
// that HTML to the same parsing callbacks. This placeholder only
// marks where such a hand-off would be triggered.
func setupWithJavaScriptSupport(c *colly.Collector) {
    c.OnHTML("html", func(e *colly.HTMLElement) {
        // If expected elements are missing here, the page likely
        // requires JavaScript: re-fetch it with a headless browser
        // and re-parse the rendered HTML
    })
}

Important Considerations

Rate Limiting and Respectful Scraping

Always implement proper rate limiting to avoid being blocked:

// Set reasonable delays between requests
c.Limit(&colly.LimitRule{
    DomainGlob:  "*google.*",
    Parallelism: 1,
    Delay:       3 * time.Second,
})

Legal and Ethical Guidelines

  • Review Google's Terms of Service before scraping
  • Implement respectful crawling practices
  • Consider using official APIs when available
  • Respect robots.txt guidelines
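
The robots.txt point can be made concrete: a scraper can fetch https://www.google.com/robots.txt and skip disallowed paths before visiting them. This is a deliberately minimal parser, handling only User-agent: * groups and plain Disallow: prefixes (no wildcards or Allow precedence):

```go
package main

import (
	"bufio"
	"fmt"
	"strings"
)

// disallowedPaths extracts Disallow prefixes from the User-agent: *
// groups of a robots.txt body. Wildcards and Allow rules are ignored.
func disallowedPaths(robotsTxt string) []string {
	var paths []string
	inStarGroup := false
	scanner := bufio.NewScanner(strings.NewReader(robotsTxt))
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		lower := strings.ToLower(line)
		switch {
		case strings.HasPrefix(lower, "user-agent:"):
			agent := strings.TrimSpace(line[len("user-agent:"):])
			inStarGroup = agent == "*"
		case inStarGroup && strings.HasPrefix(lower, "disallow:"):
			if p := strings.TrimSpace(line[len("disallow:"):]); p != "" {
				paths = append(paths, p)
			}
		}
	}
	return paths
}

// allowed reports whether path avoids every Disallow prefix
func allowed(path string, disallowed []string) bool {
	for _, p := range disallowed {
		if strings.HasPrefix(path, p) {
			return false
		}
	}
	return true
}

func main() {
	// Abbreviated sample in the style of Google's robots.txt,
	// which does disallow /search for generic crawlers
	sample := "User-agent: *\nDisallow: /search\nAllow: /search/about\n"
	rules := disallowedPaths(sample)
	fmt.Println(allowed("/search?q=test", rules))
	fmt.Println(allowed("/maps", rules))
}
```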

Alternative Approaches

For complex authentication flows or scenarios requiring sophisticated session management, consider combining Colly with other tools or using dedicated browser automation solutions.

Testing Your Scraper

# Run the scraper
go run main.go "golang web scraping"

# Test with different queries
go run main.go "machine learning python"

# Save results to file
go run main.go "data science" > results.json

Monitoring and Debugging

// Add comprehensive logging
c.OnRequest(func(r *colly.Request) {
    log.Printf("Visiting: %s", r.URL.String())
})

c.OnResponse(func(r *colly.Response) {
    log.Printf("Response status: %d for %s", r.StatusCode, r.Request.URL)
})

c.OnError(func(r *colly.Response, err error) {
    log.Printf("Error: %v on %s", err, r.Request.URL)
})

Conclusion

Scraping Google Search results with Go and Colly provides an efficient and performant solution for gathering search data. The framework's built-in features for handling cookies, user agents, and rate limiting make it well-suited for this task. Remember to implement proper error handling, respect rate limits, and follow ethical scraping practices.

By following the examples and best practices outlined in this guide, you can build robust Google Search scrapers that handle various edge cases while minimizing the risk of being blocked. Always test your implementation thoroughly and monitor for changes in Google's page structure that might require updates to your selectors.

Try WebScraping.AI for Your Web Scraping Needs

Looking for a powerful web scraping solution? WebScraping.AI provides an LLM-powered API that combines Chromium JavaScript rendering with rotating proxies for reliable data extraction.

Key Features:

  • AI-powered extraction: Ask questions about web pages or extract structured data fields
  • JavaScript rendering: Full Chromium browser support for dynamic content
  • Rotating proxies: Datacenter and residential proxies from multiple countries
  • Easy integration: Simple REST API with SDKs for Python, Ruby, PHP, and more
  • Reliable & scalable: Built for developers who need consistent results

Getting Started:

Get page content with AI analysis:

curl "https://api.webscraping.ai/ai/question?url=https://example.com&question=What is the main topic?&api_key=YOUR_API_KEY"

Extract structured data:

curl "https://api.webscraping.ai/ai/fields?url=https://example.com&fields[title]=Page title&fields[price]=Product price&api_key=YOUR_API_KEY"
