Can Colly work with headless browsers for JavaScript rendering?

Colly is a popular web scraping framework for Go, designed for simplicity and efficiency. It is great for scraping static content, but it does not support JavaScript rendering on its own, since it does not run a browser engine under the hood.

When you need to scrape content from a website that relies on JavaScript to render its content, you traditionally need a headless browser such as Headless Chrome or PhantomJS (now discontinued). Headless browsers are full browsers without a user interface, which lets them render web pages the same way a normal browser would, including executing JavaScript.

Since Colly does not support JavaScript rendering out of the box, if you want to use it to scrape JavaScript-heavy websites, you can follow one of these approaches:

1. Use an external headless browser service

You can use a headless browser service like Browserless or Rendertron to render the JavaScript first, and then pass the HTML content to Colly for scraping. This approach involves making a request to the headless browser service, which will return the rendered HTML.
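
For instance, here is a minimal sketch of that flow in Go, assuming a Browserless-style /content endpoint that accepts a JSON body with the target URL and returns the fully rendered HTML (the endpoint URL, token parameter, and payload shape are assumptions; check your provider's documentation):

package main

import (
    "bytes"
    "fmt"
    "io"
    "log"
    "net/http"
)

func main() {
    // Hypothetical Browserless-style rendering endpoint; replace with your
    // provider's URL and authentication token
    endpoint := "https://chrome.browserless.io/content?token=YOUR_API_TOKEN"

    // Ask the service to load the target page, execute its JavaScript,
    // and return the final HTML
    payload := []byte(`{"url": "https://example.com"}`)
    resp, err := http.Post(endpoint, "application/json", bytes.NewReader(payload))
    if err != nil {
        log.Fatal(err)
    }
    defer resp.Body.Close()

    renderedHTML, err := io.ReadAll(resp.Body)
    if err != nil {
        log.Fatal(err)
    }

    // renderedHTML now holds the JavaScript-rendered markup; feed it to
    // Colly (or goquery) exactly as in the local chromedp example below
    fmt.Println(len(renderedHTML), "bytes of rendered HTML received")
}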

2. Integrate with a headless browser locally

You can run a local headless browser to render the page and then pass the resulting HTML to Colly. In Go, the chromedp package is a common way to drive Headless Chrome; in Node.js the equivalent would be Puppeteer.

Here's an example using Headless Chrome (via chromedp) and Colly in Go: chromedp renders the page and captures the resulting HTML, which is then served from a local test server so Colly can scrape it:

package main

import (
    "context"
    "fmt"
    "log"
    "net/http"
    "net/http/httptest"

    "github.com/chromedp/chromedp"
    "github.com/gocolly/colly"
)

func main() {
    // Create a new Colly collector
    c := colly.NewCollector()

    // On every HTML element which has the .title class call the callback
    c.OnHTML(".title", func(e *colly.HTMLElement) {
        fmt.Println("Title found:", e.Text)
    })

    // Start Chrome Headless
    ctx, cancel := chromedp.NewContext(context.Background())
    defer cancel()

    // Run tasks
    var renderedHTML string
    err := chromedp.Run(ctx,
        // Visit the web page
        chromedp.Navigate(`https://example.com`),
        // Wait until the footer element is visible
        chromedp.WaitVisible(`footer`),
        // Retrieve the HTML of the webpage
        chromedp.OuterHTML("html", &renderedHTML),
    )
    if err != nil {
        log.Fatal(err)
    }

    // Colly has no API for parsing a raw HTML string directly, so serve the
    // rendered HTML from a local test server that Colly can visit
    srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        w.Write([]byte(renderedHTML))
    }))
    defer srv.Close()

    // Visit the locally served, already-rendered page with Colly
    if err := c.Visit(srv.URL); err != nil {
        log.Fatal(err)
    }
}

In the example above, chromedp navigates to the page and captures the rendered HTML once the footer element becomes visible. Because Colly cannot parse a raw HTML string directly, the rendered markup is served from a local test server, which Colly then visits and scrapes with its usual OnHTML callbacks.
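
If you don't need Colly's crawling features for this step, a lighter alternative is to parse the rendered HTML directly with goquery, the library Colly itself uses for HTML parsing. A minimal sketch (the sample markup below is only a stand-in for whatever chromedp returns):

package main

import (
    "fmt"
    "log"
    "strings"

    "github.com/PuerkitoBio/goquery"
)

func main() {
    // Stand-in for the HTML captured by chromedp in the example above
    renderedHTML := `<html><body><h1 class="title">Example Domain</h1></body></html>`

    doc, err := goquery.NewDocumentFromReader(strings.NewReader(renderedHTML))
    if err != nil {
        log.Fatal(err)
    }

    // Same .title selector used in the Colly OnHTML callback
    doc.Find(".title").Each(func(_ int, s *goquery.Selection) {
        fmt.Println("Title found:", s.Text())
    })
}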

If you're not working in Go, or if you prefer a more integrated solution, you would typically turn to a scraping library or framework with built-in JavaScript execution, such as Puppeteer for Node.js or Selenium, which has bindings for Python, Java, C#, and other languages.

Here's a simple example using Puppeteer (with cheerio for parsing) in JavaScript for a similar scraping task:

const puppeteer = require('puppeteer');
const cheerio = require('cheerio');

(async () => {
  // Launch the headless browser
  const browser = await puppeteer.launch();
  const page = await browser.newPage();

  // Navigate to the page
  await page.goto('https://example.com');

  // Wait for the footer element to be visible
  await page.waitForSelector('footer');

  // Get the content of the page
  const content = await page.content();

  // Process the content with your scraping logic
  // For example, you can use cheerio (similar to jQuery) to scrape data
  const $ = cheerio.load(content);
  $('.title').each((index, element) => {
    console.log('Title found:', $(element).text());
  });

  // Close the browser
  await browser.close();
})();

In this JavaScript example, Puppeteer handles navigation and rendering while cheerio parses the resulting HTML, all within a single Node.js script, so there is no hand-off between a browser step and a separate HTTP-based scraper as in the Go example with Colly.

Each approach has its use cases, and the choice depends on the specific requirements of your scraping project and your preferred programming language.
