What are the best practices for error handling in Colly?
Error handling is crucial for building robust web scrapers with Colly, the popular Go web scraping framework. Proper error management ensures your scrapers can handle network failures, timeouts, HTTP errors, and parsing issues gracefully. This guide covers comprehensive strategies for implementing effective error handling in your Colly applications.
Understanding Colly's Error Types
Colly encounters various types of errors during web scraping operations:
- Network errors: Connection timeouts, DNS resolution failures, network unreachability
- HTTP errors: 4xx client errors, 5xx server errors, redirect loops
- Parsing errors: Invalid HTML, encoding issues, malformed content
- Application errors: Custom validation failures, data processing errors
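All of these surface through Colly's OnError callback (covered in the next section), and Go's standard error wrapping lets you classify them by type rather than by message. Below is a minimal sketch using errors.As and net.Error to detect timeouts; it needs the errors, net, and log packages, and the classification logic is an illustrative assumption rather than part of Colly itself.
c.OnError(func(r *colly.Response, err error) {
    var netErr net.Error
    switch {
    case errors.As(err, &netErr) && netErr.Timeout():
        // Transient network timeout - usually worth retrying
        log.Printf("network timeout: %v", err)
    case r.StatusCode >= 400:
        // HTTP-level failure reported by the server
        log.Printf("HTTP %d from %s", r.StatusCode, r.Request.URL)
    default:
        // Parsing or application-level failure
        log.Printf("other error: %v", err)
    }
})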
1. Using OnError Callbacks
The most fundamental error handling mechanism in Colly is the OnError callback, which catches all errors that occur during request processing.
Basic OnError Implementation
package main

import (
    "log"

    "github.com/gocolly/colly/v2"
)

func main() {
    c := colly.NewCollector()

    // Handle all errors
    c.OnError(func(r *colly.Response, err error) {
        log.Printf("Error occurred: %v", err)
        log.Printf("Request URL: %s", r.Request.URL)
        log.Printf("Status Code: %d", r.StatusCode)

        // Log response body for debugging (limit size)
        if len(r.Body) < 1000 {
            log.Printf("Response body: %s", string(r.Body))
        }
    })

    if err := c.Visit("https://example.com"); err != nil {
        log.Printf("Visit failed: %v", err)
    }
}
Advanced Error Classification
c.OnError(func(r *colly.Response, err error) {
    switch {
    case r.StatusCode >= 500:
        log.Printf("Server error (%d): %v", r.StatusCode, err)
        // Implement retry logic for server errors
    case r.StatusCode >= 400:
        log.Printf("Client error (%d): %v", r.StatusCode, err)
        // Log and skip for client errors
    case r.StatusCode == 0:
        log.Printf("Network error: %v", err)
        // Handle network connectivity issues
    default:
        log.Printf("Unknown error: %v", err)
    }
})
2. Implementing Retry Logic
Robust error handling often requires retry mechanisms for transient failures.
Simple Retry Implementation
package main

import (
    "log"
    "time"

    "github.com/gocolly/colly/v2"
)

func createCollectorWithRetry(maxRetries int) *colly.Collector {
    c := colly.NewCollector()
    retryCount := make(map[string]int)

    c.OnError(func(r *colly.Response, err error) {
        url := r.Request.URL.String()
        retryCount[url]++

        if retryCount[url] <= maxRetries {
            log.Printf("Retrying %s (attempt %d/%d): %v",
                url, retryCount[url], maxRetries, err)

            // Backoff: wait one extra second per failed attempt
            time.Sleep(time.Duration(retryCount[url]) * time.Second)

            // Retry the request
            r.Request.Retry()
        } else {
            log.Printf("Max retries exceeded for %s: %v", url, err)
            delete(retryCount, url) // Clean up
        }
    })

    return c
}
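A quick usage sketch, with the retry budget and target URL as placeholder values:
c := createCollectorWithRetry(3) // allow up to 3 retries per URL
if err := c.Visit("https://example.com"); err != nil {
    log.Printf("initial visit failed: %v", err)
}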
Conditional Retry Logic
// retryCount and maxRetries are tracked as in the previous example;
// the strings package must be imported for the checks below.
c.OnError(func(r *colly.Response, err error) {
    url := r.Request.URL.String()

    // Only retry on specific conditions
    shouldRetry := func(statusCode int, err error) bool {
        // Retry on server errors and network failures (status code 0)
        if statusCode >= 500 || statusCode == 0 {
            return true
        }
        // Retry on specific error messages
        if err != nil {
            errStr := err.Error()
            return strings.Contains(errStr, "timeout") ||
                strings.Contains(errStr, "connection reset")
        }
        return false
    }

    if shouldRetry(r.StatusCode, err) && retryCount[url] < maxRetries {
        // Count the attempt before retrying so the callback sees the
        // updated value if the retried request fails again
        retryCount[url]++
        time.Sleep(time.Duration(retryCount[url]) * 2 * time.Second)
        r.Request.Retry()
    }
})
3. Timeout Management
Proper timeout configuration prevents scrapers from hanging indefinitely.
Setting Multiple Timeout Types
import (
    "log"
    "net/http"
    "strings"
    "time"

    "github.com/gocolly/colly/v2"
)

func createCollectorWithTimeouts() *colly.Collector {
    c := colly.NewCollector()

    // Configure HTTP client with timeouts
    c.WithTransport(&http.Transport{
        TLSHandshakeTimeout:   10 * time.Second,
        ResponseHeaderTimeout: 10 * time.Second,
        IdleConnTimeout:       30 * time.Second,
    })

    // Set overall request timeout
    c.SetRequestTimeout(30 * time.Second)

    // Handle timeout errors specifically
    c.OnError(func(r *colly.Response, err error) {
        if err != nil && strings.Contains(err.Error(), "timeout") {
            log.Printf("Timeout error for %s: %v", r.Request.URL, err)
            // Handle timeout-specific logic
        }
    })

    return c
}
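The transport above bounds the TLS handshake and response headers, and SetRequestTimeout caps the request as a whole, but nothing limits the TCP connect step itself. If you also want a dedicated connect timeout, a net.Dialer can be plugged into the transport. A minimal sketch, assuming the net package is imported and using an arbitrary 5-second value:
c.WithTransport(&http.Transport{
    // Bound how long establishing the TCP connection may take
    DialContext: (&net.Dialer{
        Timeout: 5 * time.Second, // example value; tune per target
    }).DialContext,
    TLSHandshakeTimeout:   10 * time.Second,
    ResponseHeaderTimeout: 10 * time.Second,
    IdleConnTimeout:       30 * time.Second,
})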
Context-Based Timeout Control
func scrapeWithContext(urls []string, timeout time.Duration) {
    c := colly.NewCollector()

    ctx, cancel := context.WithTimeout(context.Background(), timeout)
    defer cancel()

    c.OnError(func(r *colly.Response, err error) {
        select {
        case <-ctx.Done():
            log.Printf("Scraping cancelled due to context timeout")
        default:
            log.Printf("Error: %v", err)
        }
    })

    // The context is only checked between visits: it stops new requests
    // from being issued but does not cancel a request already in flight.
    for _, url := range urls {
        select {
        case <-ctx.Done():
            log.Printf("Context cancelled, stopping scraper")
            return
        default:
            c.Visit(url)
        }
    }
}
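A quick usage sketch; the URLs are placeholders and the two-minute budget is an arbitrary example:
urls := []string{
    "https://example.com/page-1",
    "https://example.com/page-2",
}
scrapeWithContext(urls, 2*time.Minute)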
4. Graceful Degradation Strategies
Implement fallback mechanisms when primary scraping methods fail.
Content Fallback Implementation
c.OnHTML("article", func(e *colly.HTMLElement) {
title := e.ChildText("h1")
if title == "" {
// Fallback selectors
title = e.ChildText(".title, .headline, h2")
}
content := e.ChildText(".content")
if content == "" {
// Try alternative content selectors
content = e.ChildText(".article-body, .post-content, p")
}
if title == "" && content == "" {
log.Printf("Warning: No content found for %s", e.Request.URL)
return
}
// Process extracted data
fmt.Printf("Title: %s\nContent: %s\n", title, content)
})
API Fallback Strategy
func scrapeWithFallback(url string) error {
    // Primary scraping attempt
    c := colly.NewCollector()
    var scraped bool
    var lastError error

    c.OnError(func(r *colly.Response, err error) {
        lastError = err
        log.Printf("Primary scraping failed: %v", err)
    })

    c.OnHTML("body", func(e *colly.HTMLElement) {
        scraped = true
        // Process content
    })

    c.Visit(url)

    // Fallback to API-based scraping service
    if !scraped && lastError != nil {
        log.Printf("Falling back to API service for %s", url)
        return scrapeWithAPI(url)
    }

    return lastError
}

func scrapeWithAPI(url string) error {
    // Implement API-based scraping as fallback
    // This could use a service like WebScraping.AI
    log.Printf("Using API fallback for %s", url)
    return nil
}
5. Logging and Monitoring
Comprehensive logging helps identify patterns and debug issues effectively.
Structured Logging Implementation
import (
    "encoding/json"
    "log"
    "os"
    "time"

    "github.com/gocolly/colly/v2"
)

type ErrorLog struct {
    Timestamp  time.Time `json:"timestamp"`
    URL        string    `json:"url"`
    StatusCode int       `json:"status_code"`
    Error      string    `json:"error"`
    UserAgent  string    `json:"user_agent"`
    Retries    int       `json:"retries"`
}

func setupStructuredLogging(c *colly.Collector) {
    logFile, err := os.OpenFile("scraper_errors.log",
        os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666)
    if err != nil {
        log.Fatalln("Failed to open log file:", err)
    }
    logger := log.New(logFile, "", 0)

    c.OnError(func(r *colly.Response, err error) {
        errorLog := ErrorLog{
            Timestamp:  time.Now(),
            URL:        r.Request.URL.String(),
            StatusCode: r.StatusCode,
            Error:      err.Error(),
            UserAgent:  r.Request.Headers.Get("User-Agent"),
            Retries:    getRetryCount(r.Request.URL.String()),
        }

        logJSON, _ := json.Marshal(errorLog)
        logger.Println(string(logJSON))
    })
}
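getRetryCount is not a Colly function; it is assumed to read from whatever retry bookkeeping the scraper maintains. A minimal sketch backed by a package-level map (a hypothetical helper, not safe for concurrent use without a mutex):
// Hypothetical helper: retryCounts would be updated wherever retries
// are performed, e.g. in the OnError retry logic shown earlier.
var retryCounts = make(map[string]int)

func getRetryCount(url string) int {
    return retryCounts[url]
}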
6. Rate Limiting and Circuit Breaker
Implement protective measures so your scraper does not overwhelm target servers: rate limiting throttles how fast requests are sent (see the sketch just below), while a circuit breaker stops hammering a host that keeps failing.
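Rate limiting is built into Colly via LimitRule, so it does not need a hand-rolled solution. A minimal sketch applied to an existing collector c; the domain glob, delay, and parallelism values are arbitrary examples to tune per target:
// Throttle requests to matching domains: limited parallelism plus
// a fixed delay and random jitter between requests.
err := c.Limit(&colly.LimitRule{
    DomainGlob:  "*",             // apply the rule to every domain
    Parallelism: 2,               // example value
    Delay:       1 * time.Second, // fixed delay between requests
    RandomDelay: 1 * time.Second, // extra random jitter, up to 1s
})
if err != nil {
    log.Fatal(err)
}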
Circuit Breaker Pattern
type CircuitBreaker struct {
    maxFailures int
    failures    int
    lastFailure time.Time
    timeout     time.Duration
}

func (cb *CircuitBreaker) CanProceed() bool {
    if cb.failures >= cb.maxFailures {
        if time.Since(cb.lastFailure) > cb.timeout {
            cb.failures = 0 // Reset circuit breaker
            return true
        }
        return false
    }
    return true
}

func (cb *CircuitBreaker) RecordFailure() {
    cb.failures++
    cb.lastFailure = time.Now()
}

func createProtectedCollector() *colly.Collector {
    c := colly.NewCollector()

    // Note: guard the counters with a sync.Mutex if the collector runs
    // in async mode, since callbacks may then fire concurrently.
    cb := &CircuitBreaker{
        maxFailures: 5,
        timeout:     5 * time.Minute,
    }

    c.OnRequest(func(r *colly.Request) {
        if !cb.CanProceed() {
            r.Abort()
            log.Printf("Circuit breaker open, aborting request to %s", r.URL)
        }
    })

    c.OnError(func(r *colly.Response, err error) {
        cb.RecordFailure()
        log.Printf("Circuit breaker recorded failure: %v", err)
    })

    return c
}
7. Resource Management
Proper cleanup and resource management prevent memory leaks and connection exhaustion.
Connection Pool Management
func createOptimizedCollector() *colly.Collector {
    c := colly.NewCollector()

    // Configure connection pooling
    transport := &http.Transport{
        MaxIdleConns:        100,
        MaxIdleConnsPerHost: 10,
        IdleConnTimeout:     90 * time.Second,
    }
    c.WithTransport(transport)

    // Cleanup on errors
    c.OnError(func(r *colly.Response, err error) {
        if r.StatusCode == 0 {
            // Network error - might need to reset connections
            transport.CloseIdleConnections()
        }
    })

    return c
}
Conclusion
Effective error handling in Colly requires a multi-layered approach combining proper callback usage, retry logic, timeout management, and monitoring. By implementing these best practices, you can build resilient web scrapers that handle failures gracefully and provide reliable data extraction capabilities.
Remember to always respect target websites' resources and implement appropriate delays and rate limiting. For complex scenarios requiring JavaScript execution or advanced error handling, consider using browser automation tools or specialized scraping services that can provide additional reliability and features.