What are the best practices for error handling in Colly?
Error handling is crucial for building robust web scrapers with Colly, the popular Go web scraping framework. Proper error management ensures your scrapers can handle network failures, timeouts, HTTP errors, and parsing issues gracefully. This guide covers comprehensive strategies for implementing effective error handling in your Colly applications.
Understanding Colly's Error Types
Colly encounters various types of errors during web scraping operations:
- Network errors: Connection timeouts, DNS resolution failures, network unreachability
- HTTP errors: 4xx client errors, 5xx server errors, redirect loops
- Parsing errors: Invalid HTML, encoding issues, malformed content
- Application errors: Custom validation failures, data processing errors
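All of these surface through Colly's OnError callback (covered in the next section), and Go's standard error wrapping lets you classify them by type rather than by message. Below is a minimal sketch using errors.As and net.Error to detect timeouts; it needs the errors, net, and log packages, and the classification logic is an illustrative assumption rather than part of Colly itself.
c.OnError(func(r *colly.Response, err error) {
    var netErr net.Error
    switch {
    case errors.As(err, &netErr) && netErr.Timeout():
        // Transient network timeout - usually worth retrying
        log.Printf("network timeout: %v", err)
    case r.StatusCode >= 400:
        // HTTP-level failure reported by the server
        log.Printf("HTTP %d from %s", r.StatusCode, r.Request.URL)
    default:
        // Parsing or application-level failure
        log.Printf("other error: %v", err)
    }
})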
1. Using OnError Callbacks
The most fundamental error handling mechanism in Colly is the OnError callback, which catches all errors that occur during request processing.
Basic OnError Implementation
package main

import (
    "log"

    "github.com/gocolly/colly/v2"
)

func main() {
    c := colly.NewCollector()

    // Handle all errors
    c.OnError(func(r *colly.Response, err error) {
        log.Printf("Error occurred: %v", err)
        log.Printf("Request URL: %s", r.Request.URL)
        log.Printf("Status Code: %d", r.StatusCode)

        // Log response body for debugging (limit size)
        if len(r.Body) < 1000 {
            log.Printf("Response body: %s", string(r.Body))
        }
    })

    if err := c.Visit("https://example.com"); err != nil {
        log.Printf("Visit failed: %v", err)
    }
}
Advanced Error Classification
c.OnError(func(r *colly.Response, err error) {
    switch {
    case r.StatusCode >= 500:
        log.Printf("Server error (%d): %v", r.StatusCode, err)
        // Implement retry logic for server errors
    case r.StatusCode >= 400:
        log.Printf("Client error (%d): %v", r.StatusCode, err)
        // Log and skip for client errors
    case r.StatusCode == 0:
        log.Printf("Network error: %v", err)
        // Handle network connectivity issues
    default:
        log.Printf("Unknown error: %v", err)
    }
})
2. Implementing Retry Logic
Robust error handling often requires retry mechanisms for transient failures.
Simple Retry Implementation
package main

import (
    "log"
    "time"

    "github.com/gocolly/colly/v2"
)

func createCollectorWithRetry(maxRetries int) *colly.Collector {
    c := colly.NewCollector()
    retryCount := make(map[string]int)

    c.OnError(func(r *colly.Response, err error) {
        url := r.Request.URL.String()
        retryCount[url]++

        if retryCount[url] <= maxRetries {
            log.Printf("Retrying %s (attempt %d/%d): %v",
                url, retryCount[url], maxRetries, err)

            // Backoff: wait one extra second per failed attempt
            time.Sleep(time.Duration(retryCount[url]) * time.Second)

            // Retry the request
            r.Request.Retry()
        } else {
            log.Printf("Max retries exceeded for %s: %v", url, err)
            delete(retryCount, url) // Clean up
        }
    })

    return c
}
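A quick usage sketch, with the retry budget and target URL as placeholder values:
c := createCollectorWithRetry(3) // allow up to 3 retries per URL
if err := c.Visit("https://example.com"); err != nil {
    log.Printf("initial visit failed: %v", err)
}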
Conditional Retry Logic
// retryCount and maxRetries are tracked as in the previous example;
// the strings package must be imported for the checks below.
c.OnError(func(r *colly.Response, err error) {
    url := r.Request.URL.String()

    // Only retry on specific conditions
    shouldRetry := func(statusCode int, err error) bool {
        // Retry on server errors and network failures (status code 0)
        if statusCode >= 500 || statusCode == 0 {
            return true
        }
        // Retry on specific error messages
        if err != nil {
            errStr := err.Error()
            return strings.Contains(errStr, "timeout") ||
                strings.Contains(errStr, "connection reset")
        }
        return false
    }

    if shouldRetry(r.StatusCode, err) && retryCount[url] < maxRetries {
        // Count the attempt before retrying so the callback sees the
        // updated value if the retried request fails again
        retryCount[url]++
        time.Sleep(time.Duration(retryCount[url]) * 2 * time.Second)
        r.Request.Retry()
    }
})
3. Timeout Management
Proper timeout configuration prevents scrapers from hanging indefinitely.
Setting Multiple Timeout Types
import (
    "log"
    "net/http"
    "strings"
    "time"

    "github.com/gocolly/colly/v2"
)

func createCollectorWithTimeouts() *colly.Collector {
    c := colly.NewCollector()

    // Configure HTTP client with timeouts
    c.WithTransport(&http.Transport{
        TLSHandshakeTimeout:   10 * time.Second,
        ResponseHeaderTimeout: 10 * time.Second,
        IdleConnTimeout:       30 * time.Second,
    })

    // Set overall request timeout
    c.SetRequestTimeout(30 * time.Second)

    // Handle timeout errors specifically
    c.OnError(func(r *colly.Response, err error) {
        if err != nil && strings.Contains(err.Error(), "timeout") {
            log.Printf("Timeout error for %s: %v", r.Request.URL, err)
            // Handle timeout-specific logic
        }
    })

    return c
}
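The transport above bounds the TLS handshake and response headers, and SetRequestTimeout caps the request as a whole, but nothing limits the TCP connect step itself. If you also want a dedicated connect timeout, a net.Dialer can be plugged into the transport. A minimal sketch, assuming the net package is imported and using an arbitrary 5-second value:
c.WithTransport(&http.Transport{
    // Bound how long establishing the TCP connection may take
    DialContext: (&net.Dialer{
        Timeout: 5 * time.Second, // example value; tune per target
    }).DialContext,
    TLSHandshakeTimeout:   10 * time.Second,
    ResponseHeaderTimeout: 10 * time.Second,
    IdleConnTimeout:       30 * time.Second,
})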
Context-Based Timeout Control
func scrapeWithContext(urls []string, timeout time.Duration) {
    c := colly.NewCollector()

    ctx, cancel := context.WithTimeout(context.Background(), timeout)
    defer cancel()

    c.OnError(func(r *colly.Response, err error) {
        select {
        case <-ctx.Done():
            log.Printf("Scraping cancelled due to context timeout")
        default:
            log.Printf("Error: %v", err)
        }
    })

    // The context is only checked between visits: it stops new requests
    // from being issued but does not cancel a request already in flight.
    for _, url := range urls {
        select {
        case <-ctx.Done():
            log.Printf("Context cancelled, stopping scraper")
            return
        default:
            c.Visit(url)
        }
    }
}
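A quick usage sketch; the URLs are placeholders and the two-minute budget is an arbitrary example:
urls := []string{
    "https://example.com/page-1",
    "https://example.com/page-2",
}
scrapeWithContext(urls, 2*time.Minute)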
4. Graceful Degradation Strategies
Implement fallback mechanisms when primary scraping methods fail.
Content Fallback Implementation
c.OnHTML("article", func(e *colly.HTMLElement) {
title := e.ChildText("h1")
if title == "" {
// Fallback selectors
title = e.ChildText(".title, .headline, h2")
}
content := e.ChildText(".content")
if content == "" {
// Try alternative content selectors
content = e.ChildText(".article-body, .post-content, p")
}
if title == "" && content == "" {
log.Printf("Warning: No content found for %s", e.Request.URL)
return
}
// Process extracted data
fmt.Printf("Title: %s\nContent: %s\n", title, content)
})
API Fallback Strategy
func scrapeWithFallback(url string) error {
    // Primary scraping attempt
    c := colly.NewCollector()
    var scraped bool
    var lastError error

    c.OnError(func(r *colly.Response, err error) {
        lastError = err
        log.Printf("Primary scraping failed: %v", err)
    })

    c.OnHTML("body", func(e *colly.HTMLElement) {
        scraped = true
        // Process content
    })

    c.Visit(url)

    // Fallback to API-based scraping service
    if !scraped && lastError != nil {
        log.Printf("Falling back to API service for %s", url)
        return scrapeWithAPI(url)
    }

    return lastError
}

func scrapeWithAPI(url string) error {
    // Implement API-based scraping as fallback
    // This could use a service like WebScraping.AI
    log.Printf("Using API fallback for %s", url)
    return nil
}
5. Logging and Monitoring
Comprehensive logging helps identify patterns and debug issues effectively.
Structured Logging Implementation
import (
    "encoding/json"
    "log"
    "os"
    "time"

    "github.com/gocolly/colly/v2"
)

type ErrorLog struct {
    Timestamp  time.Time `json:"timestamp"`
    URL        string    `json:"url"`
    StatusCode int       `json:"status_code"`
    Error      string    `json:"error"`
    UserAgent  string    `json:"user_agent"`
    Retries    int       `json:"retries"`
}

func setupStructuredLogging(c *colly.Collector) {
    logFile, err := os.OpenFile("scraper_errors.log",
        os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666)
    if err != nil {
        log.Fatalln("Failed to open log file:", err)
    }
    logger := log.New(logFile, "", 0)

    c.OnError(func(r *colly.Response, err error) {
        errorLog := ErrorLog{
            Timestamp:  time.Now(),
            URL:        r.Request.URL.String(),
            StatusCode: r.StatusCode,
            Error:      err.Error(),
            UserAgent:  r.Request.Headers.Get("User-Agent"),
            Retries:    getRetryCount(r.Request.URL.String()),
        }

        logJSON, _ := json.Marshal(errorLog)
        logger.Println(string(logJSON))
    })
}
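getRetryCount is not a Colly function; it is assumed to read from whatever retry bookkeeping the scraper maintains. A minimal sketch backed by a package-level map (a hypothetical helper, not safe for concurrent use without a mutex):
// Hypothetical helper: retryCounts would be updated wherever retries
// are performed, e.g. in the OnError retry logic shown earlier.
var retryCounts = make(map[string]int)

func getRetryCount(url string) int {
    return retryCounts[url]
}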
6. Rate Limiting and Circuit Breaker
Implement protective measures so your scraper does not overwhelm target servers: rate limiting throttles how fast requests are sent (see the sketch just below), while a circuit breaker stops hammering a host that keeps failing.
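Rate limiting is built into Colly via LimitRule, so it does not need a hand-rolled solution. A minimal sketch applied to an existing collector c; the domain glob, delay, and parallelism values are arbitrary examples to tune per target:
// Throttle requests to matching domains: limited parallelism plus
// a fixed delay and random jitter between requests.
err := c.Limit(&colly.LimitRule{
    DomainGlob:  "*",             // apply the rule to every domain
    Parallelism: 2,               // example value
    Delay:       1 * time.Second, // fixed delay between requests
    RandomDelay: 1 * time.Second, // extra random jitter, up to 1s
})
if err != nil {
    log.Fatal(err)
}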
Circuit Breaker Pattern
type CircuitBreaker struct {
    maxFailures int
    failures    int
    lastFailure time.Time
    timeout     time.Duration
}

func (cb *CircuitBreaker) CanProceed() bool {
    if cb.failures >= cb.maxFailures {
        if time.Since(cb.lastFailure) > cb.timeout {
            cb.failures = 0 // Reset circuit breaker
            return true
        }
        return false
    }
    return true
}

func (cb *CircuitBreaker) RecordFailure() {
    cb.failures++
    cb.lastFailure = time.Now()
}

func createProtectedCollector() *colly.Collector {
    c := colly.NewCollector()

    // Note: guard the counters with a sync.Mutex if the collector runs
    // in async mode, since callbacks may then fire concurrently.
    cb := &CircuitBreaker{
        maxFailures: 5,
        timeout:     5 * time.Minute,
    }

    c.OnRequest(func(r *colly.Request) {
        if !cb.CanProceed() {
            r.Abort()
            log.Printf("Circuit breaker open, aborting request to %s", r.URL)
        }
    })

    c.OnError(func(r *colly.Response, err error) {
        cb.RecordFailure()
        log.Printf("Circuit breaker recorded failure: %v", err)
    })

    return c
}
7. Resource Management
Proper cleanup and resource management prevent memory leaks and connection exhaustion.
Connection Pool Management
func createOptimizedCollector() *colly.Collector {
    c := colly.NewCollector()

    // Configure connection pooling
    transport := &http.Transport{
        MaxIdleConns:        100,
        MaxIdleConnsPerHost: 10,
        IdleConnTimeout:     90 * time.Second,
    }
    c.WithTransport(transport)

    // Cleanup on errors
    c.OnError(func(r *colly.Response, err error) {
        if r.StatusCode == 0 {
            // Network error - might need to reset connections
            transport.CloseIdleConnections()
        }
    })

    return c
}
Conclusion
Effective error handling in Colly requires a multi-layered approach combining proper callback usage, retry logic, timeout management, and monitoring. By implementing these best practices, you can build resilient web scrapers that handle failures gracefully and provide reliable data extraction capabilities.
Remember to always respect target websites' resources and implement appropriate delays and rate limiting. For complex scenarios requiring JavaScript execution or advanced error handling, consider using browser automation tools or specialized scraping services that can provide additional reliability and features.