What are the best practices for error handling in Go scraping?
Error handling is crucial for building robust web scraping applications in Go. Done well, it lets your scrapers recover gracefully from network failures, server errors, parsing issues, and the other unexpected conditions that routinely occur during scraping.
1. Use Structured Error Types
Create custom error types that provide meaningful context about what went wrong and where:
package scraper
import (
"fmt"
"net/http"
)
// ScrapingError represents different types of scraping errors
type ScrapingError struct {
Type ErrorType
URL string
Message string
Cause error
}
type ErrorType int
const (
ErrorTypeNetwork ErrorType = iota
ErrorTypeHTTP
ErrorTypeParsing
ErrorTypeTimeout
ErrorTypeRateLimit
)
func (e *ScrapingError) Error() string {
return fmt.Sprintf("scraping error [%s]: %s (URL: %s)",
e.Type.String(), e.Message, e.URL)
}
func (e *ScrapingError) Unwrap() error {
return e.Cause
}
func (et ErrorType) String() string {
switch et {
case ErrorTypeNetwork:
return "NETWORK"
case ErrorTypeHTTP:
return "HTTP"
case ErrorTypeParsing:
return "PARSING"
case ErrorTypeTimeout:
return "TIMEOUT"
case ErrorTypeRateLimit:
return "RATE_LIMIT"
default:
return "UNKNOWN"
}
}
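The snippets throughout this article also reference a Scraper type with an HTTP client and a logger, plus a ScrapedData result type, none of which are shown. Here is a minimal sketch of the assumed shapes; the field names and the ScrapedData layout are illustrative assumptions, not a prescribed design:
package scraper

import (
    "log"
    "net/http"
    "os"
    "time"
)

// Scraper bundles the dependencies the later snippets rely on (s.client, s.logger).
type Scraper struct {
    client *http.Client
    logger *log.Logger
}

// ScrapedData is a placeholder for whatever your parser extracts from a page.
type ScrapedData struct {
    URL   string
    Title string
    Body  string
}

// NewScraper applies conservative defaults: a per-request timeout and a prefixed logger.
func NewScraper() *Scraper {
    return &Scraper{
        client: &http.Client{Timeout: 30 * time.Second},
        logger: log.New(os.Stderr, "scraper: ", log.LstdFlags),
    }
}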
2. Implement Retry Logic with Exponential Backoff
Network requests can fail temporarily, so implementing intelligent retry mechanisms is essential:
package scraper
import (
"context"
"errors"
"fmt"
"math"
"net/http"
"time"
)
type RetryConfig struct {
MaxRetries int
BaseDelay time.Duration
MaxDelay time.Duration
BackoffFactor float64
RetryableErrors []ErrorType
}
func DefaultRetryConfig() RetryConfig {
return RetryConfig{
MaxRetries: 3,
BaseDelay: time.Second,
MaxDelay: 30 * time.Second,
BackoffFactor: 2.0,
RetryableErrors: []ErrorType{
ErrorTypeNetwork,
ErrorTypeTimeout,
ErrorTypeRateLimit,
},
}
}
func (s *Scraper) FetchWithRetry(ctx context.Context, url string, config RetryConfig) (*http.Response, error) {
var lastErr error
for attempt := 0; attempt <= config.MaxRetries; attempt++ {
if attempt > 0 {
delay := s.calculateBackoffDelay(attempt, config)
select {
case <-ctx.Done():
return nil, ctx.Err()
case <-time.After(delay):
}
}
resp, err := s.makeRequest(ctx, url)
if err == nil {
return resp, nil
}
lastErr = err
// Check if error is retryable
if !s.isRetryableError(err, config.RetryableErrors) {
break
}
s.logger.Printf("Attempt %d failed for %s: %v", attempt+1, url, err)
}
return nil, fmt.Errorf("all retry attempts failed: %w", lastErr)
}
func (s *Scraper) calculateBackoffDelay(attempt int, config RetryConfig) time.Duration {
delay := float64(config.BaseDelay) * math.Pow(config.BackoffFactor, float64(attempt-1))
if delay > float64(config.MaxDelay) {
delay = float64(config.MaxDelay)
}
return time.Duration(delay)
}
func (s *Scraper) isRetryableError(err error, retryableTypes []ErrorType) bool {
var scrapingErr *ScrapingError
if !errors.As(err, &scrapingErr) {
return false
}
for _, errType := range retryableTypes {
if scrapingErr.Type == errType {
return true
}
}
return false
}
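A typical call site, assuming a Scraper wired up as sketched in section 1 (the function name and URL handling here are illustrative):
func (s *Scraper) scrapeListing(url string) error {
    // Bound the total time spent across all retry attempts, not just a single request.
    ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
    defer cancel()

    resp, err := s.FetchWithRetry(ctx, url, DefaultRetryConfig())
    if err != nil {
        return fmt.Errorf("fetch failed after retries: %w", err)
    }
    defer resp.Body.Close()

    // ... read and parse resp.Body ...
    return nil
}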
3. Handle HTTP Status Codes Appropriately
Different HTTP status codes require different handling strategies:
func (s *Scraper) handleHTTPResponse(resp *http.Response, url string) error {
switch {
case resp.StatusCode >= 200 && resp.StatusCode < 300:
return nil // Success
case resp.StatusCode == 429: // Too Many Requests
retryAfter := resp.Header.Get("Retry-After")
return &ScrapingError{
Type: ErrorTypeRateLimit,
URL: url,
Message: fmt.Sprintf("rate limited, retry after: %s", retryAfter),
}
case resp.StatusCode >= 400 && resp.StatusCode < 500:
// Client errors - usually not retryable
return &ScrapingError{
Type: ErrorTypeHTTP,
URL: url,
Message: fmt.Sprintf("client error: %d %s", resp.StatusCode, resp.Status),
}
case resp.StatusCode >= 500:
// Server errors - potentially retryable
return &ScrapingError{
Type: ErrorTypeHTTP,
URL: url,
Message: fmt.Sprintf("server error: %d %s", resp.StatusCode, resp.Status),
}
default:
return &ScrapingError{
Type: ErrorTypeHTTP,
URL: url,
Message: fmt.Sprintf("unexpected status: %d %s", resp.StatusCode, resp.Status),
}
}
}
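FetchWithRetry above delegates to a makeRequest helper that is not shown in this article. A minimal sketch, assuming it classifies transport failures with handleNetworkError (defined in section 8) and turns non-2xx responses into typed errors with handleHTTPResponse:
func (s *Scraper) makeRequest(ctx context.Context, url string) (*http.Response, error) {
    req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
    if err != nil {
        return nil, &ScrapingError{Type: ErrorTypeNetwork, URL: url, Message: "failed to create request", Cause: err}
    }

    resp, err := s.client.Do(req)
    if err != nil {
        return nil, s.handleNetworkError(err, url)
    }

    // Convert non-success status codes into *ScrapingError so the retry logic can classify them.
    if err := s.handleHTTPResponse(resp, url); err != nil {
        resp.Body.Close()
        return nil, err
    }
    return resp, nil
}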
4. Implement Circuit Breaker Pattern
Prevent cascading failures by implementing a circuit breaker that stops making requests to consistently failing endpoints:
package scraper
import (
"fmt"
"sync"
"time"
)
type CircuitState int
const (
StateClosed CircuitState = iota
StateOpen
StateHalfOpen
)
type CircuitBreaker struct {
mu sync.Mutex
state CircuitState
failures int
lastFailureTime time.Time
maxFailures int
timeout time.Duration
}
func NewCircuitBreaker(maxFailures int, timeout time.Duration) *CircuitBreaker {
return &CircuitBreaker{
maxFailures: maxFailures,
timeout: timeout,
state: StateClosed,
}
}
func (cb *CircuitBreaker) Call(fn func() error) error {
    cb.mu.Lock()
    if cb.state == StateOpen {
        if time.Since(cb.lastFailureTime) > cb.timeout {
            cb.state = StateHalfOpen
        } else {
            cb.mu.Unlock()
            return fmt.Errorf("circuit breaker is open")
        }
    }
    cb.mu.Unlock()

    // Run the call without holding the lock so concurrent requests are not serialized.
    err := fn()

    cb.mu.Lock()
    defer cb.mu.Unlock()
    if err != nil {
        cb.onFailure()
        return err
    }
    cb.onSuccess()
    return nil
}
func (cb *CircuitBreaker) onFailure() {
cb.failures++
cb.lastFailureTime = time.Now()
if cb.failures >= cb.maxFailures {
cb.state = StateOpen
}
}
func (cb *CircuitBreaker) onSuccess() {
cb.failures = 0
cb.state = StateClosed
}
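A sketch of how the breaker can wrap the retrying fetch from section 2; capturing the response in a closure variable is one straightforward way to return it through Call:
func (s *Scraper) fetchWithBreaker(ctx context.Context, cb *CircuitBreaker, url string) (*http.Response, error) {
    var resp *http.Response
    err := cb.Call(func() error {
        var callErr error
        resp, callErr = s.FetchWithRetry(ctx, url, DefaultRetryConfig())
        return callErr
    })
    return resp, err
}
In practice you might keep one breaker per host, so a single failing site does not open the circuit for every target you scrape.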
5. Use Context for Timeout and Cancellation
Always use context to handle timeouts and provide cancellation capabilities:
func (s *Scraper) ScrapeWithTimeout(url string, timeout time.Duration) (*ScrapedData, error) {
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
return s.ScrapeWithContext(ctx, url)
}
func (s *Scraper) ScrapeWithContext(ctx context.Context, url string) (*ScrapedData, error) {
// Create request with context
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
if err != nil {
return nil, &ScrapingError{
Type: ErrorTypeNetwork,
URL: url,
Message: "failed to create request",
Cause: err,
}
}
// Make request
resp, err := s.client.Do(req)
if err != nil {
// Check if context was cancelled or timed out
if ctx.Err() != nil {
return nil, &ScrapingError{
Type: ErrorTypeTimeout,
URL: url,
Message: "request cancelled or timed out",
Cause: ctx.Err(),
}
}
return nil, &ScrapingError{
Type: ErrorTypeNetwork,
URL: url,
Message: "request failed",
Cause: err,
}
}
defer resp.Body.Close()
// Handle response...
return s.parseResponse(resp, url)
}
6. Implement Graceful Error Recovery
Build mechanisms to recover from errors and continue processing:
type ScrapingJob struct {
URLs []string
Results chan ScrapedData
Errors chan error
maxWorkers int
}
func (job *ScrapingJob) Run(ctx context.Context) error {
    semaphore := make(chan struct{}, job.maxWorkers)
    var wg sync.WaitGroup
    // Close the output channels once every started worker has finished, even when the
    // context is cancelled mid-loop, so consumers ranging over them always terminate.
    defer func() {
        go func() {
            wg.Wait()
            close(job.Results)
            close(job.Errors)
        }()
    }()
    for _, url := range job.URLs {
        select {
        case <-ctx.Done():
            return ctx.Err()
        case semaphore <- struct{}{}:
        }
        wg.Add(1)
        go func(url string) {
            defer wg.Done()
            defer func() { <-semaphore }()
            data, err := job.scrapeURL(ctx, url)
            if err != nil {
                // Report the error but keep processing the other URLs.
                job.Errors <- fmt.Errorf("failed to scrape %s: %w", url, err)
                return
            }
            job.Results <- data
        }(url)
    }
    return nil
}
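Run returns as soon as every worker has been started, so results and errors have to be consumed concurrently or the workers will block on their channel sends. A minimal collector might look like this (runAndCollect is a hypothetical helper, not part of the article's API):
func runAndCollect(ctx context.Context, job *ScrapingJob) ([]ScrapedData, []error) {
    var (
        results []ScrapedData
        errs    []error
        wg      sync.WaitGroup
    )

    wg.Add(2)
    go func() {
        defer wg.Done()
        for data := range job.Results {
            results = append(results, data)
        }
    }()
    go func() {
        defer wg.Done()
        for err := range job.Errors {
            errs = append(errs, err)
        }
    }()

    runErr := job.Run(ctx)
    wg.Wait() // the channels are closed by Run's cleanup goroutine once all workers finish
    if runErr != nil {
        errs = append(errs, runErr) // e.g. the context was cancelled before every URL was queued
    }
    return results, errs
}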
7. Monitor and Log Errors Effectively
Implement comprehensive logging and monitoring for error analysis:
package scraper
import (
    "errors"
    "log"
    "sync"
    "time"
)
type ErrorStats struct {
mu sync.RWMutex
errorCounts map[ErrorType]int
lastErrors []ErrorRecord
maxRecords int
}
type ErrorRecord struct {
Timestamp time.Time
URL string
Error error
Type ErrorType
}
func NewErrorStats(maxRecords int) *ErrorStats {
return &ErrorStats{
errorCounts: make(map[ErrorType]int),
lastErrors: make([]ErrorRecord, 0, maxRecords),
maxRecords: maxRecords,
}
}
func (es *ErrorStats) RecordError(url string, err error) {
es.mu.Lock()
defer es.mu.Unlock()
// Use errors.As so wrapped *ScrapingError values are still classified correctly.
errType := ErrorTypeNetwork
var scrapingErr *ScrapingError
if errors.As(err, &scrapingErr) {
    errType = scrapingErr.Type
}
es.errorCounts[errType]++
record := ErrorRecord{
Timestamp: time.Now(),
URL: url,
Error: err,
Type: errType,
}
es.lastErrors = append(es.lastErrors, record)
if len(es.lastErrors) > es.maxRecords {
es.lastErrors = es.lastErrors[1:]
}
// Log the error
log.Printf("Scraping error [%s] for %s: %v", errType, url, err)
}
func (es *ErrorStats) GetStats() map[ErrorType]int {
es.mu.RLock()
defer es.mu.RUnlock()
stats := make(map[ErrorType]int)
for k, v := range es.errorCounts {
stats[k] = v
}
return stats
}
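As a sketch of how the stats plug into a simple sequential loop, using the ScrapeWithTimeout method from section 5 (scrapeAll is a hypothetical helper, and skipping failed URLs is the policy assumed here):
func (s *Scraper) scrapeAll(urls []string, stats *ErrorStats) []ScrapedData {
    var results []ScrapedData
    for _, url := range urls {
        data, err := s.ScrapeWithTimeout(url, 30*time.Second)
        if err != nil {
            stats.RecordError(url, err) // classify, count, and log, then move on
            continue
        }
        results = append(results, *data)
    }
    return results
}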
8. Handle Common Error Scenarios
Network Connectivity Issues
func (s *Scraper) handleNetworkError(err error, url string) *ScrapingError {
    // errors.As also matches net.Error values that the http.Client wraps in *url.Error.
    var netErr net.Error
    if errors.As(err, &netErr) && netErr.Timeout() {
        return &ScrapingError{
            Type:    ErrorTypeTimeout,
            URL:     url,
            Message: "network timeout",
            Cause:   err,
        }
    }
return &ScrapingError{
Type: ErrorTypeNetwork,
URL: url,
Message: "network connectivity issue",
Cause: err,
}
}
Rate Limiting Response
func (s *Scraper) handleRateLimit(resp *http.Response, url string) error {
retryAfter := resp.Header.Get("Retry-After")
if retryAfter == "" {
retryAfter = "unknown"
}
return &ScrapingError{
Type: ErrorTypeRateLimit,
URL: url,
Message: fmt.Sprintf("rate limited, retry after: %s", retryAfter),
}
}
Content Parsing Errors
func (s *Scraper) safeParseHTML(content []byte, url string) (doc *html.Node, err error) {
    // Recover from panics in parsing code and surface them as a typed error
    // instead of silently returning nil, nil.
    defer func() {
        if r := recover(); r != nil {
            log.Printf("Panic while parsing HTML for %s: %v", url, r)
            doc = nil
            err = &ScrapingError{
                Type:    ErrorTypeParsing,
                URL:     url,
                Message: fmt.Sprintf("panic while parsing HTML: %v", r),
            }
        }
    }()
    doc, err = html.Parse(bytes.NewReader(content))
    if err != nil {
        return nil, &ScrapingError{
            Type:    ErrorTypeParsing,
            URL:     url,
            Message: "failed to parse HTML content",
            Cause:   err,
        }
    }
    return doc, nil
}
9. Error Handling Best Practices Summary
Error Classification Strategy
- Network Errors: Connection timeouts, DNS failures - usually retryable
- HTTP Errors: 4xx (client errors) typically not retryable, 5xx (server errors) often retryable
- Parsing Errors: Usually indicate site structure changes - log and skip
- Rate Limiting: Respect retry-after headers and implement proper backoff
Retry Implementation Guidelines
- Use exponential backoff with jitter to avoid thundering herd problems (see the jitter sketch after this list)
- Set maximum retry limits to prevent infinite loops
- Only retry appropriate error types based on context
- Pair retries with per-attempt and overall timeouts so a single URL cannot stall the scraper indefinitely
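The calculateBackoffDelay helper in section 2 uses plain exponential backoff. A variant with full jitter, as a sketch of the first guideline above (rand.Float64 comes from math/rand, which is auto-seeded since Go 1.20):
// calculateBackoffDelayWithJitter returns a random duration in [0, backoff) so many
// workers retrying against the same host do not retry in lockstep.
func (s *Scraper) calculateBackoffDelayWithJitter(attempt int, config RetryConfig) time.Duration {
    backoff := float64(config.BaseDelay) * math.Pow(config.BackoffFactor, float64(attempt-1))
    if backoff > float64(config.MaxDelay) {
        backoff = float64(config.MaxDelay)
    }
    return time.Duration(rand.Float64() * backoff)
}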
Resource Management
- Always use context with timeouts for HTTP requests
- Properly close HTTP response bodies to prevent resource leaks
- Implement connection pooling and reasonable per-host connection limits (see the client sketch after this list)
- Monitor memory usage during large-scale scraping operations
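A sketch of a tuned http.Client for the pooling point above; the specific limits are illustrative starting points rather than recommendations for every target site:
func newPooledClient() *http.Client {
    transport := &http.Transport{
        MaxIdleConns:        100,              // idle connections kept across all hosts
        MaxIdleConnsPerHost: 10,               // idle connections kept per host
        MaxConnsPerHost:     20,               // hard cap on concurrent connections per host
        IdleConnTimeout:     90 * time.Second, // recycle idle connections after this long
    }
    return &http.Client{
        Transport: transport,
        Timeout:   30 * time.Second, // overall per-request timeout, including body read
    }
}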
Monitoring and Observability
- Track error rates by type and URL pattern
- Set up alerts for unusual error patterns or spikes
- Log sufficient context for debugging without exposing sensitive data
- Implement health checks and status endpoints for scraping services (one possible shape is sketched below)
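One possible shape for those endpoints, reusing the ErrorStats type from section 7; the routes, port, and JSON layout are assumptions, and the snippet expects encoding/json, fmt, log, and net/http to be imported:
func serveStatus(stats *ErrorStats) {
    http.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) {
        fmt.Fprintln(w, "ok") // liveness only; extend with real checks as needed
    })
    http.HandleFunc("/errors", func(w http.ResponseWriter, r *http.Request) {
        counts := make(map[string]int)
        for errType, n := range stats.GetStats() {
            counts[errType.String()] = n
        }
        w.Header().Set("Content-Type", "application/json")
        _ = json.NewEncoder(w).Encode(counts)
    })
    go func() {
        if err := http.ListenAndServe(":8080", nil); err != nil {
            log.Printf("status server stopped: %v", err)
        }
    }()
}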
Production Considerations
- Implement graceful shutdown so in-flight requests can finish and resources such as browser sessions and connections are released cleanly (see the sketch after this list)
- Use structured logging for better error analysis
- Consider implementing distributed tracing for complex scraping workflows
- Set up proper monitoring dashboards for error tracking
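One common way to implement that shutdown is to derive the scraping context from OS signals, so Ctrl-C or a SIGTERM from your orchestrator cancels in-flight work through the same context plumbing used throughout this article. This sketch reuses the hypothetical runAndCollect helper from section 6 and assumes context, os, os/signal, and syscall are imported:
// RunUntilSignalled cancels scraping when the process receives SIGINT or SIGTERM;
// workers stop at their next context check and the job's channels are still closed.
func RunUntilSignalled(job *ScrapingJob) ([]ScrapedData, []error) {
    ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
    defer stop()
    return runAndCollect(ctx, job)
}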
By following these error handling practices, your Go web scraping applications will be more resilient, maintainable, and better able to cope with the unpredictable nature of web scraping at scale. Test your error handling logic thoroughly and monitor your scrapers in production so you can identify and fix issues before they affect your data collection.