How do I implement logging in Go web scraping projects?

Effective logging is crucial for debugging, monitoring, and maintaining Go web scraping applications. Whether you're tracking HTTP requests, debugging parsing errors, or monitoring performance metrics, proper logging helps you understand your scraper's behavior and troubleshoot issues efficiently.

Built-in log/slog Package (Go 1.21+)

Go 1.21 introduced the structured logging package log/slog, which provides excellent performance and built-in JSON output:

package main

import (
    "io"
    "log/slog"
    "net/http"
    "os"
    "time"
)

type Scraper struct {
    logger *slog.Logger
    client *http.Client
}

func NewScraper() *Scraper {
    // Configure structured logger with JSON output
    logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
        Level: slog.LevelDebug,
        AddSource: true,
    }))

    return &Scraper{
        logger: logger,
        client: &http.Client{
            Timeout: 30 * time.Second,
        },
    }
}

func (s *Scraper) FetchPage(url string) ([]byte, error) {
    start := time.Now()

    s.logger.Info("Starting page fetch",
        slog.String("url", url),
        slog.Time("timestamp", start),
    )

    req, err := http.NewRequest("GET", url, nil)
    if err != nil {
        s.logger.Error("Failed to create request",
            slog.String("url", url),
            slog.String("error", err.Error()),
        )
        return nil, err
    }

    // Identify the scraper with a descriptive User-Agent
    req.Header.Set("User-Agent", "GoScraper/1.0")

    resp, err := s.client.Do(req)
    if err != nil {
        s.logger.Error("HTTP request failed",
            slog.String("url", url),
            slog.String("error", err.Error()),
            slog.Duration("duration", time.Since(start)),
        )
        return nil, err
    }
    defer resp.Body.Close()

    // Log response details
    s.logger.Info("Page fetched successfully",
        slog.String("url", url),
        slog.Int("status_code", resp.StatusCode),
        slog.String("content_type", resp.Header.Get("Content-Type")),
        slog.Int64("content_length", resp.ContentLength),
        slog.Duration("duration", time.Since(start)),
    )

    // Read the full response body (Content-Length may be -1, so use io.ReadAll)
    body, err := io.ReadAll(resp.Body)
    if err != nil {
        s.logger.Warn("Failed to read response body completely",
            slog.String("url", url),
            slog.String("error", err.Error()),
        )
        return nil, err
    }

    return body, nil
}
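
A minimal usage sketch of the scraper above (the target URL is only an example):

func main() {
    scraper := NewScraper()

    // Example target; replace with the pages you actually scrape
    body, err := scraper.FetchPage("https://example.com")
    if err != nil {
        scraper.logger.Error("Fetch failed", slog.String("error", err.Error()))
        os.Exit(1)
    }

    scraper.logger.Info("Fetched page", slog.Int("bytes", len(body)))
}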

Using Logrus for Advanced Features

Logrus offers additional features such as hooks and custom formatters, and pairs well with lumberjack for log rotation:

package main

import (
    "fmt"
    "io"
    "net/http"
    "time"

    "github.com/sirupsen/logrus"
    "gopkg.in/natefinch/lumberjack.v2"
)

type ScraperWithLogrus struct {
    logger *logrus.Logger
    client *http.Client
}

func NewScraperWithLogrus() *ScraperWithLogrus {
    logger := logrus.New()

    // Configure log rotation
    logger.SetOutput(&lumberjack.Logger{
        Filename:   "scraper.log",
        MaxSize:    100, // MB
        MaxBackups: 3,
        MaxAge:     28, // days
        Compress:   true,
    })

    // Use JSON formatter for structured logs
    logger.SetFormatter(&logrus.JSONFormatter{
        TimestampFormat: time.RFC3339,
    })

    logger.SetLevel(logrus.DebugLevel)

    return &ScraperWithLogrus{
        logger: logger,
        client: &http.Client{Timeout: 30 * time.Second},
    }
}

func (s *ScraperWithLogrus) ScrapeWithRetry(url string, maxRetries int) ([]byte, error) {
    for attempt := 1; attempt <= maxRetries; attempt++ {
        s.logger.WithFields(logrus.Fields{
            "url":         url,
            "attempt":     attempt,
            "max_retries": maxRetries,
        }).Info("Starting scrape attempt")

        body, err := s.fetchPage(url)
        if err == nil {
            s.logger.WithFields(logrus.Fields{
                "url":     url,
                "attempt": attempt,
                "success": true,
            }).Info("Scrape completed successfully")
            return body, nil
        }

        s.logger.WithFields(logrus.Fields{
            "url":     url,
            "attempt": attempt,
            "error":   err.Error(),
        }).Warn("Scrape attempt failed")

        if attempt < maxRetries {
            backoff := time.Duration(attempt) * time.Second
            s.logger.WithFields(logrus.Fields{
                "url":             url,
                "backoff_seconds": backoff.Seconds(),
            }).Info("Waiting before retry")
            time.Sleep(backoff)
        }
    }

    s.logger.WithFields(logrus.Fields{
        "url":      url,
        "attempts": maxRetries,
    }).Error("All scrape attempts failed")

    return nil, fmt.Errorf("failed to scrape %s after %d attempts", url, maxRetries)
}

func (s *ScraperWithLogrus) fetchPage(url string) ([]byte, error) {
    // Implementation similar to previous example
    resp, err := s.client.Get(url)
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()

    body, err := io.ReadAll(resp.Body)
    return body, err
}
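
Usage might look like this (the URL and retry count are illustrative):

func main() {
    scraper := NewScraperWithLogrus()

    body, err := scraper.ScrapeWithRetry("https://example.com", 3)
    if err != nil {
        scraper.logger.WithError(err).Fatal("Giving up")
    }

    scraper.logger.WithField("bytes", len(body)).Info("Done")
}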

High-Performance Logging with Zap

For high-throughput scraping applications, Zap provides excellent performance:

package main

import (
    "net/http"
    "sync"
    "time"

    "go.uber.org/zap"
    "go.uber.org/zap/zapcore"
)

type HighPerformanceScraper struct {
    logger *zap.Logger
    client *http.Client
}

func NewHighPerformanceScraper() (*HighPerformanceScraper, error) {
    // Configure high-performance logger
    config := zap.Config{
        Level:       zap.NewAtomicLevelAt(zap.InfoLevel),
        Development: false,
        Sampling: &zap.SamplingConfig{
            Initial:    100,
            Thereafter: 100,
        },
        Encoding: "json",
        EncoderConfig: zapcore.EncoderConfig{
            TimeKey:        "timestamp",
            LevelKey:       "level",
            NameKey:        "logger",
            CallerKey:      "caller",
            MessageKey:     "msg",
            StacktraceKey:  "stacktrace",
            LineEnding:     zapcore.DefaultLineEnding,
            EncodeLevel:    zapcore.LowercaseLevelEncoder,
            EncodeTime:     zapcore.ISO8601TimeEncoder,
            EncodeDuration: zapcore.SecondsDurationEncoder,
            EncodeCaller:   zapcore.ShortCallerEncoder,
        },
        OutputPaths:      []string{"stdout", "scraper.log"},
        ErrorOutputPaths: []string{"stderr"},
    }

    logger, err := config.Build()
    if err != nil {
        return nil, err
    }

    return &HighPerformanceScraper{
        logger: logger,
        client: &http.Client{Timeout: 30 * time.Second},
    }, nil
}

func (s *HighPerformanceScraper) ScrapeConcurrently(urls []string, concurrency int) {
    semaphore := make(chan struct{}, concurrency)
    var wg sync.WaitGroup

    s.logger.Info("Starting concurrent scraping",
        zap.Int("total_urls", len(urls)),
        zap.Int("concurrency", concurrency),
    )

    for _, url := range urls {
        wg.Add(1)
        go func(u string) {
            defer wg.Done()
            semaphore <- struct{}{}        // Acquire
            defer func() { <-semaphore }() // Release

            start := time.Now()
            err := s.scrapeURL(u)
            duration := time.Since(start)

            if err != nil {
                s.logger.Error("Scraping failed",
                    zap.String("url", u),
                    zap.Error(err),
                    zap.Duration("duration", duration),
                )
            } else {
                s.logger.Info("Scraping completed",
                    zap.String("url", u),
                    zap.Duration("duration", duration),
                )
            }
        }(url)
    }

    // Wait for all in-flight scrapes to finish before returning
    wg.Wait()
}

func (s *HighPerformanceScraper) scrapeURL(url string) error {
    resp, err := s.client.Get(url)
    if err != nil {
        return err
    }
    defer resp.Body.Close()
    return nil
}

// Don't forget to flush buffered logs on shutdown
func (s *HighPerformanceScraper) Close() {
    _ = s.logger.Sync()
}
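
A short sketch of tying this together (the URL list is illustrative):

func main() {
    scraper, err := NewHighPerformanceScraper()
    if err != nil {
        panic(err)
    }
    defer scraper.Close() // flush buffered log entries on exit

    urls := []string{
        "https://example.com/page1",
        "https://example.com/page2",
    }
    scraper.ScrapeConcurrently(urls, 5)
}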

Logging Best Practices for Web Scraping

1. Request/Response Logging

Always log essential HTTP request and response information:

func (s *Scraper) logHTTPTransaction(req *http.Request, resp *http.Response, duration time.Duration, err error) {
    if err != nil {
        s.logger.Error("HTTP request failed",
            slog.String("method", req.Method),
            slog.String("url", req.URL.String()),
            slog.String("user_agent", req.Header.Get("User-Agent")),
            slog.Duration("duration", duration),
            slog.String("error", err.Error()),
        )
    } else {
        s.logger.Info("HTTP request completed",
            slog.String("method", req.Method),
            slog.String("url", req.URL.String()),
            slog.Int("status_code", resp.StatusCode),
            slog.Int64("content_length", resp.ContentLength),
            slog.String("content_type", resp.Header.Get("Content-Type")),
            slog.Duration("duration", duration),
        )
    }
}
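
One way to apply this logging consistently is to wrap the HTTP client's transport so every request is logged automatically. This is a sketch, not part of the original example:

type loggingTransport struct {
    logger *slog.Logger
    next   http.RoundTripper
}

func (t *loggingTransport) RoundTrip(req *http.Request) (*http.Response, error) {
    start := time.Now()
    resp, err := t.next.RoundTrip(req)
    duration := time.Since(start)

    if err != nil {
        t.logger.Error("HTTP request failed",
            slog.String("method", req.Method),
            slog.String("url", req.URL.String()),
            slog.Duration("duration", duration),
            slog.String("error", err.Error()),
        )
        return nil, err
    }

    t.logger.Info("HTTP request completed",
        slog.String("method", req.Method),
        slog.String("url", req.URL.String()),
        slog.Int("status_code", resp.StatusCode),
        slog.Duration("duration", duration),
    )
    return resp, nil
}

// Usage:
// client := &http.Client{Transport: &loggingTransport{logger: logger, next: http.DefaultTransport}}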

2. Rate Limiting and Delay Logging

Track rate limiting and delays to monitor scraping behavior:

type RateLimitedScraper struct {
    logger      *slog.Logger
    lastRequest time.Time
    minDelay    time.Duration
}

func (s *RateLimitedScraper) FetchWithRateLimit(url string) error {
    // Calculate required delay
    elapsed := time.Since(s.lastRequest)
    if elapsed < s.minDelay {
        delay := s.minDelay - elapsed
        s.logger.Info("Rate limiting delay",
            slog.Duration("delay", delay),
            slog.String("url", url),
        )
        time.Sleep(delay)
    }

    s.lastRequest = time.Now()
    // Proceed with request...
    return nil
}
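
A minimal constructor and usage sketch (the one-second delay is just an example value):

func NewRateLimitedScraper(logger *slog.Logger, minDelay time.Duration) *RateLimitedScraper {
    return &RateLimitedScraper{
        logger:   logger,
        minDelay: minDelay,
    }
}

// Example: enforce at least one second between requests
// scraper := NewRateLimitedScraper(logger, time.Second)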

3. Error Context and Recovery

Provide detailed error context for debugging:

func (s *Scraper) ParseHTML(html []byte, url string) (data map[string]string, err error) {
    defer func() {
        if r := recover(); r != nil {
            s.logger.Error("HTML parsing panic recovered",
                slog.String("url", url),
                slog.Int("html_size", len(html)),
                slog.Any("panic", r),
            )
            err = fmt.Errorf("parsing panic: %v", r)
        }
    }()

    // HTML parsing logic here
    data = make(map[string]string)

    s.logger.Debug("HTML parsing completed",
        slog.String("url", url),
        slog.Int("fields_extracted", len(data)),
    )

    return data, nil
}
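
The parsing step itself is elided above; a minimal sketch using the golang.org/x/net/html package (an assumption, extracting only the page title) might look like this:

import (
    "bytes"

    "golang.org/x/net/html"
)

func extractTitle(htmlBytes []byte) (string, error) {
    doc, err := html.Parse(bytes.NewReader(htmlBytes))
    if err != nil {
        return "", err
    }

    var title string
    var walk func(*html.Node)
    walk = func(n *html.Node) {
        // Capture the text of the first <title> element found
        if n.Type == html.ElementNode && n.Data == "title" && n.FirstChild != nil {
            title = n.FirstChild.Data
            return
        }
        for c := n.FirstChild; c != nil; c = c.NextSibling {
            walk(c)
        }
    }
    walk(doc)
    return title, nil
}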

Monitoring and Metrics

Combine logging with metrics for comprehensive monitoring:

type MetricsCollector struct {
    logger        *slog.Logger
    requestCount  int64
    errorCount    int64
    totalDuration time.Duration
}

func (m *MetricsCollector) LogMetrics() {
    avgDuration := time.Duration(0)
    errorRate := 0.0
    if m.requestCount > 0 {
        avgDuration = m.totalDuration / time.Duration(m.requestCount)
        errorRate = float64(m.errorCount) / float64(m.requestCount) * 100
    }

    m.logger.Info("Scraping metrics",
        slog.Int64("total_requests", m.requestCount),
        slog.Int64("total_errors", m.errorCount),
        slog.Float64("error_rate_percent", errorRate),
        slog.Duration("average_duration", avgDuration),
    )
}
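
The collector also needs to be fed after each request; a minimal recording method (not shown in the original) could look like this:

// RecordRequest updates the counters after each fetch. If the collector is
// shared across goroutines, guard these updates with a sync.Mutex or the
// sync/atomic package; this sketch assumes single-goroutine use.
func (m *MetricsCollector) RecordRequest(duration time.Duration, err error) {
    m.requestCount++
    m.totalDuration += duration
    if err != nil {
        m.errorCount++
    }
}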

Context-Aware Logging

Use Go's context package for tracing requests across your application:

// Use a dedicated context key type to avoid collisions with other packages
type ctxKey string

const requestIDKey ctxKey = "request_id"

func (s *Scraper) FetchWithContext(ctx context.Context, url string) ([]byte, error) {
    // Extract request ID from context for correlation
    requestID := ctx.Value(requestIDKey)

    s.logger.Info("Starting fetch with context",
        slog.String("url", url),
        slog.Any("request_id", requestID),
    )

    req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
    if err != nil {
        s.logger.Error("Failed to create request",
            slog.String("url", url),
            slog.Any("request_id", requestID),
            slog.String("error", err.Error()),
        )
        return nil, err
    }

    resp, err := s.client.Do(req)
    if err != nil {
        s.logger.Error("Request failed",
            slog.String("url", url),
            slog.Any("request_id", requestID),
            slog.String("error", err.Error()),
        )
        return nil, err
    }
    defer resp.Body.Close()

    // Read and return the full response body
    body, err := io.ReadAll(resp.Body)
    return body, err
}
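
To correlate log lines, the caller has to put the ID into the context first. A small sketch using the requestIDKey defined above; the ID format here is just an example:

func (s *Scraper) FetchAll(urls []string) {
    for i, url := range urls {
        // Attach a simple correlation ID; any unique string works
        requestID := fmt.Sprintf("req-%d-%d", time.Now().UnixNano(), i)
        ctx := context.WithValue(context.Background(), requestIDKey, requestID)

        if _, err := s.FetchWithContext(ctx, url); err != nil {
            s.logger.Error("Fetch failed",
                slog.String("url", url),
                slog.String("request_id", requestID),
            )
        }
    }
}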

Configuration and Environment Setup

Set up logging configuration that adapts to different environments:

func SetupLogger(env string) *slog.Logger {
    var handler slog.Handler

    opts := &slog.HandlerOptions{
        AddSource: true,
    }

    switch env {
    case "development":
        opts.Level = slog.LevelDebug
        handler = slog.NewTextHandler(os.Stdout, opts)
    case "production":
        opts.Level = slog.LevelInfo
        handler = slog.NewJSONHandler(os.Stdout, opts)
    default:
        opts.Level = slog.LevelWarn
        handler = slog.NewJSONHandler(os.Stdout, opts)
    }

    return slog.New(handler)
}

// Usage example
func main() {
    env := os.Getenv("ENVIRONMENT")
    if env == "" {
        env = "development"
    }

    logger := SetupLogger(env)
    scraper := &Scraper{
        logger: logger,
        client: &http.Client{Timeout: 30 * time.Second},
    }
    // Use scraper...
}

Log Sampling and Performance

For high-volume scrapers, implement log sampling to reduce overhead:

type SampledLogger struct {
    logger     *slog.Logger
    sampleRate int
    counter    int64
}

func NewSampledLogger(logger *slog.Logger, sampleRate int) *SampledLogger {
    return &SampledLogger{
        logger:     logger,
        sampleRate: sampleRate,
    }
}

func (s *SampledLogger) LogIfSampled(level slog.Level, msg string, args ...any) {
    // Increment atomically so sampling stays correct across goroutines
    n := atomic.AddInt64(&s.counter, 1)
    if n%int64(s.sampleRate) == 0 {
        s.logger.Log(context.Background(), level, msg, args...)
    }
}
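
For example, logging only every 100th successful fetch inside a hot loop (the fetch step is elided):

func logSampledFetches(logger *slog.Logger, urls []string) {
    sampled := NewSampledLogger(logger, 100)

    for _, url := range urls {
        // ...fetch the page here...

        // Only 1 in 100 of these calls actually emits a log line
        sampled.LogIfSampled(slog.LevelInfo, "Page fetched",
            slog.String("url", url),
        )
    }
}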

Integration with External Services

For production deployments, consider integrating with logging services:

# Install required packages
go get go.uber.org/zap
go get github.com/sirupsen/logrus
go get gopkg.in/natefinch/lumberjack.v2

# For structured logging analysis
go get github.com/elastic/go-elasticsearch/v8
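
As a rough sketch of shipping log documents to Elasticsearch with the go-elasticsearch client; the index name, cluster address, and document shape here are assumptions, not a prescribed setup:

type LogShipper struct {
    es    *elasticsearch.Client
    index string
}

func NewLogShipper(index string) (*LogShipper, error) {
    es, err := elasticsearch.NewClient(elasticsearch.Config{
        Addresses: []string{"http://localhost:9200"}, // assumed local cluster
    })
    if err != nil {
        return nil, err
    }
    return &LogShipper{es: es, index: index}, nil
}

// Ship indexes a single structured log entry as a JSON document.
func (l *LogShipper) Ship(level, msg string, fields map[string]any) error {
    doc := map[string]any{
        "timestamp": time.Now().Format(time.RFC3339),
        "level":     level,
        "msg":       msg,
    }
    for k, v := range fields {
        doc[k] = v
    }

    body, err := json.Marshal(doc)
    if err != nil {
        return err
    }

    res, err := l.es.Index(l.index, bytes.NewReader(body))
    if err != nil {
        return err
    }
    defer res.Body.Close()

    if res.IsError() {
        return fmt.Errorf("elasticsearch indexing failed: %s", res.Status())
    }
    return nil
}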

Example integration with structured logging for monitoring:

type StructuredScraper struct {
    logger     *slog.Logger
    client     *http.Client
    sessionID  string
}

func (s *StructuredScraper) LogScrapingSession(urls []string, results []ScrapingResult) {
    s.logger.Info("Scraping session completed",
        slog.String("session_id", s.sessionID),
        slog.Int("urls_requested", len(urls)),
        slog.Int("successful_scrapes", countSuccessful(results)),
        slog.Int("failed_scrapes", countFailed(results)),
        slog.Time("session_end", time.Now()),
    )
}

type ScrapingResult struct {
    URL     string
    Success bool
    Error   error
}

func countSuccessful(results []ScrapingResult) int {
    count := 0
    for _, r := range results {
        if r.Success {
            count++
        }
    }
    return count
}

func countFailed(results []ScrapingResult) int {
    count := 0
    for _, r := range results {
        if !r.Success {
            count++
        }
    }
    return count
}

Effective logging in Go web scraping projects enables you to monitor performance, debug issues, and maintain reliable scrapers. Whether you choose the built-in log/slog package for simplicity, Logrus for features, or Zap for performance, consistent structured logging will significantly improve your scraping application's maintainability and observability.

Remember to balance logging verbosity with performance requirements, especially in high-throughput scraping scenarios, and always ensure sensitive data is properly sanitized before logging.
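
One way to enforce that sanitization with slog is a ReplaceAttr hook that redacts known-sensitive attributes before they are written; the attribute names here (api_key, authorization, and so on) are examples to adapt to your own field names:

func NewRedactingLogger() *slog.Logger {
    opts := &slog.HandlerOptions{
        Level: slog.LevelInfo,
        ReplaceAttr: func(groups []string, a slog.Attr) slog.Attr {
            // Redact attributes whose keys look sensitive; extend this list as needed
            switch a.Key {
            case "api_key", "authorization", "cookie", "password":
                return slog.String(a.Key, "[REDACTED]")
            }
            return a
        },
    }
    return slog.New(slog.NewJSONHandler(os.Stdout, opts))
}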

Try WebScraping.AI for Your Web Scraping Needs

Looking for a powerful web scraping solution? WebScraping.AI provides an LLM-powered API that combines Chromium JavaScript rendering with rotating proxies for reliable data extraction.

Key Features:

  • AI-powered extraction: Ask questions about web pages or extract structured data fields
  • JavaScript rendering: Full Chromium browser support for dynamic content
  • Rotating proxies: Datacenter and residential proxies from multiple countries
  • Easy integration: Simple REST API with SDKs for Python, Ruby, PHP, and more
  • Reliable & scalable: Built for developers who need consistent results

Getting Started:

Get page content with AI analysis:

curl "https://api.webscraping.ai/ai/question?url=https://example.com&question=What is the main topic?&api_key=YOUR_API_KEY"

Extract structured data:

curl "https://api.webscraping.ai/ai/fields?url=https://example.com&fields[title]=Page title&fields[price]=Product price&api_key=YOUR_API_KEY"
