How do I implement logging in Go web scraping projects?
Effective logging is crucial for debugging, monitoring, and maintaining Go web scraping applications. Whether you're tracking HTTP requests, debugging parsing errors, or monitoring performance metrics, proper logging helps you understand your scraper's behavior and troubleshoot issues efficiently.
Built-in log/slog Package (Go 1.21+)
Go 1.21 introduced the structured logging package log/slog, which provides excellent performance and built-in JSON output:
package main
import (
"io"
"log/slog"
"net/http"
"os"
"time"
)
type Scraper struct {
logger *slog.Logger
client *http.Client
}
func NewScraper() *Scraper {
// Configure structured logger with JSON output
logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
Level: slog.LevelDebug,
AddSource: true,
}))
return &Scraper{
logger: logger,
client: &http.Client{
Timeout: 30 * time.Second,
},
}
}
func (s *Scraper) FetchPage(url string) ([]byte, error) {
start := time.Now()
s.logger.Info("Starting page fetch",
slog.String("url", url),
slog.Time("timestamp", start),
)
req, err := http.NewRequest("GET", url, nil)
if err != nil {
s.logger.Error("Failed to create request",
slog.String("url", url),
slog.String("error", err.Error()),
)
return nil, err
}
// Identify the scraper with a descriptive User-Agent header
req.Header.Set("User-Agent", "GoScraper/1.0")
resp, err := s.client.Do(req)
if err != nil {
s.logger.Error("HTTP request failed",
slog.String("url", url),
slog.String("error", err.Error()),
slog.Duration("duration", time.Since(start)),
)
return nil, err
}
defer resp.Body.Close()
// Log response details
s.logger.Info("Page fetched successfully",
slog.String("url", url),
slog.Int("status_code", resp.StatusCode),
slog.String("content_type", resp.Header.Get("Content-Type")),
slog.Int64("content_length", resp.ContentLength),
slog.Duration("duration", time.Since(start)),
)
// Read the full body; ContentLength may be -1, so io.ReadAll is safer than a preallocated buffer
body, err := io.ReadAll(resp.Body)
if err != nil {
s.logger.Warn("Failed to read response body completely",
slog.String("url", url),
slog.String("error", err.Error()),
)
return nil, err
}
return body, nil
}
Using Logrus for Advanced Features
Logrus provides more features like hooks, formatters, and log rotation:
package main
import (
"fmt"
"io"
"net/http"
"time"
"github.com/sirupsen/logrus"
"gopkg.in/natefinch/lumberjack.v2"
)
type ScraperWithLogrus struct {
logger *logrus.Logger
client *http.Client
}
func NewScraperWithLogrus() *ScraperWithLogrus {
logger := logrus.New()
// Configure log rotation
logger.SetOutput(&lumberjack.Logger{
Filename: "scraper.log",
MaxSize: 100, // MB
MaxBackups: 3,
MaxAge: 28, // days
Compress: true,
})
// Use JSON formatter for structured logs
logger.SetFormatter(&logrus.JSONFormatter{
TimestampFormat: time.RFC3339,
})
logger.SetLevel(logrus.DebugLevel)
return &ScraperWithLogrus{
logger: logger,
client: &http.Client{Timeout: 30 * time.Second},
}
}
func (s *ScraperWithLogrus) ScrapeWithRetry(url string, maxRetries int) ([]byte, error) {
for attempt := 1; attempt <= maxRetries; attempt++ {
s.logger.WithFields(logrus.Fields{
"url": url,
"attempt": attempt,
"max_retries": maxRetries,
}).Info("Starting scrape attempt")
body, err := s.fetchPage(url)
if err == nil {
s.logger.WithFields(logrus.Fields{
"url": url,
"attempt": attempt,
"success": true,
}).Info("Scrape completed successfully")
return body, nil
}
s.logger.WithFields(logrus.Fields{
"url": url,
"attempt": attempt,
"error": err.Error(),
}).Warn("Scrape attempt failed")
if attempt < maxRetries {
backoff := time.Duration(attempt) * time.Second
s.logger.WithFields(logrus.Fields{
"url": url,
"backoff_seconds": backoff.Seconds(),
}).Info("Waiting before retry")
time.Sleep(backoff)
}
}
s.logger.WithFields(logrus.Fields{
"url": url,
"attempts": maxRetries,
}).Error("All scrape attempts failed")
return nil, fmt.Errorf("failed to scrape %s after %d attempts", url, maxRetries)
}
func (s *ScraperWithLogrus) fetchPage(url string) ([]byte, error) {
// Implementation similar to previous example
resp, err := s.client.Get(url)
if err != nil {
return nil, err
}
defer resp.Body.Close()
// Read the entire body; a fixed-size buffer would truncate most pages
return io.ReadAll(resp.Body)
}
High-Performance Logging with Zap
For high-throughput scraping applications, Zap provides excellent performance:
package main
import (
"net/http"
"sync"
"time"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
)
type HighPerformanceScraper struct {
logger *zap.Logger
client *http.Client
}
func NewHighPerformanceScraper() (*HighPerformanceScraper, error) {
// Configure high-performance logger
config := zap.Config{
Level: zap.NewAtomicLevelAt(zap.InfoLevel),
Development: false,
Sampling: &zap.SamplingConfig{
Initial: 100,
Thereafter: 100,
},
Encoding: "json",
EncoderConfig: zapcore.EncoderConfig{
TimeKey: "timestamp",
LevelKey: "level",
NameKey: "logger",
CallerKey: "caller",
MessageKey: "msg",
StacktraceKey: "stacktrace",
LineEnding: zapcore.DefaultLineEnding,
EncodeLevel: zapcore.LowercaseLevelEncoder,
EncodeTime: zapcore.ISO8601TimeEncoder,
EncodeDuration: zapcore.SecondsDurationEncoder,
EncodeCaller: zapcore.ShortCallerEncoder,
},
OutputPaths: []string{"stdout", "scraper.log"},
ErrorOutputPaths: []string{"stderr"},
}
logger, err := config.Build()
if err != nil {
return nil, err
}
return &HighPerformanceScraper{
logger: logger,
client: &http.Client{Timeout: 30 * time.Second},
}, nil
}
func (s *HighPerformanceScraper) ScrapeConcurrently(urls []string, concurrency int) {
var wg sync.WaitGroup
semaphore := make(chan struct{}, concurrency)
s.logger.Info("Starting concurrent scraping",
zap.Int("total_urls", len(urls)),
zap.Int("concurrency", concurrency),
)
for _, url := range urls {
wg.Add(1)
go func(u string) {
defer wg.Done()
semaphore <- struct{}{} // Acquire a slot
defer func() { <-semaphore }() // Release the slot
start := time.Now()
err := s.scrapeURL(u)
duration := time.Since(start)
if err != nil {
s.logger.Error("Scraping failed",
zap.String("url", u),
zap.Error(err),
zap.Duration("duration", duration),
)
} else {
s.logger.Info("Scraping completed",
zap.String("url", u),
zap.Duration("duration", duration),
)
}
}(url)
}
// Wait for all in-flight scrapes to finish before returning
wg.Wait()
}
func (s *HighPerformanceScraper) scrapeURL(url string) error {
resp, err := s.client.Get(url)
if err != nil {
return err
}
defer resp.Body.Close()
return nil
}
// Don't forget to flush logs on shutdown
func (s *HighPerformanceScraper) Close() {
s.logger.Sync()
}
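For reference, here is a minimal sketch of how this scraper might be wired up; the URLs are placeholders, and the main function is illustrative rather than part of the examples above:

func main() {
    scraper, err := NewHighPerformanceScraper()
    if err != nil {
        panic(err)
    }
    // Flush buffered log entries before the process exits
    defer scraper.Close()

    urls := []string{
        "https://example.com/page-1",
        "https://example.com/page-2",
    }
    // With the WaitGroup in ScrapeConcurrently, this call returns only after every URL is processed
    scraper.ScrapeConcurrently(urls, 5)
}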
Logging Best Practices for Web Scraping
1. Request/Response Logging
Always log essential HTTP request and response information:
func (s *Scraper) logHTTPTransaction(req *http.Request, resp *http.Response, duration time.Duration, err error) {
if err != nil {
s.logger.Error("HTTP request failed",
slog.String("method", req.Method),
slog.String("url", req.URL.String()),
slog.String("user_agent", req.Header.Get("User-Agent")),
slog.Duration("duration", duration),
slog.String("error", err.Error()),
)
} else {
s.logger.Info("HTTP request completed",
slog.String("method", req.Method),
slog.String("url", req.URL.String()),
slog.Int("status_code", resp.StatusCode),
slog.Int64("content_length", resp.ContentLength),
slog.String("content_type", resp.Header.Get("Content-Type")),
slog.Duration("duration", duration),
)
}
}
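One way to guarantee that every request passes through this kind of logging is to wrap the HTTP client's transport. The sketch below is an illustrative pattern rather than part of the examples above; loggingTransport is a hypothetical name, and the slog fields mirror those used earlier:

// loggingTransport wraps an http.RoundTripper and logs every request/response pair.
type loggingTransport struct {
    base   http.RoundTripper
    logger *slog.Logger
}

func (t *loggingTransport) RoundTrip(req *http.Request) (*http.Response, error) {
    start := time.Now()
    resp, err := t.base.RoundTrip(req)
    duration := time.Since(start)
    if err != nil {
        t.logger.Error("HTTP request failed",
            slog.String("method", req.Method),
            slog.String("url", req.URL.String()),
            slog.Duration("duration", duration),
            slog.String("error", err.Error()),
        )
        return nil, err
    }
    t.logger.Info("HTTP request completed",
        slog.String("method", req.Method),
        slog.String("url", req.URL.String()),
        slog.Int("status_code", resp.StatusCode),
        slog.Duration("duration", duration),
    )
    return resp, nil
}

// Usage: client := &http.Client{Transport: &loggingTransport{base: http.DefaultTransport, logger: logger}}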
2. Rate Limiting and Delay Logging
Track rate limiting and delays to monitor scraping behavior:
type RateLimitedScraper struct {
logger *slog.Logger
lastRequest time.Time
minDelay time.Duration
}
func (s *RateLimitedScraper) FetchWithRateLimit(url string) error {
// Calculate required delay
elapsed := time.Since(s.lastRequest)
if elapsed < s.minDelay {
delay := s.minDelay - elapsed
s.logger.Info("Rate limiting delay",
slog.Duration("delay", delay),
slog.String("url", url),
)
time.Sleep(delay)
}
s.lastRequest = time.Now()
// Proceed with request...
return nil
}
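For completeness, a possible constructor for this type (a hypothetical helper, not shown in the original snippet):

func NewRateLimitedScraper(logger *slog.Logger, minDelay time.Duration) *RateLimitedScraper {
    return &RateLimitedScraper{
        logger:   logger,
        minDelay: minDelay,
    }
}

// Usage: scraper := NewRateLimitedScraper(logger, 2*time.Second)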
3. Error Context and Recovery
Provide detailed error context for debugging:
func (s *Scraper) ParseHTML(html []byte, url string) (data map[string]string, err error) {
defer func() {
if r := recover(); r != nil {
s.logger.Error("HTML parsing panic recovered",
slog.String("url", url),
slog.Int("html_size", len(html)),
slog.Any("panic", r),
)
err = fmt.Errorf("parsing panic: %v", r)
}
}()
// HTML parsing logic here
data = make(map[string]string)
s.logger.Debug("HTML parsing completed",
slog.String("url", url),
slog.Int("fields_extracted", len(data)),
)
return data, nil
}
Monitoring and Metrics
Combine logging with metrics for comprehensive monitoring:
type MetricsCollector struct {
logger *slog.Logger
requestCount int64
errorCount int64
totalDuration time.Duration
}
func (m *MetricsCollector) LogMetrics() {
avgDuration := time.Duration(0)
errorRate := 0.0
if m.requestCount > 0 {
avgDuration = m.totalDuration / time.Duration(m.requestCount)
errorRate = float64(m.errorCount) / float64(m.requestCount) * 100
}
m.logger.Info("Scraping metrics",
slog.Int64("total_requests", m.requestCount),
slog.Int64("total_errors", m.errorCount),
slog.Float64("error_rate_percent", errorRate),
slog.Duration("average_duration", avgDuration),
)
}
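The collector above only reports the counters; one possible way to record them is sketched below. RecordRequest is a hypothetical helper, and if requests are recorded from multiple goroutines the fields should be guarded with a sync.Mutex or updated via sync/atomic (not shown here):

// RecordRequest updates the counters after each completed request.
func (m *MetricsCollector) RecordRequest(duration time.Duration, err error) {
    m.requestCount++
    m.totalDuration += duration
    if err != nil {
        m.errorCount++
    }
}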
Context-Aware Logging
Use Go's context package for tracing requests across your application:
func (s *Scraper) FetchWithContext(ctx context.Context, url string) ([]byte, error) {
// Extract request ID from context for correlation
requestID := ctx.Value("request_id")
s.logger.Info("Starting fetch with context",
slog.String("url", url),
slog.Any("request_id", requestID),
)
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
if err != nil {
s.logger.Error("Failed to create request",
slog.String("url", url),
slog.Any("request_id", requestID),
slog.String("error", err.Error()),
)
return nil, err
}
resp, err := s.client.Do(req)
if err != nil {
s.logger.Error("Request failed",
slog.String("url", url),
slog.Any("request_id", requestID),
slog.String("error", err.Error()),
)
return nil, err
}
defer resp.Body.Close()
// Read and return the full response body
return io.ReadAll(resp.Body)
}
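A caller is expected to place the request ID into the context before invoking the fetch. A minimal sketch is below; the ID value and helper name are placeholders, and the plain string key simply mirrors the lookup above (linters such as staticcheck usually recommend an unexported key type to avoid collisions):

func scrapeWithCorrelation(s *Scraper, url string) ([]byte, error) {
    // The key must match the one read by FetchWithContext above
    ctx := context.WithValue(context.Background(), "request_id", "req-20240101-0001")
    return s.FetchWithContext(ctx, url)
}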
Configuration and Environment Setup
Set up logging configuration that adapts to different environments:
func SetupLogger(env string) *slog.Logger {
var handler slog.Handler
opts := &slog.HandlerOptions{
AddSource: true,
}
switch env {
case "development":
opts.Level = slog.LevelDebug
handler = slog.NewTextHandler(os.Stdout, opts)
case "production":
opts.Level = slog.LevelInfo
handler = slog.NewJSONHandler(os.Stdout, opts)
default:
opts.Level = slog.LevelWarn
handler = slog.NewJSONHandler(os.Stdout, opts)
}
return slog.New(handler)
}
// Usage example
func main() {
env := os.Getenv("ENVIRONMENT")
if env == "" {
env = "development"
}
logger := SetupLogger(env)
scraper := &Scraper{logger: logger}
// Use scraper...
}
Log Sampling and Performance
For high-volume scrapers, implement log sampling to reduce overhead:
type SampledLogger struct {
logger *slog.Logger
sampleRate int
counter int64
}
func NewSampledLogger(logger *slog.Logger, sampleRate int) *SampledLogger {
return &SampledLogger{
logger: logger,
sampleRate: sampleRate,
}
}
func (s *SampledLogger) LogIfSampled(level slog.Level, msg string, args ...any) {
// Count calls atomically so sampling stays correct across goroutines
if atomic.AddInt64(&s.counter, 1)%int64(s.sampleRate) == 0 {
s.logger.Log(context.Background(), level, msg, args...)
}
}
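As a rough usage sketch (the function name and loop are illustrative), per-item debug lines can go through the sampler while errors still use the underlying logger directly:

func processURLs(logger *slog.Logger, urls []string) {
    // Only about one in every 100 per-URL debug entries is written
    sampled := NewSampledLogger(logger, 100)
    for _, u := range urls {
        sampled.LogIfSampled(slog.LevelDebug, "processing url", slog.String("url", u))
    }
}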
Integration with External Services
For production deployments, consider integrating with logging services:
# Install required packages
go get go.uber.org/zap
go get github.com/sirupsen/logrus
go get gopkg.in/natefinch/lumberjack.v2
# For structured logging analysis
go get github.com/elastic/go-elasticsearch/v8
Example integration with structured logging for monitoring:
type StructuredScraper struct {
logger *slog.Logger
client *http.Client
sessionID string
}
func (s *StructuredScraper) LogScrapingSession(urls []string, results []ScrapingResult) {
s.logger.Info("Scraping session completed",
slog.String("session_id", s.sessionID),
slog.Int("urls_requested", len(urls)),
slog.Int("successful_scrapes", countSuccessful(results)),
slog.Int("failed_scrapes", countFailed(results)),
slog.Time("session_end", time.Now()),
)
}
type ScrapingResult struct {
URL string
Success bool
Error error
}
func countSuccessful(results []ScrapingResult) int {
count := 0
for _, r := range results {
if r.Success {
count++
}
}
return count
}
func countFailed(results []ScrapingResult) int {
count := 0
for _, r := range results {
if !r.Success {
count++
}
}
return count
}
Effective logging in Go web scraping projects enables you to monitor performance, debug issues, and maintain reliable scrapers. Whether you choose the built-in log/slog package for simplicity, Logrus for features, or Zap for performance, consistent structured logging will significantly improve your scraping application's maintainability and observability.
Remember to balance logging verbosity with performance requirements, especially in high-throughput scraping scenarios, and always ensure sensitive data is properly sanitized before logging.
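For example, query strings and cookie headers often carry tokens or session IDs. One hedged approach is to strip them before a URL ever reaches the logger; sanitizeURL below is a hypothetical helper built on the standard net/url package:

// sanitizeURL removes query parameters, fragments, and user info so that
// credentials or session tokens never end up in log output.
func sanitizeURL(raw string) string {
    u, err := url.Parse(raw)
    if err != nil {
        return "invalid-url"
    }
    u.RawQuery = ""
    u.Fragment = ""
    u.User = nil
    return u.String()
}

// Usage: s.logger.Info("Page fetched", slog.String("url", sanitizeURL(rawURL)))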