What is the best way to monitor Go web scraping applications?

Monitoring Go web scraping applications is essential for maintaining reliability, performance, and data quality. Effective monitoring involves tracking performance metrics, health status, error rates, and resource utilization to ensure your scrapers operate efficiently and detect issues before they impact your data collection.

Core Monitoring Components

1. Health Checks and Status Endpoints

Implement health check endpoints to monitor scraper availability and basic functionality:

package main

import (
    "encoding/json"
    "net/http"
    "sync"
    "time"
)

type HealthChecker struct {
    scrapers   map[string]*ScraperStatus
    mu         sync.RWMutex
    httpServer *http.Server
}

type ScraperStatus struct {
    Name            string    `json:"name"`
    Status          string    `json:"status"`
    LastHealthCheck time.Time `json:"last_health_check"`
    LastError       string    `json:"last_error,omitempty"`
    RequestCount    int64     `json:"request_count"`
    ErrorCount      int64     `json:"error_count"`
    UptimePercent   float64   `json:"uptime_percent"`
}

func NewHealthChecker() *HealthChecker {
    hc := &HealthChecker{
        scrapers: make(map[string]*ScraperStatus),
    }

    mux := http.NewServeMux()
    mux.HandleFunc("/health", hc.HealthHandler)
    mux.HandleFunc("/health/detailed", hc.DetailedHealthHandler)
    mux.HandleFunc("/metrics", hc.MetricsHandler)

    hc.httpServer = &http.Server{
        Addr:    ":8080",
        Handler: mux,
    }

    return hc
}

func (hc *HealthChecker) HealthHandler(w http.ResponseWriter, r *http.Request) {
    hc.mu.RLock()
    defer hc.mu.RUnlock()

    allHealthy := true
    for _, status := range hc.scrapers {
        if status.Status != "healthy" {
            allHealthy = false
            break
        }
    }

    w.Header().Set("Content-Type", "application/json")
    if allHealthy {
        w.WriteHeader(http.StatusOK)
        json.NewEncoder(w).Encode(map[string]string{
            "status": "healthy",
            "timestamp": time.Now().Format(time.RFC3339),
        })
    } else {
        w.WriteHeader(http.StatusServiceUnavailable)
        json.NewEncoder(w).Encode(map[string]string{
            "status": "unhealthy",
            "timestamp": time.Now().Format(time.RFC3339),
        })
    }
}

func (hc *HealthChecker) DetailedHealthHandler(w http.ResponseWriter, r *http.Request) {
    hc.mu.RLock()
    defer hc.mu.RUnlock()

    w.Header().Set("Content-Type", "application/json")
    json.NewEncoder(w).Encode(map[string]interface{}{
        "scrapers": hc.scrapers,
        "timestamp": time.Now().Format(time.RFC3339),
    })
}

func (hc *HealthChecker) UpdateScraperStatus(name string, healthy bool, errorMsg string) {
    hc.mu.Lock()
    defer hc.mu.Unlock()

    status := hc.scrapers[name]
    if status == nil {
        status = &ScraperStatus{
            Name: name,
        }
        hc.scrapers[name] = status
    }

    status.LastHealthCheck = time.Now()
    status.RequestCount++

    if healthy {
        status.Status = "healthy"
        status.LastError = ""
    } else {
        status.Status = "unhealthy"
        status.ErrorCount++
        status.LastError = errorMsg
    }

    // Calculate uptime percentage
    if status.RequestCount > 0 {
        status.UptimePercent = float64(status.RequestCount-status.ErrorCount) / float64(status.RequestCount) * 100
    }
}
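
Once the server is listening on :8080, you can spot-check the endpoint manually; the response shape follows the handler above:

# Quick manual check of the health endpoint
curl http://localhost:8080/health
# {"status":"healthy","timestamp":"<RFC3339 timestamp>"}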

2. Performance Metrics Collection

Track key performance indicators and scraping metrics:

package main

import (
    "sync"
    "time"
)

type MetricsCollector struct {
    mu                  sync.RWMutex
    TotalRequests       int64     `json:"total_requests"`
    SuccessfulRequests  int64     `json:"successful_requests"`
    FailedRequests      int64     `json:"failed_requests"`
    TotalResponseTime   int64     `json:"total_response_time_ms"`
    AverageResponseTime float64   `json:"average_response_time_ms"`
    RequestsPerSecond   float64   `json:"requests_per_second"`
    StartTime           time.Time `json:"start_time"`
    LastRequestTime     time.Time `json:"last_request_time"`
    BytesDownloaded     int64     `json:"bytes_downloaded"`
    PagesScraped        int64     `json:"pages_scraped"`
    DataPointsExtracted int64     `json:"data_points_extracted"`
}

func NewMetricsCollector() *MetricsCollector {
    return &MetricsCollector{
        StartTime: time.Now(),
    }
}

func (mc *MetricsCollector) RecordRequest(duration time.Duration, success bool, bytesReceived int64) {
    mc.mu.Lock()
    defer mc.mu.Unlock()

    mc.TotalRequests++
    mc.LastRequestTime = time.Now()
    mc.TotalResponseTime += duration.Milliseconds()
    mc.BytesDownloaded += bytesReceived

    if success {
        mc.SuccessfulRequests++
        mc.PagesScraped++
    } else {
        mc.FailedRequests++
    }

    // Calculate derived metrics
    mc.AverageResponseTime = float64(mc.TotalResponseTime) / float64(mc.TotalRequests)

    uptime := time.Since(mc.StartTime)
    if uptime.Seconds() > 0 {
        mc.RequestsPerSecond = float64(mc.TotalRequests) / uptime.Seconds()
    }
}

func (mc *MetricsCollector) RecordDataExtraction(dataPoints int64) {
    mc.mu.Lock()
    defer mc.mu.Unlock()
    mc.DataPointsExtracted += dataPoints
}

func (mc *MetricsCollector) GetMetrics() MetricsSnapshot {
    mc.mu.RLock()
    defer mc.mu.RUnlock()

    uptime := time.Since(mc.StartTime)
    errorRate := float64(0)
    if mc.TotalRequests > 0 {
        errorRate = float64(mc.FailedRequests) / float64(mc.TotalRequests) * 100
    }

    return MetricsSnapshot{
        TotalRequests:       mc.TotalRequests,
        SuccessfulRequests:  mc.SuccessfulRequests,
        FailedRequests:      mc.FailedRequests,
        ErrorRate:           errorRate,
        AverageResponseTime: mc.AverageResponseTime,
        RequestsPerSecond:   mc.RequestsPerSecond,
        UptimeSeconds:       uptime.Seconds(),
        BytesDownloaded:     mc.BytesDownloaded,
        PagesScraped:        mc.PagesScraped,
        DataPointsExtracted: mc.DataPointsExtracted,
        Timestamp:           time.Now(),
    }
}

type MetricsSnapshot struct {
    TotalRequests       int64     `json:"total_requests"`
    SuccessfulRequests  int64     `json:"successful_requests"`
    FailedRequests      int64     `json:"failed_requests"`
    ErrorRate           float64   `json:"error_rate_percent"`
    AverageResponseTime float64   `json:"average_response_time_ms"`
    RequestsPerSecond   float64   `json:"requests_per_second"`
    UptimeSeconds       float64   `json:"uptime_seconds"`
    BytesDownloaded     int64     `json:"bytes_downloaded"`
    PagesScraped        int64     `json:"pages_scraped"`
    DataPointsExtracted int64     `json:"data_points_extracted"`
    Timestamp           time.Time `json:"timestamp"`
}
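
If you also want these counters available over HTTP alongside the health endpoints, a minimal handler sketch (the /metrics/json path and the mux wiring are illustrative assumptions, not part of the code above):

// MetricsJSONHandler serves the current snapshot as JSON. Register it on a mux
// of your choice, e.g. mux.HandleFunc("/metrics/json", mc.MetricsJSONHandler).
// Assumes "encoding/json" and "net/http" are added to this file's imports.
func (mc *MetricsCollector) MetricsJSONHandler(w http.ResponseWriter, r *http.Request) {
    w.Header().Set("Content-Type", "application/json")
    json.NewEncoder(w).Encode(mc.GetMetrics())
}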

3. Resource Utilization Monitoring

Monitor system resources to prevent performance degradation:

package main

import (
    "context"
    "runtime"
    "sync"
    "time"
)

type ResourceMonitor struct {
    ctx    context.Context
    cancel context.CancelFunc
    mu     sync.RWMutex
    stats  *ResourceStats
}

type ResourceStats struct {
    MemoryUsage     MemoryStats   `json:"memory"`
    GoroutineCount  int           `json:"goroutine_count"`
    GCStats         GCStats       `json:"gc_stats"`
    CPUCount        int           `json:"cpu_count"`
    Timestamp       time.Time     `json:"timestamp"`
}

type MemoryStats struct {
    AllocMB      float64 `json:"alloc_mb"`
    TotalAllocMB float64 `json:"total_alloc_mb"`
    SysMB        float64 `json:"sys_mb"`
    HeapMB       float64 `json:"heap_mb"`
    StackMB      float64 `json:"stack_mb"`
}

type GCStats struct {
    NumGC        uint32  `json:"num_gc"`
    PauseTotalNs uint64  `json:"pause_total_ns"`
    PauseAvgMs   float64 `json:"pause_avg_ms"`
    NextGCMB     float64 `json:"next_gc_mb"`
}

func NewResourceMonitor() *ResourceMonitor {
    ctx, cancel := context.WithCancel(context.Background())
    rm := &ResourceMonitor{
        ctx:    ctx,
        cancel: cancel,
        stats:  &ResourceStats{},
    }

    go rm.collectStats()
    return rm
}

func (rm *ResourceMonitor) collectStats() {
    ticker := time.NewTicker(30 * time.Second)
    defer ticker.Stop()

    for {
        select {
        case <-rm.ctx.Done():
            return
        case <-ticker.C:
            rm.updateStats()
        }
    }
}

func (rm *ResourceMonitor) updateStats() {
    var m runtime.MemStats
    runtime.ReadMemStats(&m)

    // Guard against division by zero before the first GC cycle has run
    pauseAvgMs := 0.0
    if m.NumGC > 0 {
        pauseAvgMs = float64(m.PauseTotalNs) / float64(m.NumGC) / 1e6
    }

    rm.mu.Lock()
    defer rm.mu.Unlock()

    rm.stats = &ResourceStats{
        MemoryUsage: MemoryStats{
            AllocMB:      float64(m.Alloc) / 1024 / 1024,
            TotalAllocMB: float64(m.TotalAlloc) / 1024 / 1024,
            SysMB:        float64(m.Sys) / 1024 / 1024,
            HeapMB:       float64(m.HeapAlloc) / 1024 / 1024,
            StackMB:      float64(m.StackSys) / 1024 / 1024,
        },
        GoroutineCount: runtime.NumGoroutine(),
        GCStats: GCStats{
            NumGC:        m.NumGC,
            PauseTotalNs: m.PauseTotalNs,
            PauseAvgMs:   pauseAvgMs,
            NextGCMB:     float64(m.NextGC) / 1024 / 1024,
        },
        CPUCount:  runtime.NumCPU(),
        Timestamp: time.Now(),
    }
}

func (rm *ResourceMonitor) GetStats() *ResourceStats {
    rm.mu.RLock()
    defer rm.mu.RUnlock()
    return rm.stats
}

func (rm *ResourceMonitor) Stop() {
    rm.cancel()
}

4. Alert System Implementation

Create an alerting system for critical issues:

package main

import (
    "fmt"
    "log"
    "time"
)

type AlertManager struct {
    thresholds   AlertThresholds
    lastAlerts   map[string]time.Time
    cooldownTime time.Duration
}

type AlertThresholds struct {
    ErrorRatePercent    float64 `json:"error_rate_percent"`
    ResponseTimeMs      float64 `json:"response_time_ms"`
    MemoryUsageMB       float64 `json:"memory_usage_mb"`
    GoroutineCount      int     `json:"goroutine_count"`
    ConsecutiveFailures int     `json:"consecutive_failures"`
}

type Alert struct {
    Type        string    `json:"type"`
    Message     string    `json:"message"`
    Severity    string    `json:"severity"`
    Timestamp   time.Time `json:"timestamp"`
    MetricValue float64   `json:"metric_value"`
    Threshold   float64   `json:"threshold"`
}

func NewAlertManager() *AlertManager {
    return &AlertManager{
        thresholds: AlertThresholds{
            ErrorRatePercent:    10.0,  // Alert if error rate > 10%
            ResponseTimeMs:      5000,  // Alert if avg response time > 5s
            MemoryUsageMB:       500,   // Alert if memory usage > 500MB
            GoroutineCount:      1000,  // Alert if goroutines > 1000
            ConsecutiveFailures: 5,     // Alert after 5 consecutive failures
        },
        lastAlerts:   make(map[string]time.Time),
        cooldownTime: 15 * time.Minute, // Don't spam alerts
    }
}

func (am *AlertManager) CheckMetrics(metrics MetricsSnapshot, resources *ResourceStats) []Alert {
    var alerts []Alert

    // Check error rate
    if metrics.ErrorRate > am.thresholds.ErrorRatePercent {
        if alert := am.createAlert("HIGH_ERROR_RATE", 
            fmt.Sprintf("Error rate %.2f%% exceeds threshold %.2f%%", 
                metrics.ErrorRate, am.thresholds.ErrorRatePercent),
            "critical", metrics.ErrorRate, am.thresholds.ErrorRatePercent); alert != nil {
            alerts = append(alerts, *alert)
        }
    }

    // Check response time
    if metrics.AverageResponseTime > am.thresholds.ResponseTimeMs {
        if alert := am.createAlert("SLOW_RESPONSE", 
            fmt.Sprintf("Average response time %.2fms exceeds threshold %.2fms", 
                metrics.AverageResponseTime, am.thresholds.ResponseTimeMs),
            "warning", metrics.AverageResponseTime, am.thresholds.ResponseTimeMs); alert != nil {
            alerts = append(alerts, *alert)
        }
    }

    // Check memory usage
    if resources != nil && resources.MemoryUsage.AllocMB > am.thresholds.MemoryUsageMB {
        if alert := am.createAlert("HIGH_MEMORY_USAGE", 
            fmt.Sprintf("Memory usage %.2fMB exceeds threshold %.2fMB", 
                resources.MemoryUsage.AllocMB, am.thresholds.MemoryUsageMB),
            "warning", resources.MemoryUsage.AllocMB, am.thresholds.MemoryUsageMB); alert != nil {
            alerts = append(alerts, *alert)
        }
    }

    return alerts
}

func (am *AlertManager) createAlert(alertType, message, severity string, value, threshold float64) *Alert {
    // Check cooldown period
    if lastAlert, exists := am.lastAlerts[alertType]; exists {
        if time.Since(lastAlert) < am.cooldownTime {
            return nil // Still in cooldown period
        }
    }

    am.lastAlerts[alertType] = time.Now()

    return &Alert{
        Type:        alertType,
        Message:     message,
        Severity:    severity,
        Timestamp:   time.Now(),
        MetricValue: value,
        Threshold:   threshold,
    }
}

func (am *AlertManager) SendAlert(alert Alert) {
    // Implement your alerting mechanism here
    // Could be email, Slack, PagerDuty, etc.
    log.Printf("ALERT [%s]: %s", alert.Severity, alert.Message)
}
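
SendAlert above only logs. As one concrete delivery option, here is a sketch that posts the alert to a Slack-style incoming webhook; the SLACK_WEBHOOK_URL environment variable and the payload shape are assumptions for illustration, and the extra imports ("bytes", "encoding/json", "net/http", "os") would need to be added:

// SendAlertToWebhook posts the alert text to a webhook URL taken from the
// (hypothetical) SLACK_WEBHOOK_URL environment variable.
func (am *AlertManager) SendAlertToWebhook(alert Alert) error {
    webhookURL := os.Getenv("SLACK_WEBHOOK_URL")
    if webhookURL == "" {
        return fmt.Errorf("SLACK_WEBHOOK_URL is not set")
    }

    payload, err := json.Marshal(map[string]string{
        "text": fmt.Sprintf("[%s] %s: %s", alert.Severity, alert.Type, alert.Message),
    })
    if err != nil {
        return err
    }

    resp, err := http.Post(webhookURL, "application/json", bytes.NewReader(payload))
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    if resp.StatusCode >= 300 {
        return fmt.Errorf("webhook returned status %d", resp.StatusCode)
    }
    return nil
}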

5. Integration with Prometheus and Grafana

For advanced monitoring, integrate with Prometheus metrics:

package main

import (
    "log"
    "net/http"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promhttp"
)

type PrometheusMetrics struct {
    httpRequestsTotal     *prometheus.CounterVec
    httpRequestDuration   *prometheus.HistogramVec
    scrapingErrors        *prometheus.CounterVec
    pagesScraped          prometheus.Counter
    dataPointsExtracted   prometheus.Counter
    memoryUsage           prometheus.Gauge
    goroutineCount        prometheus.Gauge
}

func NewPrometheusMetrics() *PrometheusMetrics {
    pm := &PrometheusMetrics{
        httpRequestsTotal: prometheus.NewCounterVec(
            prometheus.CounterOpts{
                Name: "scraper_http_requests_total",
                Help: "Total number of HTTP requests made by the scraper",
            },
            []string{"method", "status_code", "url_host"},
        ),
        httpRequestDuration: prometheus.NewHistogramVec(
            prometheus.HistogramOpts{
                Name:    "scraper_http_request_duration_seconds",
                Help:    "HTTP request duration in seconds",
                Buckets: prometheus.DefBuckets,
            },
            []string{"method", "url_host"},
        ),
        scrapingErrors: prometheus.NewCounterVec(
            prometheus.CounterOpts{
                Name: "scraper_errors_total",
                Help: "Total number of scraping errors",
            },
            []string{"error_type", "url_host"},
        ),
        pagesScraped: prometheus.NewCounter(
            prometheus.CounterOpts{
                Name: "scraper_pages_scraped_total",
                Help: "Total number of pages successfully scraped",
            },
        ),
        dataPointsExtracted: prometheus.NewCounter(
            prometheus.CounterOpts{
                Name: "scraper_data_points_extracted_total",
                Help: "Total number of data points extracted",
            },
        ),
        memoryUsage: prometheus.NewGauge(
            prometheus.GaugeOpts{
                Name: "scraper_memory_usage_bytes",
                Help: "Current memory usage in bytes",
            },
        ),
        goroutineCount: prometheus.NewGauge(
            prometheus.GaugeOpts{
                Name: "scraper_goroutines_active",
                Help: "Number of active goroutines",
            },
        ),
    }

    // Register metrics
    prometheus.MustRegister(
        pm.httpRequestsTotal,
        pm.httpRequestDuration,
        pm.scrapingErrors,
        pm.pagesScraped,
        pm.dataPointsExtracted,
        pm.memoryUsage,
        pm.goroutineCount,
    )

    return pm
}

func (pm *PrometheusMetrics) RecordHTTPRequest(method, statusCode, host string, duration float64) {
    pm.httpRequestsTotal.WithLabelValues(method, statusCode, host).Inc()
    pm.httpRequestDuration.WithLabelValues(method, host).Observe(duration)
}

func (pm *PrometheusMetrics) RecordScrapingError(errorType, host string) {
    pm.scrapingErrors.WithLabelValues(errorType, host).Inc()
}

func (pm *PrometheusMetrics) StartMetricsServer(addr string) {
    http.Handle("/metrics", promhttp.Handler())
    go func() {
        if err := http.ListenAndServe(addr, nil); err != nil {
            log.Printf("metrics server stopped: %v", err)
        }
    }()
}
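
The memoryUsage and goroutineCount gauges are registered above but never updated in this example. One way to keep them current is a small background collector; this is a sketch that assumes "runtime" and "time" are added to the imports, and the interval is up to you:

// StartRuntimeCollector periodically refreshes the runtime gauges.
// Call it once after NewPrometheusMetrics().
func (pm *PrometheusMetrics) StartRuntimeCollector(interval time.Duration) {
    go func() {
        ticker := time.NewTicker(interval)
        defer ticker.Stop()
        for range ticker.C {
            var m runtime.MemStats
            runtime.ReadMemStats(&m)
            pm.memoryUsage.Set(float64(m.Alloc)) // bytes currently allocated on the heap
            pm.goroutineCount.Set(float64(runtime.NumGoroutine()))
        }
    }()
}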

Complete Monitoring Setup Example

package main

import (
    "fmt"
    "log"
    "net/http"
    "net/url"
    "time"
)

type MonitoredScraper struct {
    healthChecker     *HealthChecker
    metricsCollector  *MetricsCollector
    resourceMonitor   *ResourceMonitor
    alertManager      *AlertManager
    prometheusMetrics *PrometheusMetrics
    client            *http.Client
}

func NewMonitoredScraper() *MonitoredScraper {
    ms := &MonitoredScraper{
        healthChecker:     NewHealthChecker(),
        metricsCollector:  NewMetricsCollector(),
        resourceMonitor:   NewResourceMonitor(),
        alertManager:      NewAlertManager(),
        prometheusMetrics: NewPrometheusMetrics(),
        client:            &http.Client{Timeout: 30 * time.Second},
    }

    // Start monitoring services
    go func() {
        if err := ms.healthChecker.httpServer.ListenAndServe(); err != nil && err != http.ErrServerClosed {
            log.Printf("health check server stopped: %v", err)
        }
    }()
    ms.prometheusMetrics.StartMetricsServer(":9090")
    go ms.runPeriodicChecks()

    return ms
}

func (ms *MonitoredScraper) ScrapeURL(targetURL string) error {
    start := time.Now()

    // Make HTTP request
    resp, err := ms.client.Get(targetURL)
    duration := time.Since(start)

    // Update metrics
    success := err == nil && resp != nil && resp.StatusCode == http.StatusOK
    bytesReceived := int64(0)

    if resp != nil {
        if resp.ContentLength > 0 { // ContentLength is -1 when the server does not report it
            bytesReceived = resp.ContentLength
        }
        resp.Body.Close()
    }

    ms.metricsCollector.RecordRequest(duration, success, bytesReceived)
    ms.healthChecker.UpdateScraperStatus("main_scraper", success, 
        func() string {
            if err != nil {
                return err.Error()
            }
            return ""
        }())

    // Update Prometheus metrics
    statusCode := "500"
    if resp != nil {
        statusCode = fmt.Sprintf("%d", resp.StatusCode)
    }

    ms.prometheusMetrics.RecordHTTPRequest("GET", statusCode, 
        extractHost(targetURL), duration.Seconds())

    if success {
        // Simulate data extraction
        ms.metricsCollector.RecordDataExtraction(5)
    } else {
        ms.prometheusMetrics.RecordScrapingError("http_error", extractHost(targetURL))
    }

    return err
}

func (ms *MonitoredScraper) runPeriodicChecks() {
    ticker := time.NewTicker(1 * time.Minute)
    defer ticker.Stop()

    for range ticker.C {
        // Get current metrics and resource stats
        metrics := ms.metricsCollector.GetMetrics()
        resources := ms.resourceMonitor.GetStats()

        // Check for alerts
        alerts := ms.alertManager.CheckMetrics(metrics, resources)
        for _, alert := range alerts {
            ms.alertManager.SendAlert(alert)
        }

        log.Printf("Monitoring check completed - Requests: %d, Errors: %d, Memory: %.2fMB", 
            metrics.TotalRequests, metrics.FailedRequests, resources.MemoryUsage.AllocMB)
    }
}

func extractHost(rawURL string) string {
    parsedURL, err := url.Parse(rawURL)
    if err != nil {
        return "unknown"
    }
    return parsedURL.Host
}

func main() {
    scraper := NewMonitoredScraper()

    // Example scraping loop
    urls := []string{
        "https://example.com/page1",
        "https://example.com/page2", 
        "https://example.com/page3",
    }

    for _, targetURL := range urls {
        err := scraper.ScrapeURL(targetURL)
        if err != nil {
            log.Printf("Failed to scrape %s: %v", targetURL, err)
        }

        time.Sleep(1 * time.Second) // Rate limiting
    }

    // Keep monitoring running
    select {}
}

Deployment and Infrastructure

# Install monitoring dependencies
go get github.com/prometheus/client_golang/prometheus
go get github.com/prometheus/client_golang/prometheus/promhttp

# Example prometheus.yml configuration
global:
  scrape_interval: 15s

scrape_configs:
  - job_name: 'go-scraper'
    static_configs:
      - targets: ['localhost:9090']
    scrape_interval: 5s
    metrics_path: /metrics
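
With the counters defined above, alerting can also live in Prometheus itself. A sketch of a rules file using the scraper_* metrics from earlier (the 10% threshold and 5-minute window are illustrative):

# Example Prometheus alerting rule
groups:
  - name: go-scraper-alerts
    rules:
      - alert: HighScraperErrorRate
        expr: sum(rate(scraper_errors_total[5m])) / sum(rate(scraper_http_requests_total[5m])) > 0.10
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Scraper error rate above 10% for 5 minutes"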

Best Practices for Go Scraper Monitoring

  1. Implement Graceful Degradation: When monitoring services fail, ensure your scraper continues operating
  2. Use Circuit Breakers: Implement circuit breaker patterns to prevent cascade failures (a minimal sketch follows this list)
  3. Monitor External Dependencies: Track the health and performance of target websites
  4. Set Up Log Aggregation: Centralize logs for easier debugging and analysis using structured logging approaches
  5. Implement Distributed Tracing: For complex scrapers, use tools like Jaeger for request tracing
  6. Regular Health Checks: Implement periodic self-tests to verify scraper functionality
  7. Monitor Data Quality: Track extracted data completeness and validity alongside proper error handling practices
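
For item 2, a minimal circuit-breaker sketch; the failure threshold and reset timeout are illustrative, and you would typically wrap ScrapeURL calls in cb.Do:

package main

import (
    "errors"
    "sync"
    "time"
)

// CircuitBreaker rejects calls after too many consecutive failures and
// allows a retry only once the reset timeout has elapsed.
type CircuitBreaker struct {
    mu           sync.Mutex
    failures     int
    maxFailures  int
    resetTimeout time.Duration
    openedAt     time.Time
}

var ErrCircuitOpen = errors.New("circuit breaker open")

func NewCircuitBreaker(maxFailures int, resetTimeout time.Duration) *CircuitBreaker {
    return &CircuitBreaker{maxFailures: maxFailures, resetTimeout: resetTimeout}
}

// Do runs fn unless the breaker is open. Success resets the failure count;
// failure increments it and (re)opens the breaker at the threshold.
func (cb *CircuitBreaker) Do(fn func() error) error {
    cb.mu.Lock()
    if cb.failures >= cb.maxFailures && time.Since(cb.openedAt) < cb.resetTimeout {
        cb.mu.Unlock()
        return ErrCircuitOpen
    }
    cb.mu.Unlock()

    err := fn()

    cb.mu.Lock()
    defer cb.mu.Unlock()
    if err != nil {
        cb.failures++
        if cb.failures >= cb.maxFailures {
            cb.openedAt = time.Now()
        }
        return err
    }
    cb.failures = 0
    return nil
}

For example, cb := NewCircuitBreaker(5, 30*time.Second) followed by cb.Do(func() error { return scraper.ScrapeURL(target) }) stops hammering a target that keeps failing.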

Integration with External Monitoring Services

For production deployments, consider integrating with external monitoring services:

# Example Docker Compose setup
version: '3.8'
services:
  scraper:
    build: .
    ports:
      - "8080:8080"  # Health checks
      - "9090:9090"  # Prometheus metrics
    environment:
      - ENVIRONMENT=production

  prometheus:
    image: prom/prometheus
    ports:
      - "9091:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml

  grafana:
    image: grafana/grafana
    ports:
      - "3000:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin
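
One detail when running Prometheus inside this Compose network: the scrape target must reference the scraper service by its Compose service name rather than localhost. A minimal adjustment to prometheus.yml:

scrape_configs:
  - job_name: 'go-scraper'
    static_configs:
      - targets: ['scraper:9090']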

Monitoring Concurrent Scraping Operations

When implementing concurrent web scraping in Go, monitor goroutine pools and resource usage:

func (ms *MonitoredScraper) MonitorConcurrentScraping(urls []string, maxConcurrency int) {
    semaphore := make(chan struct{}, maxConcurrency)
    var wg sync.WaitGroup // requires "sync" in this file's imports

    for _, targetURL := range urls {
        wg.Add(1)
        go func(u string) {
            defer wg.Done()
            semaphore <- struct{}{}        // Acquire a slot
            defer func() { <-semaphore }() // Release the slot

            // Track concurrent operation metrics
            start := time.Now()
            err := ms.ScrapeURL(u)
            duration := time.Since(start)

            // Log concurrent operation results
            if err != nil {
                log.Printf("Concurrent scrape failed for %s after %v: %v", u, duration, err)
                return
            }
            log.Printf("Concurrent scrape completed for %s in %v", u, duration)
        }(targetURL)
    }

    wg.Wait()
}

Comprehensive monitoring transforms your Go web scraping applications from black boxes into observable, maintainable systems. By implementing health checks, metrics collection, alerting, and integration with modern monitoring tools, you can ensure reliable data collection and quickly respond to issues before they impact your operations.

Remember to balance monitoring overhead with scraper performance, and always monitor both technical metrics (response times, error rates) and business metrics (data quality, extraction rates) for a complete picture of your scraping application's health.

Try WebScraping.AI for Your Web Scraping Needs

Looking for a powerful web scraping solution? WebScraping.AI provides an LLM-powered API that combines Chromium JavaScript rendering with rotating proxies for reliable data extraction.

Key Features:

  • AI-powered extraction: Ask questions about web pages or extract structured data fields
  • JavaScript rendering: Full Chromium browser support for dynamic content
  • Rotating proxies: Datacenter and residential proxies from multiple countries
  • Easy integration: Simple REST API with SDKs for Python, Ruby, PHP, and more
  • Reliable & scalable: Built for developers who need consistent results

Getting Started:

Get page content with AI analysis:

curl "https://api.webscraping.ai/ai/question?url=https://example.com&question=What is the main topic?&api_key=YOUR_API_KEY"

Extract structured data:

curl "https://api.webscraping.ai/ai/fields?url=https://example.com&fields[title]=Page title&fields[price]=Product price&api_key=YOUR_API_KEY"
