What is the best way to monitor Go web scraping applications?
Monitoring Go web scraping applications is essential for maintaining reliability, performance, and data quality. Effective monitoring means tracking performance metrics, health status, error rates, and resource utilization so that your scrapers run efficiently and you can catch issues before they affect data collection.
Core Monitoring Components
1. Health Checks and Status Endpoints
Implement health check endpoints to monitor scraper availability and basic functionality:
package main
import (
"encoding/json"
"net/http"
"sync"
"time"
)
type HealthChecker struct {
scrapers map[string]*ScraperStatus
mu sync.RWMutex
httpServer *http.Server
}
type ScraperStatus struct {
Name string `json:"name"`
Status string `json:"status"`
LastHealthCheck time.Time `json:"last_health_check"`
LastError string `json:"last_error,omitempty"`
RequestCount int64 `json:"request_count"`
ErrorCount int64 `json:"error_count"`
UptimePercent float64 `json:"uptime_percent"`
}
func NewHealthChecker() *HealthChecker {
hc := &HealthChecker{
scrapers: make(map[string]*ScraperStatus),
}
mux := http.NewServeMux()
mux.HandleFunc("/health", hc.HealthHandler)
mux.HandleFunc("/health/detailed", hc.DetailedHealthHandler)
mux.HandleFunc("/metrics", hc.MetricsHandler)
hc.httpServer = &http.Server{
Addr: ":8080",
Handler: mux,
}
return hc
}
func (hc *HealthChecker) HealthHandler(w http.ResponseWriter, r *http.Request) {
hc.mu.RLock()
defer hc.mu.RUnlock()
allHealthy := true
for _, status := range hc.scrapers {
if status.Status != "healthy" {
allHealthy = false
break
}
}
w.Header().Set("Content-Type", "application/json")
if allHealthy {
w.WriteHeader(http.StatusOK)
json.NewEncoder(w).Encode(map[string]string{
"status": "healthy",
"timestamp": time.Now().Format(time.RFC3339),
})
} else {
w.WriteHeader(http.StatusServiceUnavailable)
json.NewEncoder(w).Encode(map[string]string{
"status": "unhealthy",
"timestamp": time.Now().Format(time.RFC3339),
})
}
}
func (hc *HealthChecker) DetailedHealthHandler(w http.ResponseWriter, r *http.Request) {
hc.mu.RLock()
defer hc.mu.RUnlock()
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(map[string]interface{}{
"scrapers": hc.scrapers,
"timestamp": time.Now().Format(time.RFC3339),
})
}
func (hc *HealthChecker) UpdateScraperStatus(name string, healthy bool, errorMsg string) {
hc.mu.Lock()
defer hc.mu.Unlock()
status := hc.scrapers[name]
if status == nil {
status = &ScraperStatus{
Name: name,
}
hc.scrapers[name] = status
}
status.LastHealthCheck = time.Now()
status.RequestCount++
if healthy {
status.Status = "healthy"
status.LastError = ""
} else {
status.Status = "unhealthy"
status.ErrorCount++
status.LastError = errorMsg
}
// Calculate success rate (exposed as uptime_percent)
if status.RequestCount > 0 {
status.UptimePercent = float64(status.RequestCount-status.ErrorCount) / float64(status.RequestCount) * 100
}
}
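The server above is constructed but not started here (the complete example later calls ListenAndServe directly). As a rough sketch, you could wrap startup and graceful shutdown in helper methods like the ones below; the method names and the five-second timeout are illustrative, and they assume "context" and "log" are added to the import block:

// Start runs the health check server in the background.
func (hc *HealthChecker) Start() {
    go func() {
        if err := hc.httpServer.ListenAndServe(); err != nil && err != http.ErrServerClosed {
            log.Printf("health check server stopped: %v", err)
        }
    }()
}

// Stop shuts the server down gracefully, waiting briefly for in-flight requests.
func (hc *HealthChecker) Stop() error {
    ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
    defer cancel()
    return hc.httpServer.Shutdown(ctx)
}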
2. Performance Metrics Collection
Track key performance indicators and scraping metrics:
package main
import (
"sync"
"time"
)
type MetricsCollector struct {
mu sync.RWMutex
TotalRequests int64 `json:"total_requests"`
SuccessfulRequests int64 `json:"successful_requests"`
FailedRequests int64 `json:"failed_requests"`
TotalResponseTime int64 `json:"total_response_time_ms"`
AverageResponseTime float64 `json:"average_response_time_ms"`
RequestsPerSecond float64 `json:"requests_per_second"`
StartTime time.Time `json:"start_time"`
LastRequestTime time.Time `json:"last_request_time"`
BytesDownloaded int64 `json:"bytes_downloaded"`
PagesScraped int64 `json:"pages_scraped"`
DataPointsExtracted int64 `json:"data_points_extracted"`
}
func NewMetricsCollector() *MetricsCollector {
return &MetricsCollector{
StartTime: time.Now(),
}
}
func (mc *MetricsCollector) RecordRequest(duration time.Duration, success bool, bytesReceived int64) {
mc.mu.Lock()
defer mc.mu.Unlock()
mc.TotalRequests++
mc.LastRequestTime = time.Now()
mc.TotalResponseTime += duration.Milliseconds()
mc.BytesDownloaded += bytesReceived
if success {
mc.SuccessfulRequests++
mc.PagesScraped++
} else {
mc.FailedRequests++
}
// Calculate derived metrics
mc.AverageResponseTime = float64(mc.TotalResponseTime) / float64(mc.TotalRequests)
uptime := time.Since(mc.StartTime)
if uptime.Seconds() > 0 {
mc.RequestsPerSecond = float64(mc.TotalRequests) / uptime.Seconds()
}
}
func (mc *MetricsCollector) RecordDataExtraction(dataPoints int64) {
mc.mu.Lock()
defer mc.mu.Unlock()
mc.DataPointsExtracted += dataPoints
}
func (mc *MetricsCollector) GetMetrics() MetricsSnapshot {
mc.mu.RLock()
defer mc.mu.RUnlock()
uptime := time.Since(mc.StartTime)
errorRate := float64(0)
if mc.TotalRequests > 0 {
errorRate = float64(mc.FailedRequests) / float64(mc.TotalRequests) * 100
}
return MetricsSnapshot{
TotalRequests: mc.TotalRequests,
SuccessfulRequests: mc.SuccessfulRequests,
FailedRequests: mc.FailedRequests,
ErrorRate: errorRate,
AverageResponseTime: mc.AverageResponseTime,
RequestsPerSecond: mc.RequestsPerSecond,
UptimeSeconds: uptime.Seconds(),
BytesDownloaded: mc.BytesDownloaded,
PagesScraped: mc.PagesScraped,
DataPointsExtracted: mc.DataPointsExtracted,
Timestamp: time.Now(),
}
}
type MetricsSnapshot struct {
TotalRequests int64 `json:"total_requests"`
SuccessfulRequests int64 `json:"successful_requests"`
FailedRequests int64 `json:"failed_requests"`
ErrorRate float64 `json:"error_rate_percent"`
AverageResponseTime float64 `json:"average_response_time_ms"`
RequestsPerSecond float64 `json:"requests_per_second"`
UptimeSeconds float64 `json:"uptime_seconds"`
BytesDownloaded int64 `json:"bytes_downloaded"`
PagesScraped int64 `json:"pages_scraped"`
DataPointsExtracted int64 `json:"data_points_extracted"`
Timestamp time.Time `json:"timestamp"`
}
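If you also want these numbers available over HTTP as plain JSON (independent of the Prometheus endpoint shown later), one option is a small handler along these lines; the function name is illustrative and it assumes "encoding/json" and "net/http" are imported:

// metricsHandler serves the current snapshot as JSON, e.g. mounted at /metrics/json.
func metricsHandler(mc *MetricsCollector) http.HandlerFunc {
    return func(w http.ResponseWriter, r *http.Request) {
        w.Header().Set("Content-Type", "application/json")
        json.NewEncoder(w).Encode(mc.GetMetrics())
    }
}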
3. Resource Utilization Monitoring
Monitor system resources to prevent performance degradation:
package main
import (
"context"
"runtime"
"sync"
"time"
)
type ResourceMonitor struct {
ctx context.Context
cancel context.CancelFunc
mu sync.RWMutex
stats *ResourceStats
}
type ResourceStats struct {
MemoryUsage MemoryStats `json:"memory"`
GoroutineCount int `json:"goroutine_count"`
GCStats GCStats `json:"gc_stats"`
CPUCount int `json:"cpu_count"`
Timestamp time.Time `json:"timestamp"`
}
type MemoryStats struct {
AllocMB float64 `json:"alloc_mb"`
TotalAllocMB float64 `json:"total_alloc_mb"`
SysMB float64 `json:"sys_mb"`
HeapMB float64 `json:"heap_mb"`
StackMB float64 `json:"stack_mb"`
}
type GCStats struct {
NumGC uint32 `json:"num_gc"`
PauseTotalNs uint64 `json:"pause_total_ns"`
PauseAvgMs float64 `json:"pause_avg_ms"`
NextGCMB float64 `json:"next_gc_mb"`
}
func NewResourceMonitor() *ResourceMonitor {
ctx, cancel := context.WithCancel(context.Background())
rm := &ResourceMonitor{
ctx: ctx,
cancel: cancel,
stats: &ResourceStats{},
}
go rm.collectStats()
return rm
}
func (rm *ResourceMonitor) collectStats() {
ticker := time.NewTicker(30 * time.Second)
defer ticker.Stop()
for {
select {
case <-rm.ctx.Done():
return
case <-ticker.C:
rm.updateStats()
}
}
}
func (rm *ResourceMonitor) updateStats() {
var m runtime.MemStats
runtime.ReadMemStats(&m)
// Guard against a divide-by-zero (NaN) before the first GC cycle has run
pauseAvgMs := 0.0
if m.NumGC > 0 {
pauseAvgMs = float64(m.PauseTotalNs) / float64(m.NumGC) / 1e6
}
rm.mu.Lock()
defer rm.mu.Unlock()
rm.stats = &ResourceStats{
MemoryUsage: MemoryStats{
AllocMB: float64(m.Alloc) / 1024 / 1024,
TotalAllocMB: float64(m.TotalAlloc) / 1024 / 1024,
SysMB: float64(m.Sys) / 1024 / 1024,
HeapMB: float64(m.HeapAlloc) / 1024 / 1024,
StackMB: float64(m.StackSys) / 1024 / 1024,
},
GoroutineCount: runtime.NumGoroutine(),
GCStats: GCStats{
NumGC: m.NumGC,
PauseTotalNs: m.PauseTotalNs,
PauseAvgMs: pauseAvgMs,
NextGCMB: float64(m.NextGC) / 1024 / 1024,
},
CPUCount: runtime.NumCPU(),
Timestamp: time.Now(),
}
}
func (rm *ResourceMonitor) GetStats() *ResourceStats {
rm.mu.RLock()
defer rm.mu.RUnlock()
return rm.stats
}
func (rm *ResourceMonitor) Stop() {
rm.cancel()
}
4. Alert System Implementation
Create an alerting system for critical issues:
package main
import (
"fmt"
"log"
"time"
)
type AlertManager struct {
thresholds AlertThresholds
lastAlerts map[string]time.Time
cooldownTime time.Duration
}
type AlertThresholds struct {
ErrorRatePercent float64 `json:"error_rate_percent"`
ResponseTimeMs float64 `json:"response_time_ms"`
MemoryUsageMB float64 `json:"memory_usage_mb"`
GoroutineCount int `json:"goroutine_count"`
ConsecutiveFailures int `json:"consecutive_failures"`
}
type Alert struct {
Type string `json:"type"`
Message string `json:"message"`
Severity string `json:"severity"`
Timestamp time.Time `json:"timestamp"`
MetricValue float64 `json:"metric_value"`
Threshold float64 `json:"threshold"`
}
func NewAlertManager() *AlertManager {
return &AlertManager{
thresholds: AlertThresholds{
ErrorRatePercent: 10.0, // Alert if error rate > 10%
ResponseTimeMs: 5000, // Alert if avg response time > 5s
MemoryUsageMB: 500, // Alert if memory usage > 500MB
GoroutineCount: 1000, // Alert if goroutines > 1000
ConsecutiveFailures: 5, // Alert after 5 consecutive failures
},
lastAlerts: make(map[string]time.Time),
cooldownTime: 15 * time.Minute, // Don't spam alerts
}
}
func (am *AlertManager) CheckMetrics(metrics MetricsSnapshot, resources *ResourceStats) []Alert {
var alerts []Alert
// Check error rate
if metrics.ErrorRate > am.thresholds.ErrorRatePercent {
if alert := am.createAlert("HIGH_ERROR_RATE",
fmt.Sprintf("Error rate %.2f%% exceeds threshold %.2f%%",
metrics.ErrorRate, am.thresholds.ErrorRatePercent),
"critical", metrics.ErrorRate, am.thresholds.ErrorRatePercent); alert != nil {
alerts = append(alerts, *alert)
}
}
// Check response time
if metrics.AverageResponseTime > am.thresholds.ResponseTimeMs {
if alert := am.createAlert("SLOW_RESPONSE",
fmt.Sprintf("Average response time %.2fms exceeds threshold %.2fms",
metrics.AverageResponseTime, am.thresholds.ResponseTimeMs),
"warning", metrics.AverageResponseTime, am.thresholds.ResponseTimeMs); alert != nil {
alerts = append(alerts, *alert)
}
}
// Check memory usage
if resources != nil && resources.MemoryUsage.AllocMB > am.thresholds.MemoryUsageMB {
if alert := am.createAlert("HIGH_MEMORY_USAGE",
fmt.Sprintf("Memory usage %.2fMB exceeds threshold %.2fMB",
resources.MemoryUsage.AllocMB, am.thresholds.MemoryUsageMB),
"warning", resources.MemoryUsage.AllocMB, am.thresholds.MemoryUsageMB); alert != nil {
alerts = append(alerts, *alert)
}
}
// Check goroutine count
if resources != nil && resources.GoroutineCount > am.thresholds.GoroutineCount {
if alert := am.createAlert("HIGH_GOROUTINE_COUNT",
fmt.Sprintf("Goroutine count %d exceeds threshold %d",
resources.GoroutineCount, am.thresholds.GoroutineCount),
"warning", float64(resources.GoroutineCount), float64(am.thresholds.GoroutineCount)); alert != nil {
alerts = append(alerts, *alert)
}
}
return alerts
}
func (am *AlertManager) createAlert(alertType, message, severity string, value, threshold float64) *Alert {
// Check cooldown period
if lastAlert, exists := am.lastAlerts[alertType]; exists {
if time.Since(lastAlert) < am.cooldownTime {
return nil // Still in cooldown period
}
}
am.lastAlerts[alertType] = time.Now()
return &Alert{
Type: alertType,
Message: message,
Severity: severity,
Timestamp: time.Now(),
MetricValue: value,
Threshold: threshold,
}
}
func (am *AlertManager) SendAlert(alert Alert) {
// Implement your alerting mechanism here
// Could be email, Slack, PagerDuty, etc.
log.Printf("ALERT [%s]: %s", alert.Severity, alert.Message)
}
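SendAlert above only logs. As one possible delivery channel, the sketch below posts the alert to a Slack-style incoming webhook; the function name and the ALERT_WEBHOOK_URL environment variable are assumptions for illustration, and it needs "bytes", "encoding/json", "net/http", and "os" in the imports:

// sendWebhookAlert posts the alert as JSON to a webhook endpoint (sketch only).
func sendWebhookAlert(alert Alert) error {
    webhookURL := os.Getenv("ALERT_WEBHOOK_URL")
    if webhookURL == "" {
        return fmt.Errorf("ALERT_WEBHOOK_URL is not set")
    }
    payload, err := json.Marshal(map[string]string{
        "text": fmt.Sprintf("[%s] %s: %s", alert.Severity, alert.Type, alert.Message),
    })
    if err != nil {
        return err
    }
    resp, err := http.Post(webhookURL, "application/json", bytes.NewReader(payload))
    if err != nil {
        return err
    }
    defer resp.Body.Close()
    if resp.StatusCode >= 300 {
        return fmt.Errorf("webhook returned status %d", resp.StatusCode)
    }
    return nil
}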
5. Integration with Prometheus and Grafana
For advanced monitoring, integrate with Prometheus metrics:
package main
import (
"log"
"net/http"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
)
type PrometheusMetrics struct {
httpRequestsTotal *prometheus.CounterVec
httpRequestDuration *prometheus.HistogramVec
scrapingErrors *prometheus.CounterVec
pagesScraped prometheus.Counter
dataPointsExtracted prometheus.Counter
memoryUsage prometheus.Gauge
goroutineCount prometheus.Gauge
}
func NewPrometheusMetrics() *PrometheusMetrics {
pm := &PrometheusMetrics{
httpRequestsTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "scraper_http_requests_total",
Help: "Total number of HTTP requests made by the scraper",
},
[]string{"method", "status_code", "url_host"},
),
httpRequestDuration: prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "scraper_http_request_duration_seconds",
Help: "HTTP request duration in seconds",
Buckets: prometheus.DefBuckets,
},
[]string{"method", "url_host"},
),
scrapingErrors: prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "scraper_errors_total",
Help: "Total number of scraping errors",
},
[]string{"error_type", "url_host"},
),
pagesScraped: prometheus.NewCounter(
prometheus.CounterOpts{
Name: "scraper_pages_scraped_total",
Help: "Total number of pages successfully scraped",
},
),
dataPointsExtracted: prometheus.NewCounter(
prometheus.CounterOpts{
Name: "scraper_data_points_extracted_total",
Help: "Total number of data points extracted",
},
),
memoryUsage: prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "scraper_memory_usage_bytes",
Help: "Current memory usage in bytes",
},
),
goroutineCount: prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "scraper_goroutines_active",
Help: "Number of active goroutines",
},
),
}
// Register metrics
prometheus.MustRegister(
pm.httpRequestsTotal,
pm.httpRequestDuration,
pm.scrapingErrors,
pm.pagesScraped,
pm.dataPointsExtracted,
pm.memoryUsage,
pm.goroutineCount,
)
return pm
}
func (pm *PrometheusMetrics) RecordHTTPRequest(method, statusCode, host string, duration float64) {
pm.httpRequestsTotal.WithLabelValues(method, statusCode, host).Inc()
pm.httpRequestDuration.WithLabelValues(method, host).Observe(duration)
}
func (pm *PrometheusMetrics) RecordScrapingError(errorType, host string) {
pm.scrapingErrors.WithLabelValues(errorType, host).Inc()
}
func (pm *PrometheusMetrics) StartMetricsServer(addr string) {
http.Handle("/metrics", promhttp.Handler())
go func() {
if err := http.ListenAndServe(addr, nil); err != nil {
log.Printf("Prometheus metrics server stopped: %v", err)
}
}()
}
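Note that the memoryUsage and goroutineCount gauges are registered above but never set. One way to keep them current is a small updater called periodically, for example from the same ticker loop the ResourceMonitor uses; the method name is an assumption and it requires "runtime" in the imports:

// UpdateResourceMetrics refreshes the resource gauges from the Go runtime.
func (pm *PrometheusMetrics) UpdateResourceMetrics() {
    var m runtime.MemStats
    runtime.ReadMemStats(&m)
    pm.memoryUsage.Set(float64(m.Alloc))
    pm.goroutineCount.Set(float64(runtime.NumGoroutine()))
}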
Complete Monitoring Setup Example
package main
import (
"fmt"
"log"
"net/http"
"net/url"
"time"
)
type MonitoredScraper struct {
healthChecker *HealthChecker
metricsCollector *MetricsCollector
resourceMonitor *ResourceMonitor
alertManager *AlertManager
prometheusMetrics *PrometheusMetrics
client *http.Client
}
func NewMonitoredScraper() *MonitoredScraper {
ms := &MonitoredScraper{
healthChecker: NewHealthChecker(),
metricsCollector: NewMetricsCollector(),
resourceMonitor: NewResourceMonitor(),
alertManager: NewAlertManager(),
prometheusMetrics: NewPrometheusMetrics(),
client: &http.Client{Timeout: 30 * time.Second},
}
// Start monitoring services
go func() {
if err := ms.healthChecker.httpServer.ListenAndServe(); err != nil && err != http.ErrServerClosed {
log.Printf("health check server stopped: %v", err)
}
}()
ms.prometheusMetrics.StartMetricsServer(":9090")
go ms.runPeriodicChecks()
return ms
}
func (ms *MonitoredScraper) ScrapeURL(targetURL string) error {
start := time.Now()
// Make HTTP request
resp, err := ms.client.Get(targetURL)
duration := time.Since(start)
// Update metrics
success := err == nil && resp != nil && resp.StatusCode == http.StatusOK
bytesReceived := int64(0)
if resp != nil {
// ContentLength is -1 when the length is unknown, so only count it when set
if resp.ContentLength > 0 {
bytesReceived = resp.ContentLength
}
resp.Body.Close()
}
errMsg := ""
if err != nil {
errMsg = err.Error()
}
ms.metricsCollector.RecordRequest(duration, success, bytesReceived)
ms.healthChecker.UpdateScraperStatus("main_scraper", success, errMsg)
// Update Prometheus metrics
statusCode := "500"
if resp != nil {
statusCode = fmt.Sprintf("%d", resp.StatusCode)
}
ms.prometheusMetrics.RecordHTTPRequest("GET", statusCode,
extractHost(targetURL), duration.Seconds())
if success {
// Simulate data extraction
ms.metricsCollector.RecordDataExtraction(5)
} else {
ms.prometheusMetrics.RecordScrapingError("http_error", extractHost(targetURL))
}
return err
}
func (ms *MonitoredScraper) runPeriodicChecks() {
ticker := time.NewTicker(1 * time.Minute)
defer ticker.Stop()
for range ticker.C {
// Get current metrics and resource stats
metrics := ms.metricsCollector.GetMetrics()
resources := ms.resourceMonitor.GetStats()
// Check for alerts
alerts := ms.alertManager.CheckMetrics(metrics, resources)
for _, alert := range alerts {
ms.alertManager.SendAlert(alert)
}
log.Printf("Monitoring check completed - Requests: %d, Errors: %d, Memory: %.2fMB",
metrics.TotalRequests, metrics.FailedRequests, resources.MemoryUsage.AllocMB)
}
}
func extractHost(rawURL string) string {
parsedURL, err := url.Parse(rawURL)
if err != nil {
return "unknown"
}
return parsedURL.Host
}
func main() {
scraper := NewMonitoredScraper()
// Example scraping loop
urls := []string{
"https://example.com/page1",
"https://example.com/page2",
"https://example.com/page3",
}
for _, targetURL := range urls {
err := scraper.ScrapeURL(targetURL)
if err != nil {
log.Printf("Failed to scrape %s: %v", targetURL, err)
}
time.Sleep(1 * time.Second) // Rate limiting
}
// Keep monitoring running
select {}
}
Deployment and Infrastructure
# Install monitoring dependencies
go get github.com/prometheus/client_golang/prometheus
go get github.com/prometheus/client_golang/prometheus/promhttp
# Example prometheus.yml configuration
global:
  scrape_interval: 15s

scrape_configs:
  - job_name: 'go-scraper'
    static_configs:
      - targets: ['localhost:9090']
    scrape_interval: 5s
    metrics_path: /metrics
Best Practices for Go Scraper Monitoring
- Implement Graceful Degradation: When monitoring services fail, ensure your scraper continues operating
- Use Circuit Breakers: Implement circuit breaker patterns to prevent cascade failures (a minimal sketch follows this list)
- Monitor External Dependencies: Track the health and performance of target websites
- Set Up Log Aggregation: Centralize logs for easier debugging and analysis, ideally using structured logging
- Implement Distributed Tracing: For complex scrapers, use tools like Jaeger for request tracing
- Regular Health Checks: Implement periodic self-tests to verify scraper functionality
- Monitor Data Quality: Track the completeness and validity of extracted data alongside your error handling
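For the circuit breaker item above, here is a minimal sketch of the pattern; the type and field names are illustrative, it assumes "fmt", "sync", and "time" are imported, and production code may be better served by an established library such as sony/gobreaker:

// circuitBreaker trips after maxFailures consecutive errors and stays open for cooldown.
type circuitBreaker struct {
    mu          sync.Mutex
    failures    int
    maxFailures int
    openUntil   time.Time
    cooldown    time.Duration
}

func (cb *circuitBreaker) Call(fn func() error) error {
    cb.mu.Lock()
    if time.Now().Before(cb.openUntil) {
        cb.mu.Unlock()
        return fmt.Errorf("circuit open: skipping call")
    }
    cb.mu.Unlock()

    err := fn()

    cb.mu.Lock()
    defer cb.mu.Unlock()
    if err != nil {
        cb.failures++
        if cb.failures >= cb.maxFailures {
            cb.openUntil = time.Now().Add(cb.cooldown)
            cb.failures = 0
        }
        return err
    }
    cb.failures = 0
    return nil
}

Wrapping each ScrapeURL call in Call would then skip requests to a failing target until the cooldown expires.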
Integration with External Monitoring Services
For production deployments, consider integrating with external monitoring services:
# Example Docker Compose setup
version: '3.8'
services:
  scraper:
    build: .
    ports:
      - "8080:8080"  # Health checks
      - "9090:9090"  # Prometheus metrics
    environment:
      - ENVIRONMENT=production

  prometheus:
    image: prom/prometheus
    ports:
      - "9091:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml

  grafana:
    image: grafana/grafana
    ports:
      - "3000:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin
Monitoring Concurrent Scraping Operations
When implementing concurrent web scraping in Go, monitor goroutine pools and resource usage:
func (ms *MonitoredScraper) MonitorConcurrentScraping(urls []string, maxConcurrency int) {
semaphore := make(chan struct{}, maxConcurrency)
var wg sync.WaitGroup
for _, targetURL := range urls {
wg.Add(1)
go func(url string) {
defer wg.Done()
semaphore <- struct{}{} // Acquire
defer func() { <-semaphore }() // Release
// Track concurrent operation metrics
start := time.Now()
err := ms.ScrapeURL(url)
duration := time.Since(start)
// Log concurrent operation results, including failures
if err != nil {
log.Printf("Concurrent scrape failed for %s after %v: %v", url, duration, err)
return
}
log.Printf("Concurrent scrape completed for %s in %v", url, duration)
}(targetURL)
}
wg.Wait()
}
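To make the pool itself observable rather than relying on per-URL log lines, one option is a dedicated gauge for in-flight workers; the activeWorkers variable and the wrapper method are illustrative and assume the Prometheus client package from section 5 is imported:

// activeWorkers tracks how many scrape goroutines are currently running.
var activeWorkers = prometheus.NewGauge(prometheus.GaugeOpts{
    Name: "scraper_workers_active",
    Help: "Number of scrape workers currently in flight",
})

func init() {
    prometheus.MustRegister(activeWorkers)
}

// scrapeWithWorkerGauge wraps ScrapeURL so the gauge rises and falls with the pool.
func (ms *MonitoredScraper) scrapeWithWorkerGauge(targetURL string) error {
    activeWorkers.Inc()
    defer activeWorkers.Dec()
    return ms.ScrapeURL(targetURL)
}

Calling scrapeWithWorkerGauge from the worker goroutines instead of ScrapeURL directly then exposes pool saturation alongside the other metrics.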
Comprehensive monitoring transforms your Go web scraping applications from black boxes into observable, maintainable systems. By implementing health checks, metrics collection, alerting, and integration with modern monitoring tools, you can ensure reliable data collection and quickly respond to issues before they impact your operations.
Remember to balance monitoring overhead with scraper performance, and always monitor both technical metrics (response times, error rates) and business metrics (data quality, extraction rates) for a complete picture of your scraping application's health.