apiVersion: v1
kind: Service
metadata:
  name: go-scraper-service
spec:
  selector:
    app: go-scraper
  ports:
    - port: 80
      targetPort: 8080
  type: LoadBalancer
```
Cloud Platform Deployments
AWS Deployment Options
AWS ECS (Elastic Container Service):
```
{
  "family": "go-scraper-task",
  "networkMode": "awsvpc",
  "requiresCompatibilities": ["FARGATE"],
  "cpu": "512",
  "memory": "1024",
  "executionRoleArn": "arn:aws:iam::account:role/ecsTaskExecutionRole",
  "containerDefinitions": [
    {
      "name": "scraper",
      "image": "your-account.dkr.ecr.region.amazonaws.com/go-scraper:latest",
      "portMappings": [
        {
          "containerPort": 8080,
          "protocol": "tcp"
        }
      ],
      "environment": [
        {
          "name": "ENV",
          "value": "production"
        }
      ],
      "logConfiguration": {
        "logDriver": "awslogs",
        "options": {
          "awslogs-group": "/ecs/go-scraper",
          "awslogs-region": "us-west-2",
          "awslogs-stream-prefix": "ecs"
        }
      }
    }
  ]
}
```
AWS Lambda for Serverless Scraping:
```
package main

import (
	"context"
	"encoding/json"
	"net/http"

	"github.com/aws/aws-lambda-go/events"
	"github.com/aws/aws-lambda-go/lambda"
)

type ScrapingRequest struct {
	URLs    []string               `json:"urls"`
	Options map[string]interface{} `json:"options"`
}

type ScrapingResponse struct {
	Results []ScrapedData `json:"results"`
	Status  string        `json:"status"`
}

func handleRequest(ctx context.Context, request events.APIGatewayProxyRequest) (events.APIGatewayProxyResponse, error) {
	var req ScrapingRequest
	if err := json.Unmarshal([]byte(request.Body), &req); err != nil {
		return events.APIGatewayProxyResponse{
			StatusCode: http.StatusBadRequest,
			Body:       `{"status":"invalid request body"}`,
		}, nil
	}

	// Perform scraping logic
	results := performScraping(req.URLs, req.Options)

	response := ScrapingResponse{
		Results: results,
		Status:  "success",
	}

	body, err := json.Marshal(response)
	if err != nil {
		return events.APIGatewayProxyResponse{StatusCode: http.StatusInternalServerError}, err
	}

	return events.APIGatewayProxyResponse{
		StatusCode: http.StatusOK,
		Headers: map[string]string{
			"Content-Type":                "application/json",
			"Access-Control-Allow-Origin": "*",
		},
		Body: string(body),
	}, nil
}

func main() {
	lambda.Start(handleRequest)
}
```
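The handler above references a ScrapedData type and a performScraping function that this excerpt does not define. A minimal stub of both, hypothetical and only enough to compile and test the handler locally, might look like this:

```
// ScrapedData and performScraping are placeholder stubs; swap in the
// real scraping implementation from your application.
type ScrapedData struct {
	URL   string `json:"url"`
	Title string `json:"title"`
}

func performScraping(urls []string, options map[string]interface{}) []ScrapedData {
	results := make([]ScrapedData, 0, len(urls))
	for _, u := range urls {
		// A real implementation would fetch and parse each page here.
		results = append(results, ScrapedData{URL: u})
	}
	return results
}
```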
Google Cloud Platform
Cloud Run deployment:
```
apiVersion: serving.knative.dev/v1
kind: Service
metadata:
  name: go-scraper
  annotations:
    run.googleapis.com/ingress: all
spec:
  template:
    metadata:
      annotations:
        autoscaling.knative.dev/maxScale: "10"
        run.googleapis.com/cpu-throttling: "false"
    spec:
      containerConcurrency: 80
      containers:
        - image: gcr.io/project-id/go-scraper:latest
          ports:
            - containerPort: 8080
          env:
            - name: PORT
              value: "8080"
          resources:
            limits:
              cpu: "2"
              memory: "2Gi"
```
Deploy with:
```
# Build and push to Container Registry
gcloud builds submit --tag gcr.io/PROJECT-ID/go-scraper

# Deploy to Cloud Run
gcloud run deploy go-scraper \
  --image gcr.io/PROJECT-ID/go-scraper \
  --platform managed \
  --region us-central1 \
  --allow-unauthenticated
```
Production Configuration Best Practices
Environment Configuration
Create a robust configuration system:
```
package config

import (
	"os"
	"strconv"
	"time"
)

type Config struct {
	Port           string
	DatabaseURL    string
	RedisURL       string
	UserAgent      string
	RequestTimeout time.Duration
	MaxWorkers     int
	LogLevel       string
	Environment    string
}

func Load() *Config {
	return &Config{
		Port:           getEnv("PORT", "8080"),
		DatabaseURL:    getEnv("DATABASE_URL", ""),
		RedisURL:       getEnv("REDIS_URL", ""),
		UserAgent:      getEnv("USER_AGENT", "GoScraper/1.0"),
		RequestTimeout: getDurationEnv("REQUEST_TIMEOUT", 30*time.Second),
		MaxWorkers:     getIntEnv("MAX_WORKERS", 10),
		LogLevel:       getEnv("LOG_LEVEL", "info"),
		Environment:    getEnv("ENV", "development"),
	}
}

func getEnv(key, defaultValue string) string {
	if value := os.Getenv(key); value != "" {
		return value
	}
	return defaultValue
}

func getIntEnv(key string, defaultValue int) int {
	if value := os.Getenv(key); value != "" {
		if intValue, err := strconv.Atoi(value); err == nil {
			return intValue
		}
	}
	return defaultValue
}

func getDurationEnv(key string, defaultValue time.Duration) time.Duration {
	if value := os.Getenv(key); value != "" {
		if duration, err := time.ParseDuration(value); err == nil {
			return duration
		}
	}
	return defaultValue
}
```
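A typical way to consume this package is to load the configuration once at startup and pass it down explicitly. A minimal sketch, assuming a hypothetical yourmodule import path:

```
package main

import (
	"log"
	"net/http"

	"yourmodule/config" // hypothetical import path
)

func main() {
	cfg := config.Load()

	srv := &http.Server{
		Addr:         ":" + cfg.Port,
		ReadTimeout:  cfg.RequestTimeout,
		WriteTimeout: cfg.RequestTimeout,
	}

	log.Printf("starting scraper on port %s (env=%s)", cfg.Port, cfg.Environment)
	log.Fatal(srv.ListenAndServe())
}
```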
Health Checks and Monitoring
Implement comprehensive health checks:
```
package main

import (
	"encoding/json"
	"net/http"
	"time"
)

type HealthResponse struct {
	Status    string            `json:"status"`
	Timestamp time.Time         `json:"timestamp"`
	Services  map[string]string `json:"services"`
	Version   string            `json:"version"`
}

func healthHandler(w http.ResponseWriter, r *http.Request) {
	health := HealthResponse{
		Status:    "healthy",
		Timestamp: time.Now(),
		Services:  make(map[string]string),
		Version:   "1.0.0",
	}

	// Check database connectivity (checkDatabase, checkRedis, and
	// isApplicationReady are application-specific probes defined elsewhere)
	if err := checkDatabase(); err != nil {
		health.Services["database"] = "unhealthy"
		health.Status = "unhealthy"
	} else {
		health.Services["database"] = "healthy"
	}

	// Check Redis connectivity
	if err := checkRedis(); err != nil {
		health.Services["redis"] = "unhealthy"
		health.Status = "unhealthy"
	} else {
		health.Services["redis"] = "healthy"
	}

	w.Header().Set("Content-Type", "application/json")
	if health.Status == "unhealthy" {
		w.WriteHeader(http.StatusServiceUnavailable)
	}
	json.NewEncoder(w).Encode(health)
}

func readinessHandler(w http.ResponseWriter, r *http.Request) {
	// Check if the application is ready to serve traffic
	if !isApplicationReady() {
		w.WriteHeader(http.StatusServiceUnavailable)
		return
	}
	w.WriteHeader(http.StatusOK)
	w.Write([]byte("ready"))
}
```
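Wiring the probes into the HTTP server is then a matter of registering routes. A minimal sketch, assuming it lives in the same package as the handlers above; the /healthz and /readyz paths are conventions, not requirements:

```
package main

import (
	"log"
	"net/http"
)

func main() {
	mux := http.NewServeMux()
	mux.HandleFunc("/healthz", healthHandler)
	mux.HandleFunc("/readyz", readinessHandler)

	// Kubernetes liveness and readiness probes would point at these paths.
	log.Fatal(http.ListenAndServe(":8080", mux))
}
```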
CI/CD Pipeline Setup
GitHub Actions Workflow
```
name: Deploy Go Scraper

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Go
        uses: actions/setup-go@v3
        with:
          go-version: "1.21"
      - name: Run tests
        run: |
          go mod download
          go test -v ./...
      - name: Run security scan
        uses: securecodewarrior/github-action-gosec@master

  build-and-deploy:
    needs: test
    runs-on: ubuntu-latest
    if: github.ref == 'refs/heads/main'
    steps:
      - uses: actions/checkout@v3
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
      - name: Login to Container Registry
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Build and push Docker image
        uses: docker/build-push-action@v3
        with:
          context: .
          push: true
          tags: ghcr.io/${{ github.repository }}:latest
          cache-from: type=gha
          cache-to: type=gha,mode=max
      - name: Deploy to production
        run: |
          # Add your deployment commands here
          kubectl set image deployment/go-scraper scraper=ghcr.io/${{ github.repository }}:latest
```
Security Considerations
Rate Limiting and Anti-Detection
When developing web scraping applications, it's important to follow responsible scraping practices. For JavaScript-heavy sites that require browser automation, see how to use Puppeteer with Docker for containerized browser automation.
```
package ratelimit

import (
	"sync"
	"time"
)

type RateLimiter struct {
	rate     time.Duration
	lastCall time.Time
	mutex    sync.Mutex
}

func NewRateLimiter(requestsPerSecond float64) *RateLimiter {
	return &RateLimiter{
		rate: time.Duration(float64(time.Second) / requestsPerSecond),
	}
}

// Wait blocks until the minimum interval since the last call has elapsed.
// Sleeping while holding the mutex is intentional: it serializes callers,
// so the rate is enforced across all goroutines sharing the limiter.
func (rl *RateLimiter) Wait() {
	rl.mutex.Lock()
	defer rl.mutex.Unlock()

	elapsed := time.Since(rl.lastCall)
	if elapsed < rl.rate {
		time.Sleep(rl.rate - elapsed)
	}
	rl.lastCall = time.Now()
}
```
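Usage is a single blocking call before each request. A sketch, assuming the package lives at a hypothetical yourmodule/ratelimit path; for full token-bucket semantics, golang.org/x/time/rate is a common alternative:

```
package main

import (
	"net/http"

	"yourmodule/ratelimit" // hypothetical import path
)

func main() {
	limiter := ratelimit.NewRateLimiter(2) // at most ~2 requests per second
	urls := []string{"https://example.com", "https://example.org"}

	for _, url := range urls {
		limiter.Wait() // blocks until the next slot opens
		resp, err := http.Get(url)
		if err != nil {
			continue
		}
		resp.Body.Close()
	}
}
```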
Resource Management
```
func (s *Scraper) scrapeWithRetry(url string, maxRetries int) (*http.Response, error) {
	var resp *http.Response
	var err error

	for attempt := 0; attempt <= maxRetries; attempt++ {
		// Rate limiting
		s.rateLimiter.Wait()

		// Create a request with a per-attempt timeout. The context is
		// deliberately not cancelled on success so the caller can still read
		// the body; it expires on its own when the 30s deadline passes.
		ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)

		var req *http.Request
		req, err = http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
		if err != nil {
			cancel()
			return nil, err
		}

		// Set headers to appear more human-like
		req.Header.Set("User-Agent", s.getUserAgent())
		req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")

		resp, err = s.client.Do(req)
		if err == nil && resp.StatusCode == http.StatusOK {
			return resp, nil
		}
		if resp != nil {
			resp.Body.Close()
		}
		cancel() // release the context for the failed attempt

		// Exponential backoff: 1s, 2s, 4s, ...
		if attempt < maxRetries {
			time.Sleep(time.Duration(1<<attempt) * time.Second)
		}
	}

	return nil, err
}
```
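On success the caller owns the response and must close its body. A sketch of a call site, where process stands in for a hypothetical downstream handler:

```
resp, err := s.scrapeWithRetry("https://example.com", 3)
if err != nil {
	return err
}
defer resp.Body.Close()

data, err := io.ReadAll(resp.Body) // requires the "io" import
if err != nil {
	return err
}
process(data) // hypothetical downstream handler
```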
Monitoring and Observability
Structured Logging
```
package logger

import (
	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"
)

func NewLogger(level string) (*zap.Logger, error) {
	config := zap.NewProductionConfig()
	config.EncoderConfig.TimeKey = "timestamp"
	config.EncoderConfig.EncodeTime = zapcore.ISO8601TimeEncoder

	switch level {
	case "debug":
		config.Level.SetLevel(zap.DebugLevel)
	case "info":
		config.Level.SetLevel(zap.InfoLevel)
	case "warn":
		config.Level.SetLevel(zap.WarnLevel)
	case "error":
		config.Level.SetLevel(zap.ErrorLevel)
	}

	return config.Build()
}

// Usage in scraping code
func (s *Scraper) scrapePage(url string) error {
	start := time.Now()

	s.logger.Info("Starting page scrape",
		zap.String("url", url),
		zap.Time("started_at", start),
	)

	defer func() {
		s.logger.Info("Completed page scrape",
			zap.String("url", url),
			zap.Duration("duration", time.Since(start)),
		)
	}()

	// Scraping logic here
	return nil
}
```
Metrics Collection
```
package metrics

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

var (
	requestsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "scraper_requests_total",
			Help: "Total number of scraping requests",
		},
		[]string{"status", "domain"},
	)

	requestDuration = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "scraper_request_duration_seconds",
			Help:    "Duration of scraping requests",
			Buckets: prometheus.DefBuckets,
		},
		[]string{"domain"},
	)
)

func init() {
	prometheus.MustRegister(requestsTotal)
	prometheus.MustRegister(requestDuration)
}

// Handler exposes the registered metrics for Prometheus to scrape.
func Handler() http.Handler {
	return promhttp.Handler()
}

func RecordRequest(domain, status string, duration float64) {
	requestsTotal.WithLabelValues(status, domain).Inc()
	requestDuration.WithLabelValues(domain).Observe(duration)
}
```
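Recording a request then looks like the sketch below; client, targetURL, and the yourmodule import path are assumptions, and the /metrics endpoint is served via the Handler helper defined above:

```
package main

import (
	"net/http"
	"strconv"
	"time"

	"yourmodule/metrics" // hypothetical import path
)

func fetchAndRecord(client *http.Client, targetURL, domain string) {
	start := time.Now()
	resp, err := client.Get(targetURL)

	status := "error"
	if err == nil {
		status = strconv.Itoa(resp.StatusCode)
		resp.Body.Close()
	}
	metrics.RecordRequest(domain, status, time.Since(start).Seconds())
}

func main() {
	// Expose the metrics endpoint for Prometheus to scrape.
	http.Handle("/metrics", metrics.Handler())
	fetchAndRecord(http.DefaultClient, "https://example.com", "example.com")
	http.ListenAndServe(":9090", nil)
}
```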
For complex single-page applications that require advanced interaction patterns, see how to handle AJAX requests using Puppeteer for dynamic content.
Scaling Strategies
Horizontal Scaling with Worker Pools
```
package worker

import (
	"sync"
)

// ScrapingConfig and processJob are application-specific and defined elsewhere.
type Job struct {
	URL    string
	Config ScrapingConfig
}

type Worker struct {
	ID   int
	jobs chan Job
	quit chan bool
	wg   *sync.WaitGroup
}

type Pool struct {
	workers []*Worker
	jobs    chan Job
	wg      sync.WaitGroup
}

func NewPool(numWorkers int) *Pool {
	pool := &Pool{
		workers: make([]*Worker, numWorkers),
		jobs:    make(chan Job, numWorkers*2),
	}

	for i := 0; i < numWorkers; i++ {
		worker := &Worker{
			ID:   i,
			jobs: pool.jobs,
			quit: make(chan bool),
			wg:   &pool.wg,
		}
		pool.workers[i] = worker
		go worker.start()
	}

	return pool
}

func (w *Worker) start() {
	for {
		select {
		case job := <-w.jobs:
			w.processJob(job)
		case <-w.quit:
			return
		}
	}
}
```
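As written, the pool has no exported way to enqueue work or shut down. A minimal sketch of Submit and Stop methods, assuming the types defined above:

```
// Submit enqueues a job; it blocks once the buffered channel is full,
// which applies natural backpressure to producers.
func (p *Pool) Submit(job Job) {
	p.jobs <- job
}

// Stop signals every worker to exit after finishing its current job.
func (p *Pool) Stop() {
	for _, w := range p.workers {
		close(w.quit)
	}
}
```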
Conclusion
Deploying Go web scraping applications successfully requires attention to containerization, cloud platform integration, monitoring, and scaling strategies. The combination of Go's performance characteristics and modern deployment practices creates robust, maintainable scraping solutions.
Key deployment considerations include:
- Use Docker for consistent, portable deployments
- Implement comprehensive health checks and monitoring
- Set up proper CI/CD pipelines for automated testing and deployment
- Apply security best practices including rate limiting and resource management
- Design for horizontal scaling with worker pools and queue systems
By following these best practices, you'll create production-ready Go web scraping applications that can handle scale while maintaining reliability and performance.