What are Go context best practices for web scraping?

Go's context package is essential for building robust and efficient web scraping applications. It provides mechanisms for timeout handling, cancellation, request lifecycle management, and graceful shutdowns. Understanding context best practices is crucial for production-ready web scrapers that can handle failures, manage resources efficiently, and provide proper control flow.

Understanding Go Context in Web Scraping

Context in Go serves as a control mechanism that carries deadlines, cancellation signals, and request-scoped values across API boundaries. In web scraping, context helps manage the lifecycle of HTTP requests, coordinate concurrent operations, and implement proper timeout handling.

package main

import (
    "context"
    "fmt"
    "net/http"
    "time"
)

// Basic context usage for HTTP requests
func scrapeWithContext(ctx context.Context, url string) error {
    req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
    if err != nil {
        return err
    }

    client := &http.Client{}
    resp, err := client.Do(req)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    // Process response...
    return nil
}
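
A caller derives whatever context fits the situation and passes it in. As a quick sketch of such a caller (the 10-second timeout and URL are arbitrary example values):

func main() {
    ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
    defer cancel()

    if err := scrapeWithContext(ctx, "https://example.com"); err != nil {
        fmt.Println("scrape failed:", err)
    }
}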

Essential Context Patterns for Web Scraping

1. Timeout Management

Implement proper timeouts to prevent hanging requests and ensure predictable behavior:

func scrapeWithTimeout(url string, timeout time.Duration) error {
    // Create context with timeout
    ctx, cancel := context.WithTimeout(context.Background(), timeout)
    defer cancel()

    req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
    if err != nil {
        return err
    }

    client := &http.Client{
        Timeout: timeout, // Additional safety net
    }

    resp, err := client.Do(req)
    if err != nil {
        // Handle timeout errors specifically
        if ctx.Err() == context.DeadlineExceeded {
            return fmt.Errorf("request timed out after %v: %w", timeout, err)
        }
        return err
    }
    defer resp.Body.Close()

    return nil
}
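
Because the timeout error is wrapped with %w, a caller can detect it with errors.Is. A brief sketch, assuming the errors package is imported and using example values:

// Illustrative caller: distinguish a timeout from other failures.
if err := scrapeWithTimeout("https://example.com", 5*time.Second); err != nil {
    if errors.Is(err, context.DeadlineExceeded) {
        fmt.Println("request timed out:", err)
    } else {
        fmt.Println("scrape failed:", err)
    }
}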

2. Cancellation Handling

Implement proper cancellation for graceful shutdowns and user-initiated stops:

type Scraper struct {
    client *http.Client
    cancel context.CancelFunc
}

func NewScraper() *Scraper {
    return &Scraper{
        client: &http.Client{Timeout: 30 * time.Second},
    }
}

func (s *Scraper) ScrapeURLs(urls []string) error {
    ctx, cancel := context.WithCancel(context.Background())
    s.cancel = cancel
    defer cancel() // release the context once scraping finishes; safe even if Stop was already called

    for _, url := range urls {
        select {
        case <-ctx.Done():
            return ctx.Err()
        default:
            if err := s.scrapeURL(ctx, url); err != nil {
                return err
            }
        }
    }

    return nil
}

func (s *Scraper) Stop() {
    if s.cancel != nil {
        s.cancel()
    }
}

func (s *Scraper) scrapeURL(ctx context.Context, url string) error {
    req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
    if err != nil {
        return err
    }

    resp, err := s.client.Do(req)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    // Process response with context checking
    return s.processResponse(ctx, resp)
}

func (s *Scraper) processResponse(ctx context.Context, resp *http.Response) error {
    // Check context before expensive operations
    select {
    case <-ctx.Done():
        return ctx.Err()
    default:
        // Process response...
        return nil
    }
}
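
Putting the pieces together, usage might look like the following sketch (runExample and the 10-second timer are illustrative; in production you would typically set the cancel function before launching work and guard it against concurrent access):

func runExample(urls []string) {
    s := NewScraper()

    // Stop the scraper from another goroutine after 10 seconds (example trigger).
    go func() {
        time.Sleep(10 * time.Second)
        s.Stop()
    }()

    if err := s.ScrapeURLs(urls); err != nil {
        fmt.Println("scraping stopped:", err)
    }
}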

3. Concurrent Scraping with Context

Manage concurrent operations using context and worker pools:

func concurrentScrape(urls []string, maxWorkers int) error {
    ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
    defer cancel()

    // Create buffered channel for URLs
    urlChan := make(chan string, len(urls))
    resultChan := make(chan error, len(urls))

    // Start workers
    for i := 0; i < maxWorkers; i++ {
        go worker(ctx, urlChan, resultChan)
    }

    // Send URLs to workers
    for _, url := range urls {
        select {
        case urlChan <- url:
        case <-ctx.Done():
            return ctx.Err()
        }
    }
    close(urlChan)

    // Collect results
    var errors []error
    for i := 0; i < len(urls); i++ {
        select {
        case err := <-resultChan:
            if err != nil {
                errors = append(errors, err)
            }
        case <-ctx.Done():
            return ctx.Err()
        }
    }

    if len(errors) > 0 {
        return fmt.Errorf("encountered %d errors during scraping", len(errors))
    }

    return nil
}

func worker(ctx context.Context, urls <-chan string, results chan<- error) {
    client := &http.Client{Timeout: 30 * time.Second}

    for {
        select {
        case url, ok := <-urls:
            if !ok {
                return
            }

            err := scrapeURL(ctx, client, url)
            select {
            case results <- err:
            case <-ctx.Done():
                return
            }

        case <-ctx.Done():
            return
        }
    }
}

func scrapeURL(ctx context.Context, client *http.Client, url string) error {
    req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
    if err != nil {
        return err
    }

    resp, err := client.Do(req)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    // Process response...
    return nil
}
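
As an alternative, the golang.org/x/sync/errgroup package expresses the same worker-pool pattern more compactly and cancels the shared context as soon as any worker fails. This is a minimal sketch under that assumption, not a drop-in replacement for the code above:

import (
    "golang.org/x/sync/errgroup"
)

func concurrentScrapeWithErrgroup(urls []string, maxWorkers int) error {
    ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
    defer cancel()

    g, ctx := errgroup.WithContext(ctx)
    g.SetLimit(maxWorkers) // cap the number of concurrent goroutines

    client := &http.Client{Timeout: 30 * time.Second}
    for _, url := range urls {
        url := url // capture loop variable (required before Go 1.22)
        g.Go(func() error {
            return scrapeURL(ctx, client, url)
        })
    }

    // Wait returns the first non-nil error; the derived ctx is cancelled when that happens.
    return g.Wait()
}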

Advanced Context Patterns

4. Context Values for Request Metadata

Pass request-specific data through context (use sparingly):

type contextKey string

const (
    userAgentKey contextKey = "user-agent"
    retryCountKey contextKey = "retry-count"
)

func scrapeWithMetadata(ctx context.Context, url string) error {
    // Add metadata to context
    ctx = context.WithValue(ctx, userAgentKey, "MyBot/1.0")
    ctx = context.WithValue(ctx, retryCountKey, 0)

    return performScrape(ctx, url)
}

func performScrape(ctx context.Context, url string) error {
    req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
    if err != nil {
        return err
    }

    // Use context values
    if ua, ok := ctx.Value(userAgentKey).(string); ok {
        req.Header.Set("User-Agent", ua)
    }

    client := &http.Client{}
    resp, err := client.Do(req)
    if err != nil {
        return handleError(ctx, err, url)
    }
    defer resp.Body.Close()

    return nil
}

func handleError(ctx context.Context, err error, url string) error {
    retryCount, _ := ctx.Value(retryCountKey).(int)

    if retryCount < 3 {
        // Back off without ignoring cancellation
        select {
        case <-time.After(time.Duration(retryCount+1) * time.Second):
        case <-ctx.Done():
            return ctx.Err()
        }

        newCtx := context.WithValue(ctx, retryCountKey, retryCount+1)
        return performScrape(newCtx, url)
    }

    return fmt.Errorf("failed after %d retries: %w", retryCount, err)
}
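
If you do rely on context values, a common convention is to hide the keys behind small typed accessors so callers never construct keys themselves. A sketch of such hypothetical helpers for the keys defined above:

// Hypothetical accessors wrapping the unexported keys above.
func withUserAgent(ctx context.Context, ua string) context.Context {
    return context.WithValue(ctx, userAgentKey, ua)
}

func userAgentFrom(ctx context.Context) (string, bool) {
    ua, ok := ctx.Value(userAgentKey).(string)
    return ua, ok
}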

5. Context with Rate Limiting

Combine context with rate limiting for responsible scraping:

import (
    "golang.org/x/time/rate"
)

type RateLimitedScraper struct {
    limiter *rate.Limiter
    client  *http.Client
}

func NewRateLimitedScraper(requestsPerSecond int) *RateLimitedScraper {
    return &RateLimitedScraper{
        limiter: rate.NewLimiter(rate.Limit(requestsPerSecond), 1),
        client:  &http.Client{Timeout: 30 * time.Second},
    }
}

func (rs *RateLimitedScraper) Scrape(ctx context.Context, url string) error {
    // Wait for rate limiter with context
    if err := rs.limiter.Wait(ctx); err != nil {
        return fmt.Errorf("rate limiter cancelled: %w", err)
    }

    req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
    if err != nil {
        return err
    }

    resp, err := rs.client.Do(req)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    return nil
}
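
Illustrative usage (the rate of 2 requests per second and the helper name scrapeAll are example choices): each call blocks on the limiter until a token is available or the context is cancelled.

func scrapeAll(ctx context.Context, urls []string) error {
    rs := NewRateLimitedScraper(2) // roughly two requests per second

    for _, url := range urls {
        if err := rs.Scrape(ctx, url); err != nil {
            return err
        }
    }
    return nil
}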

Production-Ready Context Management

6. Comprehensive Error Handling

func robustScrape(ctx context.Context, url string) error {
    const maxRetries = 3

    for attempt := 0; attempt <= maxRetries; attempt++ {
        select {
        case <-ctx.Done():
            return ctx.Err()
        default:
        }

        err := attemptScrape(ctx, url)
        if err == nil {
            return nil
        }

        // Check if error is retryable
        if !isRetryableError(err) {
            return err
        }

        if attempt < maxRetries {
            backoff := time.Duration(attempt+1) * time.Second
            timer := time.NewTimer(backoff)

            select {
            case <-timer.C:
                // Continue to next attempt
            case <-ctx.Done():
                timer.Stop()
                return ctx.Err()
            }
        }
    }

    return fmt.Errorf("failed after %d attempts", maxRetries)
}

func isRetryableError(err error) bool {
    // Check for timeout, temporary network errors, etc.
    return true // Simplified for example
}

func attemptScrape(ctx context.Context, url string) error {
    req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
    if err != nil {
        return err
    }

    client := &http.Client{Timeout: 30 * time.Second}
    resp, err := client.Do(req)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    return nil
}
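
The isRetryableError stub above always returns true. A more realistic version might treat cancellation of the overall context as fatal and network timeouts or connection resets as retryable; this is a sketch assuming the errors, net, and syscall packages are imported:

func isRetryableError(err error) bool {
    // Do not retry when our own context was cancelled or its deadline passed.
    if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
        return false
    }

    // Retry network-level timeouts.
    var netErr net.Error
    if errors.As(err, &netErr) && netErr.Timeout() {
        return true
    }

    // Retry connection resets and refusals.
    if errors.Is(err, syscall.ECONNRESET) || errors.Is(err, syscall.ECONNREFUSED) {
        return true
    }

    return false
}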

7. Graceful Shutdown with Context

import (
    "errors"
    "os"
    "os/signal"
    "syscall"
)

func main() {
    // Create root context
    ctx, cancel := context.WithCancel(context.Background())
    defer cancel()

    // Setup graceful shutdown
    go func() {
        sigChan := make(chan os.Signal, 1)
        signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
        <-sigChan

        fmt.Println("Shutting down gracefully...")
        cancel()
    }()

    // Start scraping
    urls := []string{
        "https://example.com/page1",
        "https://example.com/page2",
        "https://example.com/page3",
    }

    if err := scrapeURLsWithShutdown(ctx, urls); err != nil {
        if errors.Is(err, context.Canceled) {
            fmt.Println("Scraping cancelled")
        } else {
            fmt.Printf("Scraping failed: %v\n", err)
        }
    }
}

func scrapeURLsWithShutdown(ctx context.Context, urls []string) error {
    for _, url := range urls {
        select {
        case <-ctx.Done():
            return ctx.Err()
        default:
            if err := robustScrape(ctx, url); err != nil {
                return err
            }
        }
    }
    return nil
}
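
Since Go 1.16, signal.NotifyContext can replace the hand-rolled signal goroutine above. A minimal sketch of the same shutdown wiring (mainWithNotifyContext and the URL list are illustrative):

func mainWithNotifyContext() {
    // The returned context is cancelled on SIGINT or SIGTERM; stop releases the signal handler.
    ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
    defer stop()

    urls := []string{"https://example.com/page1", "https://example.com/page2"}
    if err := scrapeURLsWithShutdown(ctx, urls); err != nil {
        fmt.Printf("scraping stopped: %v\n", err)
    }
}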

Context Best Practices Summary

  1. Always use context for HTTP requests: Pass context to http.NewRequestWithContext() for proper cancellation and timeout handling.

  2. Set appropriate timeouts: Use context.WithTimeout() for operations that might hang, but choose realistic timeout values.

  3. Check context regularly: In long-running operations, periodically check ctx.Done() to respect cancellation.

  4. Propagate context through call chains: Pass context as the first parameter to functions that perform I/O operations.

  5. Don't store context in structs: Pass context as function parameters instead of storing in struct fields.

  6. Use context values sparingly: Only use context.WithValue() for request-scoped data that crosses API boundaries.

  7. Handle context errors specifically: Distinguish between context.DeadlineExceeded and context.Canceled for appropriate error handling, as sketched after this list.

  8. Combine with other patterns: Use context with rate limiting, retries, and circuit breakers for robust scraping applications.
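
For point 7, a small helper keeps that distinction in one place. A sketch (classifyContextError is a hypothetical name, and the errors package is assumed to be imported):

func classifyContextError(err error) string {
    switch {
    case errors.Is(err, context.DeadlineExceeded):
        return "timeout" // the deadline passed; retrying with a larger budget may help
    case errors.Is(err, context.Canceled):
        return "cancelled" // a deliberate stop; usually not worth retrying
    default:
        return "other"
    }
}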

Understanding these Go context patterns will help you build web scrapers that are responsive, resource-efficient, and production-ready. Similar timeout and cancellation concepts apply when handling timeouts in Puppeteer for browser-based scraping scenarios.

Try WebScraping.AI for Your Web Scraping Needs

Looking for a powerful web scraping solution? WebScraping.AI provides an LLM-powered API that combines Chromium JavaScript rendering with rotating proxies for reliable data extraction.

Key Features:

  • AI-powered extraction: Ask questions about web pages or extract structured data fields
  • JavaScript rendering: Full Chromium browser support for dynamic content
  • Rotating proxies: Datacenter and residential proxies from multiple countries
  • Easy integration: Simple REST API with SDKs for Python, Ruby, PHP, and more
  • Reliable & scalable: Built for developers who need consistent results

Getting Started:

Get page content with AI analysis:

curl "https://api.webscraping.ai/ai/question?url=https://example.com&question=What is the main topic?&api_key=YOUR_API_KEY"

Extract structured data:

curl "https://api.webscraping.ai/ai/fields?url=https://example.com&fields[title]=Page title&fields[price]=Product price&api_key=YOUR_API_KEY"
