What are Go context best practices for web scraping?
Go's context package is essential for building robust and efficient web scraping applications. It provides mechanisms for timeout handling, cancellation, request lifecycle management, and graceful shutdowns. Understanding context best practices is crucial for production-ready web scrapers that can handle failures, manage resources efficiently, and keep proper control over long-running work.
Understanding Go Context in Web Scraping
Context in Go serves as a control mechanism that carries deadlines, cancellation signals, and request-scoped values across API boundaries. In web scraping, context helps manage the lifecycle of HTTP requests, coordinate concurrent operations, and implement proper timeout handling.
package main

import (
    "context"
    "fmt"
    "net/http"
    "time"
)

// Basic context usage for HTTP requests.
// Note: "fmt" and "time" are not used in this first snippet; they are
// imported here because the later examples in this guide rely on them.
func scrapeWithContext(ctx context.Context, url string) error {
    req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
    if err != nil {
        return err
    }

    client := &http.Client{}
    resp, err := client.Do(req)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    // Process response...
    return nil
}
Essential Context Patterns for Web Scraping
1. Timeout Management
Implement proper timeouts to prevent hanging requests and ensure predictable behavior:
func scrapeWithTimeout(url string, timeout time.Duration) error {
    // Create context with timeout
    ctx, cancel := context.WithTimeout(context.Background(), timeout)
    defer cancel()

    req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
    if err != nil {
        return err
    }

    client := &http.Client{
        Timeout: timeout, // Additional safety net
    }

    resp, err := client.Do(req)
    if err != nil {
        // Handle timeout errors specifically
        if ctx.Err() == context.DeadlineExceeded {
            return fmt.Errorf("request timed out after %v: %w", timeout, err)
        }
        return err
    }
    defer resp.Body.Close()

    return nil
}
2. Cancellation Handling
Implement proper cancellation for graceful shutdowns and user-initiated stops:
type Scraper struct {
    client *http.Client
    cancel context.CancelFunc
}

func NewScraper() *Scraper {
    return &Scraper{
        client: &http.Client{Timeout: 30 * time.Second},
    }
}

func (s *Scraper) ScrapeURLs(urls []string) error {
    ctx, cancel := context.WithCancel(context.Background())
    s.cancel = cancel

    for _, url := range urls {
        select {
        case <-ctx.Done():
            return ctx.Err()
        default:
            if err := s.scrapeURL(ctx, url); err != nil {
                return err
            }
        }
    }
    return nil
}

func (s *Scraper) Stop() {
    if s.cancel != nil {
        s.cancel()
    }
}

func (s *Scraper) scrapeURL(ctx context.Context, url string) error {
    req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
    if err != nil {
        return err
    }

    resp, err := s.client.Do(req)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    // Process response with context checking
    return s.processResponse(ctx, resp)
}

func (s *Scraper) processResponse(ctx context.Context, resp *http.Response) error {
    // Check context before expensive operations
    select {
    case <-ctx.Done():
        return ctx.Err()
    default:
        // Process response...
        return nil
    }
}
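To see the cancellation flow end to end, here is a usage sketch (the function name and the 10-second trigger are illustrative assumptions, standing in for an OS signal or a UI action):

func runCancellableScrape(urls []string) {
    s := NewScraper()

    // Stop the scraper from another goroutine after a delay.
    // Note: s.cancel is only set once ScrapeURLs has started; if Stop can
    // race with startup in your program, guard the field with a mutex.
    go func() {
        time.Sleep(10 * time.Second)
        s.Stop()
    }()

    if err := s.ScrapeURLs(urls); err != nil {
        fmt.Printf("scraping stopped: %v\n", err)
    }
}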
3. Concurrent Scraping with Context
Manage concurrent operations using context and worker pools:
func concurrentScrape(urls []string, maxWorkers int) error {
    ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
    defer cancel()

    // Create buffered channels for URLs and results
    urlChan := make(chan string, len(urls))
    resultChan := make(chan error, len(urls))

    // Start workers
    for i := 0; i < maxWorkers; i++ {
        go worker(ctx, urlChan, resultChan)
    }

    // Send URLs to workers
    for _, url := range urls {
        select {
        case urlChan <- url:
        case <-ctx.Done():
            return ctx.Err()
        }
    }
    close(urlChan)

    // Collect results
    var errors []error
    for i := 0; i < len(urls); i++ {
        select {
        case err := <-resultChan:
            if err != nil {
                errors = append(errors, err)
            }
        case <-ctx.Done():
            return ctx.Err()
        }
    }

    if len(errors) > 0 {
        return fmt.Errorf("encountered %d errors during scraping", len(errors))
    }
    return nil
}

func worker(ctx context.Context, urls <-chan string, results chan<- error) {
    client := &http.Client{Timeout: 30 * time.Second}

    for {
        select {
        case url, ok := <-urls:
            if !ok {
                return
            }
            err := scrapeURL(ctx, client, url)
            select {
            case results <- err:
            case <-ctx.Done():
                return
            }
        case <-ctx.Done():
            return
        }
    }
}

func scrapeURL(ctx context.Context, client *http.Client, url string) error {
    req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
    if err != nil {
        return err
    }

    resp, err := client.Do(req)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    // Process response...
    return nil
}
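If you would rather not manage channels and workers by hand, the same fan-out can be expressed with the golang.org/x/sync/errgroup package, which cancels the shared context on the first error and caps concurrency with SetLimit. A minimal sketch reusing the scrapeURL helper above (the function name is illustrative):

import (
    "golang.org/x/sync/errgroup"
)

func concurrentScrapeWithErrgroup(parent context.Context, urls []string, maxWorkers int) error {
    // The derived ctx is cancelled as soon as any goroutine returns an error.
    g, ctx := errgroup.WithContext(parent)
    g.SetLimit(maxWorkers)

    client := &http.Client{Timeout: 30 * time.Second}
    for _, u := range urls {
        u := u // capture the loop variable (not needed from Go 1.22 on)
        g.Go(func() error {
            return scrapeURL(ctx, client, u)
        })
    }

    // Wait blocks until all goroutines finish and returns the first error.
    return g.Wait()
}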
Advanced Context Patterns
4. Context Values for Request Metadata
Pass request-specific data through context (use sparingly):
type contextKey string

const (
    userAgentKey  contextKey = "user-agent"
    retryCountKey contextKey = "retry-count"
)

func scrapeWithMetadata(ctx context.Context, url string) error {
    // Add metadata to context
    ctx = context.WithValue(ctx, userAgentKey, "MyBot/1.0")
    ctx = context.WithValue(ctx, retryCountKey, 0)

    return performScrape(ctx, url)
}

func performScrape(ctx context.Context, url string) error {
    req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
    if err != nil {
        return err
    }

    // Use context values
    if ua, ok := ctx.Value(userAgentKey).(string); ok {
        req.Header.Set("User-Agent", ua)
    }

    client := &http.Client{}
    resp, err := client.Do(req)
    if err != nil {
        return handleError(ctx, err, url)
    }
    defer resp.Body.Close()

    return nil
}

func handleError(ctx context.Context, err error, url string) error {
    retryCount, _ := ctx.Value(retryCountKey).(int)
    if retryCount < 3 {
        // Note: time.Sleep ignores cancellation; section 6 shows a
        // context-aware backoff using a timer and ctx.Done().
        time.Sleep(time.Duration(retryCount+1) * time.Second)
        newCtx := context.WithValue(ctx, retryCountKey, retryCount+1)
        return performScrape(newCtx, url)
    }
    return fmt.Errorf("failed after %d retries: %w", retryCount, err)
}
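One way to keep context values contained is to hide the type assertions behind small typed accessor functions, so callers never touch the keys directly. A minimal sketch (the fallback user agent is an assumption):

// userAgentFrom returns the user agent stored in ctx, or a default.
func userAgentFrom(ctx context.Context) string {
    if ua, ok := ctx.Value(userAgentKey).(string); ok {
        return ua
    }
    return "MyBot/1.0" // fallback when nothing was set
}

// retryCountFrom returns the retry count stored in ctx, or zero.
func retryCountFrom(ctx context.Context) int {
    count, _ := ctx.Value(retryCountKey).(int)
    return count
}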
5. Context with Rate Limiting
Combine context with rate limiting for responsible scraping:
import (
    "golang.org/x/time/rate"
)

type RateLimitedScraper struct {
    limiter *rate.Limiter
    client  *http.Client
}

func NewRateLimitedScraper(requestsPerSecond int) *RateLimitedScraper {
    return &RateLimitedScraper{
        limiter: rate.NewLimiter(rate.Limit(requestsPerSecond), 1),
        client:  &http.Client{Timeout: 30 * time.Second},
    }
}

func (rs *RateLimitedScraper) Scrape(ctx context.Context, url string) error {
    // Wait for rate limiter with context
    if err := rs.limiter.Wait(ctx); err != nil {
        return fmt.Errorf("rate limiter cancelled: %w", err)
    }

    req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
    if err != nil {
        return err
    }

    resp, err := rs.client.Do(req)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    return nil
}
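Driving the rate-limited scraper is straightforward; a usage sketch in which the URLs, the one-minute deadline, and the 2-requests-per-second budget are placeholder values:

func runRateLimited() {
    ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
    defer cancel()

    scraper := NewRateLimitedScraper(2) // at most 2 requests per second
    for _, u := range []string{"https://example.com/a", "https://example.com/b"} {
        if err := scraper.Scrape(ctx, u); err != nil {
            fmt.Printf("scrape %s failed: %v\n", u, err)
        }
    }
}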
Production-Ready Context Management
6. Comprehensive Error Handling
func robustScrape(ctx context.Context, url string) error {
    const maxRetries = 3

    var lastErr error
    for attempt := 0; attempt <= maxRetries; attempt++ {
        select {
        case <-ctx.Done():
            return ctx.Err()
        default:
        }

        err := attemptScrape(ctx, url)
        if err == nil {
            return nil
        }
        lastErr = err

        // Check if error is retryable
        if !isRetryableError(err) {
            return err
        }

        if attempt < maxRetries {
            backoff := time.Duration(attempt+1) * time.Second
            timer := time.NewTimer(backoff)
            select {
            case <-timer.C:
                // Continue to next attempt
            case <-ctx.Done():
                timer.Stop()
                return ctx.Err()
            }
        }
    }
    return fmt.Errorf("failed after %d attempts: %w", maxRetries+1, lastErr)
}

func isRetryableError(err error) bool {
    // Check for timeout, temporary network errors, etc.
    return true // Simplified for example; see the fuller sketch below
}

func attemptScrape(ctx context.Context, url string) error {
    req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
    if err != nil {
        return err
    }

    client := &http.Client{Timeout: 30 * time.Second}
    resp, err := client.Do(req)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    return nil
}
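The isRetryableError stub above always returns true. A fuller version might look like the sketch below (named differently to avoid clashing with the stub); what counts as retryable ultimately depends on the sites you scrape, but context errors should never be retried, while network-level timeouts usually can be:

import (
    "errors"
    "net"
)

func isRetryableNetworkError(err error) bool {
    // Never retry when the context itself was cancelled or timed out.
    if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
        return false
    }

    // Retry network-level timeouts (DNS, dial, TLS handshake, and so on).
    var netErr net.Error
    if errors.As(err, &netErr) && netErr.Timeout() {
        return true
    }

    return false
}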
7. Graceful Shutdown with Context
import (
    "errors"
    "os"
    "os/signal"
    "syscall"
)

func main() {
    // Create root context
    ctx, cancel := context.WithCancel(context.Background())
    defer cancel()

    // Setup graceful shutdown on SIGINT/SIGTERM
    go func() {
        sigChan := make(chan os.Signal, 1)
        signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
        <-sigChan
        fmt.Println("Shutting down gracefully...")
        cancel()
    }()

    // Start scraping
    urls := []string{
        "https://example.com/page1",
        "https://example.com/page2",
        "https://example.com/page3",
    }

    if err := scrapeURLsWithShutdown(ctx, urls); err != nil {
        // errors.Is also matches wrapped context errors
        if errors.Is(err, context.Canceled) {
            fmt.Println("Scraping cancelled")
        } else {
            fmt.Printf("Scraping failed: %v\n", err)
        }
    }
}

func scrapeURLsWithShutdown(ctx context.Context, urls []string) error {
    for _, url := range urls {
        select {
        case <-ctx.Done():
            return ctx.Err()
        default:
            if err := robustScrape(ctx, url); err != nil {
                return err
            }
        }
    }
    return nil
}
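Since Go 1.16, signal.NotifyContext wraps the manual signal goroutine above into a single call; this shorter variant is a sketch of the same shutdown flow:

func runWithSignalContext(urls []string) error {
    // ctx is cancelled automatically when SIGINT or SIGTERM arrives.
    ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
    defer stop()

    return scrapeURLsWithShutdown(ctx, urls)
}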
Context Best Practices Summary
- Always use context for HTTP requests: pass the context to http.NewRequestWithContext() for proper cancellation and timeout handling.
- Set appropriate timeouts: use context.WithTimeout() for operations that might hang, but choose realistic timeout values.
- Check context regularly: in long-running operations, periodically check ctx.Done() to respect cancellation.
- Propagate context through call chains: pass context as the first parameter to functions that perform I/O.
- Don't store context in structs: pass context as a function parameter instead of storing it in struct fields.
- Use context values sparingly: only use context.WithValue() for request-scoped data that crosses API boundaries.
- Handle context errors specifically: distinguish between context.DeadlineExceeded and context.Canceled for appropriate error handling (see the sketch after this list).
- Combine with other patterns: use context together with rate limiting, retries, and circuit breakers for robust scraping applications.
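For the error-handling point above, a small sketch of telling the two context errors apart with errors.Is, which also matches wrapped errors such as the *url.Error values returned by http.Client (assumes the errors and context packages are imported):

func classifyContextError(err error) string {
    switch {
    case errors.Is(err, context.DeadlineExceeded):
        return "timed out"
    case errors.Is(err, context.Canceled):
        return "cancelled"
    default:
        return "other error"
    }
}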
Understanding these Go context patterns will help you build web scrapers that are responsive, resource-efficient, and production-ready. Similar timeout and cancellation concepts apply when handling timeouts in Puppeteer for browser-based scraping scenarios.