Table of contents

How do I handle proxy servers in Go web scraping?

Proxy servers are essential tools in web scraping for anonymity, bypassing geographic restrictions, and avoiding IP-based rate limiting. Go provides excellent built-in support for HTTP proxies through its net/http package, making it straightforward to implement proxy functionality in your web scraping applications.

Basic Proxy Configuration

The most fundamental way to use a proxy in Go is by configuring the HTTP client's transport. Here's a basic example:

package main

import (
    "fmt"
    "io/ioutil"
    "net/http"
    "net/url"
    "time"
)

func main() {
    // Parse proxy URL
    proxyURL, err := url.Parse("http://proxy.example.com:8080")
    if err != nil {
        panic(err)
    }

    // Create a custom transport with proxy
    transport := &http.Transport{
        Proxy: http.ProxyURL(proxyURL),
        // Add timeout settings
        ResponseHeaderTimeout: 30 * time.Second,
        IdleConnTimeout:       90 * time.Second,
    }

    // Create HTTP client with proxy transport
    client := &http.Client{
        Transport: transport,
        Timeout:   60 * time.Second,
    }

    // Make request through proxy
    resp, err := client.Get("https://httpbin.org/ip")
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    body, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        panic(err)
    }

    fmt.Println(string(body))
}

Proxy Authentication

Many proxy servers require authentication. Go supports both basic authentication and more advanced authentication methods:

package main

import (
    "fmt"
    "io/ioutil"
    "net/http"
    "net/url"
    "time"
)

func createAuthenticatedProxy(proxyURL, username, password string) (*http.Client, error) {
    // Parse the proxy URL
    parsedURL, err := url.Parse(proxyURL)
    if err != nil {
        return nil, err
    }

    // Set authentication credentials
    parsedURL.User = url.UserPassword(username, password)

    // Create transport with authenticated proxy
    transport := &http.Transport{
        Proxy:                 http.ProxyURL(parsedURL),
        ResponseHeaderTimeout: 30 * time.Second,
        IdleConnTimeout:       90 * time.Second,
        MaxIdleConns:          100,
        MaxIdleConnsPerHost:   10,
    }

    return &http.Client{
        Transport: transport,
        Timeout:   60 * time.Second,
    }, nil
}

func main() {
    client, err := createAuthenticatedProxy(
        "http://proxy.example.com:8080",
        "username",
        "password",
    )
    if err != nil {
        panic(err)
    }

    resp, err := client.Get("https://httpbin.org/headers")
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    body, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        panic(err)
    }

    fmt.Println(string(body))
}

SOCKS5 Proxy Support

For SOCKS5 proxies, you'll need to use a third-party package like golang.org/x/net/proxy:

go get golang.org/x/net/proxy
package main

import (
    "fmt"
    "io/ioutil"
    "net/http"
    "net/url"
    "time"

    "golang.org/x/net/proxy"
)

func createSOCKS5Client(proxyAddr, username, password string) (*http.Client, error) {
    // Create SOCKS5 dialer
    var auth *proxy.Auth
    if username != "" && password != "" {
        auth = &proxy.Auth{
            User:     username,
            Password: password,
        }
    }

    dialer, err := proxy.SOCKS5("tcp", proxyAddr, auth, proxy.Direct)
    if err != nil {
        return nil, err
    }

    // Create transport with SOCKS5 dialer
    transport := &http.Transport{
        Dial:                  dialer.Dial,
        ResponseHeaderTimeout: 30 * time.Second,
        IdleConnTimeout:       90 * time.Second,
    }

    return &http.Client{
        Transport: transport,
        Timeout:   60 * time.Second,
    }, nil
}

func main() {
    client, err := createSOCKS5Client("127.0.0.1:1080", "user", "pass")
    if err != nil {
        panic(err)
    }

    resp, err := client.Get("https://httpbin.org/ip")
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    body, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        panic(err)
    }

    fmt.Println(string(body))
}

Proxy Rotation

For large-scale scraping operations, rotating between multiple proxies helps distribute load and avoid detection:

package main

import (
    "fmt"
    "io/ioutil"
    "math/rand"
    "net/http"
    "net/url"
    "sync"
    "time"
)

type ProxyRotator struct {
    proxies []string
    current int
    mutex   sync.Mutex
}

func NewProxyRotator(proxies []string) *ProxyRotator {
    return &ProxyRotator{
        proxies: proxies,
        current: 0,
    }
}

func (pr *ProxyRotator) GetNext() string {
    pr.mutex.Lock()
    defer pr.mutex.Unlock()

    proxy := pr.proxies[pr.current]
    pr.current = (pr.current + 1) % len(pr.proxies)
    return proxy
}

func (pr *ProxyRotator) GetRandom() string {
    pr.mutex.Lock()
    defer pr.mutex.Unlock()

    index := rand.Intn(len(pr.proxies))
    return pr.proxies[index]
}

func (pr *ProxyRotator) CreateClient() (*http.Client, error) {
    proxyURL := pr.GetNext()
    parsedURL, err := url.Parse(proxyURL)
    if err != nil {
        return nil, err
    }

    transport := &http.Transport{
        Proxy:                 http.ProxyURL(parsedURL),
        ResponseHeaderTimeout: 30 * time.Second,
        IdleConnTimeout:       90 * time.Second,
        MaxIdleConns:          100,
    }

    return &http.Client{
        Transport: transport,
        Timeout:   60 * time.Second,
    }, nil
}

func main() {
    proxies := []string{
        "http://proxy1.example.com:8080",
        "http://proxy2.example.com:8080",
        "http://proxy3.example.com:8080",
    }

    rotator := NewProxyRotator(proxies)

    // Make multiple requests with different proxies
    for i := 0; i < 5; i++ {
        client, err := rotator.CreateClient()
        if err != nil {
            fmt.Printf("Error creating client: %v\n", err)
            continue
        }

        resp, err := client.Get("https://httpbin.org/ip")
        if err != nil {
            fmt.Printf("Request failed: %v\n", err)
            continue
        }

        body, err := ioutil.ReadAll(resp.Body)
        resp.Body.Close()
        if err != nil {
            fmt.Printf("Error reading response: %v\n", err)
            continue
        }

        fmt.Printf("Request %d: %s\n", i+1, string(body))
        time.Sleep(1 * time.Second)
    }
}

Environment-Based Proxy Configuration

For production deployments, it's common to configure proxies through environment variables:

package main

import (
    "fmt"
    "io/ioutil"
    "net/http"
    "net/url"
    "os"
    "time"
)

func createClientFromEnv() (*http.Client, error) {
    var transport *http.Transport

    // Check for proxy environment variables
    httpProxy := os.Getenv("HTTP_PROXY")
    httpsProxy := os.Getenv("HTTPS_PROXY")
    noProxy := os.Getenv("NO_PROXY")

    if httpProxy != "" || httpsProxy != "" {
        // Parse proxy URLs
        var proxyFunc func(*http.Request) (*url.URL, error)

        if httpsProxy != "" {
            proxyURL, err := url.Parse(httpsProxy)
            if err != nil {
                return nil, fmt.Errorf("invalid HTTPS_PROXY: %v", err)
            }
            proxyFunc = http.ProxyURL(proxyURL)
        } else if httpProxy != "" {
            proxyURL, err := url.Parse(httpProxy)
            if err != nil {
                return nil, fmt.Errorf("invalid HTTP_PROXY: %v", err)
            }
            proxyFunc = http.ProxyURL(proxyURL)
        }

        transport = &http.Transport{
            Proxy:                 proxyFunc,
            ResponseHeaderTimeout: 30 * time.Second,
            IdleConnTimeout:       90 * time.Second,
        }

        // Handle NO_PROXY if specified
        if noProxy != "" {
            transport.Proxy = func(req *http.Request) (*url.URL, error) {
                // Simple NO_PROXY implementation
                // In production, you'd want more sophisticated parsing
                if req.URL.Host == noProxy {
                    return nil, nil
                }
                return proxyFunc(req)
            }
        }
    } else {
        // Use default transport
        transport = &http.Transport{
            ResponseHeaderTimeout: 30 * time.Second,
            IdleConnTimeout:       90 * time.Second,
        }
    }

    return &http.Client{
        Transport: transport,
        Timeout:   60 * time.Second,
    }, nil
}

func main() {
    // Set environment variables (in practice, these would be set externally)
    os.Setenv("HTTPS_PROXY", "http://proxy.example.com:8080")

    client, err := createClientFromEnv()
    if err != nil {
        panic(err)
    }

    resp, err := client.Get("https://httpbin.org/ip")
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    body, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        panic(err)
    }

    fmt.Println(string(body))
}

Error Handling and Retry Logic

When working with proxies, implementing robust error handling and retry mechanisms is crucial, especially when dealing with unreliable proxy servers:

package main

import (
    "fmt"
    "io/ioutil"
    "net/http"
    "net/url"
    "time"
)

type ProxyClient struct {
    clients []*http.Client
    current int
}

func NewProxyClient(proxies []string) (*ProxyClient, error) {
    var clients []*http.Client

    for _, proxyURL := range proxies {
        parsedURL, err := url.Parse(proxyURL)
        if err != nil {
            return nil, fmt.Errorf("invalid proxy URL %s: %v", proxyURL, err)
        }

        transport := &http.Transport{
            Proxy:                 http.ProxyURL(parsedURL),
            ResponseHeaderTimeout: 30 * time.Second,
            IdleConnTimeout:       90 * time.Second,
        }

        client := &http.Client{
            Transport: transport,
            Timeout:   60 * time.Second,
        }

        clients = append(clients, client)
    }

    return &ProxyClient{
        clients: clients,
        current: 0,
    }, nil
}

func (pc *ProxyClient) GetWithRetry(url string, maxRetries int) (*http.Response, error) {
    var lastErr error

    for attempt := 0; attempt < maxRetries; attempt++ {
        client := pc.clients[pc.current]
        pc.current = (pc.current + 1) % len(pc.clients)

        resp, err := client.Get(url)
        if err == nil && resp.StatusCode < 400 {
            return resp, nil
        }

        if err != nil {
            lastErr = err
            fmt.Printf("Attempt %d failed with error: %v\n", attempt+1, err)
        } else {
            resp.Body.Close()
            lastErr = fmt.Errorf("HTTP %d", resp.StatusCode)
            fmt.Printf("Attempt %d failed with status: %d\n", attempt+1, resp.StatusCode)
        }

        // Exponential backoff
        if attempt < maxRetries-1 {
            backoff := time.Duration(1<<uint(attempt)) * time.Second
            fmt.Printf("Waiting %v before retry...\n", backoff)
            time.Sleep(backoff)
        }
    }

    return nil, fmt.Errorf("all %d attempts failed, last error: %v", maxRetries, lastErr)
}

func main() {
    proxies := []string{
        "http://proxy1.example.com:8080",
        "http://proxy2.example.com:8080",
        "http://proxy3.example.com:8080",
    }

    proxyClient, err := NewProxyClient(proxies)
    if err != nil {
        panic(err)
    }

    resp, err := proxyClient.GetWithRetry("https://httpbin.org/ip", 3)
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    body, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        panic(err)
    }

    fmt.Println("Success:", string(body))
}

Testing Proxy Connectivity

Before using proxies in production, it's important to test their connectivity and performance:

# Test proxy connectivity
curl --proxy http://proxy.example.com:8080 https://httpbin.org/ip

# Test with authentication
curl --proxy-user username:password --proxy http://proxy.example.com:8080 https://httpbin.org/ip

# Test SOCKS5 proxy
curl --socks5 proxy.example.com:1080 https://httpbin.org/ip

Advanced Proxy Features

Connection Pooling and Performance

For high-throughput applications, proper connection pooling configuration is essential:

func createOptimizedProxy(proxyURL string) (*http.Client, error) {
    parsedURL, err := url.Parse(proxyURL)
    if err != nil {
        return nil, err
    }

    transport := &http.Transport{
        Proxy:               http.ProxyURL(parsedURL),
        MaxIdleConns:        100,
        MaxIdleConnsPerHost: 20,
        IdleConnTimeout:     90 * time.Second,
        TLSHandshakeTimeout: 10 * time.Second,
        DisableKeepAlives:   false,
        // Enable HTTP/2 support
        ForceAttemptHTTP2: true,
    }

    return &http.Client{
        Transport: transport,
        Timeout:   60 * time.Second,
    }, nil
}

Proxy Health Monitoring

Implement health checks to ensure proxy reliability:

package main

import (
    "context"
    "fmt"
    "net/http"
    "net/url"
    "sync"
    "time"
)

type HealthyProxy struct {
    URL       string
    LastCheck time.Time
    IsHealthy bool
    ErrorCount int
}

type ProxyManager struct {
    proxies []HealthyProxy
    mutex   sync.RWMutex
}

func (pm *ProxyManager) checkHealth(proxy *HealthyProxy) {
    client, err := createAuthenticatedProxy(proxy.URL, "", "")
    if err != nil {
        proxy.IsHealthy = false
        proxy.ErrorCount++
        return
    }

    ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
    defer cancel()

    req, _ := http.NewRequestWithContext(ctx, "GET", "https://httpbin.org/ip", nil)
    resp, err := client.Do(req)

    if err != nil || resp.StatusCode >= 400 {
        proxy.IsHealthy = false
        proxy.ErrorCount++
    } else {
        proxy.IsHealthy = true
        proxy.ErrorCount = 0
        resp.Body.Close()
    }

    proxy.LastCheck = time.Now()
}

func (pm *ProxyManager) GetHealthyProxy() *HealthyProxy {
    pm.mutex.RLock()
    defer pm.mutex.RUnlock()

    for i := range pm.proxies {
        if pm.proxies[i].IsHealthy {
            return &pm.proxies[i]
        }
    }
    return nil
}

Best Practices

  1. Connection Pooling: Configure MaxIdleConns and MaxIdleConnsPerHost to optimize connection reuse
  2. Timeout Configuration: Set appropriate timeouts for proxy connections to avoid hanging requests
  3. Health Checks: Regularly test proxy availability and remove failed proxies from rotation
  4. Rate Limiting: Implement rate limiting to avoid overwhelming proxy servers
  5. User-Agent Rotation: Combine proxy rotation with user-agent rotation for better anonymity
  6. Error Monitoring: Log proxy failures and monitor success rates to identify problematic proxies

Integration with Web Scraping Frameworks

When using Go web scraping libraries like Colly, you can easily integrate proxy support:

package main

import (
    "fmt"
    "net/http"
    "net/url"

    "github.com/gocolly/colly/v2"
    "github.com/gocolly/colly/v2/debug"
)

func main() {
    c := colly.NewCollector(
        colly.Debugger(&debug.LogDebugger{}),
    )

    // Configure proxy
    proxyURL, _ := url.Parse("http://proxy.example.com:8080")
    c.SetProxyFunc(http.ProxyURL(proxyURL))

    c.OnHTML("title", func(e *colly.HTMLElement) {
        fmt.Println("Title:", e.Text)
    })

    c.Visit("https://example.com")
}

WebScraping.AI Integration

For developers who prefer managed solutions, WebScraping.AI's proxy infrastructure handles proxy rotation, authentication, and geographic distribution automatically. This approach eliminates the need to maintain your own proxy infrastructure while providing enterprise-grade reliability and performance.

Conclusion

Proxy servers are invaluable for Go web scraping applications, providing anonymity, geographic flexibility, and load distribution. By implementing proper proxy handling with authentication, rotation, and error recovery, you can build robust and scalable web scraping solutions that can handle the most challenging scraping scenarios while maintaining reliability and performance.

Try WebScraping.AI for Your Web Scraping Needs

Looking for a powerful web scraping solution? WebScraping.AI provides an LLM-powered API that combines Chromium JavaScript rendering with rotating proxies for reliable data extraction.

Key Features:

  • AI-powered extraction: Ask questions about web pages or extract structured data fields
  • JavaScript rendering: Full Chromium browser support for dynamic content
  • Rotating proxies: Datacenter and residential proxies from multiple countries
  • Easy integration: Simple REST API with SDKs for Python, Ruby, PHP, and more
  • Reliable & scalable: Built for developers who need consistent results

Getting Started:

Get page content with AI analysis:

curl "https://api.webscraping.ai/ai/question?url=https://example.com&question=What is the main topic?&api_key=YOUR_API_KEY"

Extract structured data:

curl "https://api.webscraping.ai/ai/fields?url=https://example.com&fields[title]=Page title&fields[price]=Product price&api_key=YOUR_API_KEY"

Try in request builder

Related Questions

Get Started Now

WebScraping.AI provides rotating proxies, Chromium rendering and built-in HTML parser for web scraping
Icon