Can I use proxies with Colly for web scraping?

Yes, Colly fully supports proxy usage for web scraping through its flexible HTTP client configuration. You can configure HTTP, HTTPS, and SOCKS proxies to anonymize your scraping activities, bypass IP blocks, and distribute requests across multiple IP addresses.

Setting Up Basic Proxy Configuration

HTTP/HTTPS Proxy Setup

The most straightforward way to configure a proxy in Colly is by setting the proxy URL directly on the collector:

package main

import (
    "fmt"

    "github.com/gocolly/colly/v2"
    "github.com/gocolly/colly/v2/debug"
)

func main() {
    c := colly.NewCollector(
        colly.Debugger(&debug.LogDebugger{}),
    )

    // Configure an HTTP proxy. SetProxy parses the URL itself
    // and returns an error if it is malformed.
    if err := c.SetProxy("http://proxy-server:8080"); err != nil {
        panic(err)
    }

    // httpbin.org/ip returns JSON, so read the raw response body
    // (an OnHTML callback would never fire here)
    c.OnResponse(func(r *colly.Response) {
        fmt.Println("Response:", string(r.Body))
    })

    c.Visit("https://httpbin.org/ip")
}

Proxy with Authentication

For proxies requiring authentication, include credentials in the proxy URL:

func setupAuthenticatedProxy() {
    c := colly.NewCollector()

    // Credentials are embedded in the proxy URL: scheme://user:pass@host:port
    if err := c.SetProxy("http://username:password@proxy-server:8080"); err != nil {
        panic(err)
    }

    c.OnResponse(func(r *colly.Response) {
        fmt.Println("Response received through authenticated proxy")
    })

    c.Visit("https://httpbin.org/ip")
}
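
If the username or password contains reserved characters such as @ or :, build the URL with url.UserPassword so the credentials are percent-encoded correctly. A short sketch (the credential values are placeholders):

// Build a proxy URL with safely escaped credentials;
// url.UserPassword percent-encodes special characters for you
u := &url.URL{
    Scheme: "http",
    User:   url.UserPassword("user@example", "p@ss:word"),
    Host:   "proxy-server:8080",
}
c.SetProxy(u.String())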

Advanced Proxy Configuration

Custom Transport with Proxy

For more control over proxy settings, configure a custom HTTP transport:

package main

import (
    "crypto/tls"
    "fmt"
    "net/http"
    "net/url"
    "time"

    "github.com/gocolly/colly/v2"
)

func setupCustomProxyTransport() {
    c := colly.NewCollector()

    // Parse the proxy URL and fail fast if it is malformed
    proxyURL, err := url.Parse("http://proxy-server:8080")
    if err != nil {
        panic(err)
    }

    // Create a custom transport that routes traffic through the proxy
    transport := &http.Transport{
        Proxy: http.ProxyURL(proxyURL),
        TLSClientConfig: &tls.Config{
            InsecureSkipVerify: false, // only set to true for self-signed certificates you trust
        },
        DisableKeepAlives:  false,
        IdleConnTimeout:    30 * time.Second,
        DisableCompression: false,
    }

    // Set a custom User-Agent on every request
    c.OnRequest(func(r *colly.Request) {
        r.Headers.Set("User-Agent", "Custom Colly Bot")
    })

    // Apply the transport via a custom HTTP client
    c.SetClient(&http.Client{
        Transport: transport,
        Timeout:   60 * time.Second,
    })

    c.OnHTML("title", func(e *colly.HTMLElement) {
        fmt.Println("Title:", e.Text)
    })

    c.Visit("https://example.com")
}
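
If you only need to replace the transport rather than the whole client, Colly also exposes WithTransport:

// Equivalent when only the transport changes; Colly keeps
// managing its own client and timeout settings
c.WithTransport(transport)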

SOCKS Proxy Support

Colly supports SOCKS proxies via a custom transport built on the golang.org/x/net/proxy package:

package main

import (
    "context"
    "fmt"
    "net"
    "net/http"
    "time"

    "github.com/gocolly/colly/v2"
    "golang.org/x/net/proxy"
)

func setupSOCKSProxy() {
    c := colly.NewCollector()

    // Create a SOCKS5 dialer
    dialer, err := proxy.SOCKS5("tcp", "127.0.0.1:1080", nil, proxy.Direct)
    if err != nil {
        panic(err)
    }

    // Create a custom transport that dials through the SOCKS proxy
    transport := &http.Transport{
        DialContext: func(ctx context.Context, network, addr string) (net.Conn, error) {
            // the basic x/net/proxy dialer is not context-aware,
            // so ctx is intentionally ignored here
            return dialer.Dial(network, addr)
        },
        DisableKeepAlives: false,
        IdleConnTimeout:   30 * time.Second,
    }

    // Set a custom client with the SOCKS transport
    c.SetClient(&http.Client{
        Transport: transport,
        Timeout:   60 * time.Second,
    })

    // httpbin.org/ip returns JSON, so use OnResponse rather than OnHTML
    c.OnResponse(func(r *colly.Response) {
        fmt.Println("Content received through SOCKS proxy:", string(r.Body))
    })

    c.Visit("https://httpbin.org/ip")
}
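
A custom transport is often unnecessary, though: SetProxy also accepts socks5:// URLs directly, since Go's http.Transport understands that proxy scheme:

// Shorthand: Colly parses the socks5:// URL and net/http dials through it
c := colly.NewCollector()
if err := c.SetProxy("socks5://127.0.0.1:1080"); err != nil {
    panic(err)
}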

Proxy Rotation and Management

Simple Proxy Rotation

Implement basic proxy rotation by cycling through a list of proxy servers:

package main

import (
    "fmt"
    "math/rand"
    "net/http"
    "net/url"
    "sync"

    "github.com/gocolly/colly/v2"
)

type ProxyRotator struct {
    proxies []string
    current int
    mu      sync.Mutex // guards current when requests run in parallel
}

func NewProxyRotator(proxies []string) *ProxyRotator {
    // The global math/rand source is seeded automatically as of Go 1.20,
    // so the deprecated rand.Seed call is no longer needed
    return &ProxyRotator{proxies: proxies}
}

func (pr *ProxyRotator) GetNext() string {
    pr.mu.Lock()
    defer pr.mu.Unlock()
    if len(pr.proxies) == 0 {
        return ""
    }
    proxy := pr.proxies[pr.current]
    pr.current = (pr.current + 1) % len(pr.proxies)
    return proxy
}

func (pr *ProxyRotator) GetRandom() string {
    if len(pr.proxies) == 0 {
        return ""
    }
    return pr.proxies[rand.Intn(len(pr.proxies))]
}

func proxyRotationExample() {
    // List of proxy servers
    proxies := []string{
        "http://proxy1.example.com:8080",
        "http://proxy2.example.com:8080",
        "http://proxy3.example.com:8080",
    }

    rotator := NewProxyRotator(proxies)

    c := colly.NewCollector()

    // A ProxyFunc is consulted once per request; this is the reliable
    // way to rotate, since calling SetProxy from an OnRequest callback
    // mutates collector-wide state and is racy under parallelism
    c.SetProxyFunc(func(r *http.Request) (*url.URL, error) {
        proxyURL := rotator.GetNext()
        if proxyURL == "" {
            return nil, nil // empty pool: connect directly
        }
        fmt.Printf("Using proxy: %s\n", proxyURL)
        return url.Parse(proxyURL)
    })

    c.OnHTML("title", func(e *colly.HTMLElement) {
        fmt.Printf("Title from %s: %s\n", e.Request.URL, e.Text)
    })

    // Make multiple requests with different proxies
    urls := []string{
        "https://httpbin.org/ip",
        "https://httpbin.org/user-agent",
        "https://httpbin.org/headers",
    }

    for _, targetURL := range urls {
        c.Visit(targetURL)
    }
}
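
For plain round-robin rotation you do not need a custom rotator at all; Colly ships one in its proxy subpackage:

package main

import (
    "github.com/gocolly/colly/v2"
    "github.com/gocolly/colly/v2/proxy"
)

func builtinRotationExample() {
    c := colly.NewCollector()

    // RoundRobinProxySwitcher cycles through the given proxies
    // and accepts both http:// and socks5:// URLs
    rp, err := proxy.RoundRobinProxySwitcher(
        "http://proxy1.example.com:8080",
        "http://proxy2.example.com:8080",
    )
    if err != nil {
        panic(err)
    }
    c.SetProxyFunc(rp)

    c.Visit("https://httpbin.org/ip")
}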

Advanced Proxy Manager

Create a more sophisticated proxy manager with health checking and automatic failover:

package main

import (
    "context"
    "fmt"
    "net/http"
    "net/url"
    "sync"
    "time"

    "github.com/gocolly/colly/v2"
)

type ProxyManager struct {
    proxies     []ProxyInfo
    healthCheck string
    mu          sync.RWMutex
}

type ProxyInfo struct {
    URL       string
    IsHealthy bool
    LastCheck time.Time
    FailCount int
}

func NewProxyManager(proxyURLs []string) *ProxyManager {
    proxies := make([]ProxyInfo, len(proxyURLs))
    for i, proxyURL := range proxyURLs {
        proxies[i] = ProxyInfo{
            URL:       proxyURL,
            IsHealthy: true,
            LastCheck: time.Now(),
        }
    }

    return &ProxyManager{
        proxies:     proxies,
        healthCheck: "https://httpbin.org/ip",
    }
}

func (pm *ProxyManager) GetHealthyProxy() string {
    pm.mu.RLock()
    defer pm.mu.RUnlock()

    for _, proxy := range pm.proxies {
        if proxy.IsHealthy && proxy.FailCount < 3 {
            return proxy.URL
        }
    }
    return ""
}

func (pm *ProxyManager) MarkProxyFailed(proxyURL string) {
    pm.mu.Lock()
    defer pm.mu.Unlock()

    for i := range pm.proxies {
        if pm.proxies[i].URL == proxyURL {
            pm.proxies[i].FailCount++
            if pm.proxies[i].FailCount >= 3 {
                pm.proxies[i].IsHealthy = false
            }
            break
        }
    }
}

func (pm *ProxyManager) CheckProxyHealth(proxyURL string) bool {
    parsedURL, err := url.Parse(proxyURL)
    if err != nil {
        // an unparseable proxy URL can never be healthy; falling through
        // here would check the health endpoint without any proxy at all
        return false
    }

    client := &http.Client{
        Timeout: 10 * time.Second,
        Transport: &http.Transport{
            Proxy: http.ProxyURL(parsedURL),
        },
    }

    resp, err := client.Get(pm.healthCheck)
    if err != nil {
        return false
    }
    defer resp.Body.Close()

    return resp.StatusCode == http.StatusOK
}

func advancedProxyExample() {
    proxies := []string{
        "http://proxy1.example.com:8080",
        "http://proxy2.example.com:8080",
        "http://proxy3.example.com:8080",
    }

    proxyManager := NewProxyManager(proxies)

    c := colly.NewCollector()

    // Pick a healthy proxy per request. Storing the chosen URL under
    // colly.ProxyURLKey makes it available later as r.Request.ProxyURL,
    // mirroring what Colly's built-in proxy switcher does.
    c.SetProxyFunc(func(req *http.Request) (*url.URL, error) {
        proxyURL := proxyManager.GetHealthyProxy()
        if proxyURL == "" {
            return nil, nil // no healthy proxy left: connect directly
        }
        fmt.Printf("Using proxy: %s\n", proxyURL)
        ctx := context.WithValue(req.Context(), colly.ProxyURLKey, proxyURL)
        *req = *req.WithContext(ctx)
        return url.Parse(proxyURL)
    })

    c.OnError(func(r *colly.Response, err error) {
        // Mark the proxy as failed (Request.ProxyURL is a string,
        // empty when the request went out without a proxy)
        if r.Request.ProxyURL != "" {
            proxyManager.MarkProxyFailed(r.Request.ProxyURL)
        }
        fmt.Printf("Request failed: %v\n", err)
    })

    c.OnHTML("title", func(e *colly.HTMLElement) {
        fmt.Printf("Success: %s\n", e.Text)
    })

    c.Visit("https://example.com")
}
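
CheckProxyHealth is defined above but never wired up. A minimal sketch of a background recheck loop, assuming a fixed interval and a reset-on-success policy (both are tuning choices, not requirements):

// Periodically recheck every proxy and give recovered ones a second chance.
// Start it in the background with: go proxyManager.StartHealthChecks(5 * time.Minute)
func (pm *ProxyManager) StartHealthChecks(interval time.Duration) {
    for range time.Tick(interval) {
        for i := range pm.proxies {
            pm.mu.RLock()
            proxyURL := pm.proxies[i].URL
            pm.mu.RUnlock()

            // network I/O happens outside the lock
            healthy := pm.CheckProxyHealth(proxyURL)

            pm.mu.Lock()
            pm.proxies[i].IsHealthy = healthy
            pm.proxies[i].LastCheck = time.Now()
            if healthy {
                pm.proxies[i].FailCount = 0 // reset the failure budget on recovery
            }
            pm.mu.Unlock()
        }
    }
}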

Best Practices for Proxy Usage

1. Proxy Validation and Testing

Always test your proxies before using them in production:

func validateProxy(proxyURL string) bool {
    client := &http.Client{
        Timeout: 10 * time.Second,
    }

    if parsedURL, err := url.Parse(proxyURL); err == nil {
        client.Transport = &http.Transport{
            Proxy: http.ProxyURL(parsedURL),
        }
    } else {
        return false
    }

    resp, err := client.Get("https://httpbin.org/ip")
    if err != nil {
        return false
    }
    defer resp.Body.Close()

    return resp.StatusCode == 200
}
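
A usage sketch that filters a candidate list down to working proxies concurrently, reusing validateProxy above (the 10-second timeout inside it bounds the total wait):

// Validate a batch of proxies in parallel and keep only the working ones
func filterWorkingProxies(candidates []string) []string {
    var (
        mu      sync.Mutex
        wg      sync.WaitGroup
        working []string
    )

    for _, p := range candidates {
        wg.Add(1)
        go func(proxyURL string) {
            defer wg.Done()
            if validateProxy(proxyURL) {
                mu.Lock()
                working = append(working, proxyURL)
                mu.Unlock()
            }
        }(p)
    }

    wg.Wait()
    return working
}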

2. Handling Proxy Errors

Implement proper error handling for proxy-related issues:

c.OnError(func(r *colly.Response, err error) {
    fmt.Printf("Request to %s failed: %v\n", r.Request.URL, err)

    // Check if it's a proxy-related error
    if r.StatusCode == 407 { // Proxy Authentication Required
        fmt.Println("Proxy authentication failed")
    } else if r.StatusCode == 503 { // Service Unavailable
        fmt.Println("Proxy server unavailable")
    }

    // Implement retry logic or proxy switching here
})
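
One way to implement that retry, assuming a rotating ProxyFunc like the one shown earlier so each retry goes out through a different proxy; a counter in the request context prevents infinite loops:

c.OnError(func(r *colly.Response, err error) {
    // Track per-request retries in the request context
    retries, _ := r.Ctx.GetAny("retries").(int)
    if retries < 3 {
        r.Ctx.Put("retries", retries+1)
        fmt.Printf("Retrying %s (attempt %d): %v\n", r.Request.URL, retries+1, err)
        r.Request.Retry() // re-issues the request through the ProxyFunc
        return
    }
    fmt.Printf("Giving up on %s after %d retries\n", r.Request.URL, retries)
})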

3. Rate Limiting with Proxies

Even when using proxies, implement rate limiting to avoid overwhelming target servers:

c.Limit(&colly.LimitRule{
    DomainGlob:  "*",
    Parallelism: 2,
    Delay:       1 * time.Second,
})
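
Note that Parallelism only matters when the collector runs asynchronously; a complete setup looks like this:

c := colly.NewCollector(colly.Async(true))

// Limit returns an error for invalid rules, worth checking once at setup
if err := c.Limit(&colly.LimitRule{
    DomainGlob:  "*",
    Parallelism: 2,
    RandomDelay: 1 * time.Second, // random delay up to 1s is harder to fingerprint
}); err != nil {
    panic(err)
}

c.Visit("https://httpbin.org/ip")
c.Wait() // required with Async: block until all queued requests finish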

Security Considerations

When using proxies for web scraping, consider these security aspects:

  1. Proxy Trust: Only use trusted proxy providers to avoid data interception
  2. HTTPS Verification: Be cautious with InsecureSkipVerify settings
  3. Credential Management: Store proxy credentials securely, not in source code (see the sketch below this list)
  4. Traffic Monitoring: Be aware that proxy providers can see your traffic
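
Expanding on point 3, a minimal sketch that assembles the proxy URL from environment variables (the names PROXY_USER, PROXY_PASS, and PROXY_HOST are assumptions, not a convention Colly knows about):

package main

import (
    "fmt"
    "net/url"
    "os"
)

// proxyFromEnv reads proxy credentials from the environment instead of
// hard-coding them in source. The variable names here are hypothetical.
func proxyFromEnv() (string, error) {
    user := os.Getenv("PROXY_USER")
    pass := os.Getenv("PROXY_PASS")
    host := os.Getenv("PROXY_HOST") // e.g. "proxy-server:8080"
    if user == "" || pass == "" || host == "" {
        return "", fmt.Errorf("proxy credentials not configured")
    }

    u := &url.URL{
        Scheme: "http",
        User:   url.UserPassword(user, pass), // escapes special characters
        Host:   host,
    }
    return u.String(), nil
}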

Integration with Cloud Proxy Services

Many developers use cloud-based proxy services. Here's how to integrate them with Colly:

func setupCloudProxy() {
    c := colly.NewCollector()

    // Example endpoint for a rotating residential proxy service
    proxyURL := "http://username:password@rotating-residential.example.com:8000"
    if err := c.SetProxy(proxyURL); err != nil {
        panic(err)
    }

    // Set appropriate headers for cloud proxy services
    c.OnRequest(func(r *colly.Request) {
        r.Headers.Set("User-Agent", "Mozilla/5.0 (compatible; Bot)")
        // Some services control sessions via provider-specific headers;
        // this header name is illustrative, check your provider's docs
        r.Headers.Set("X-Proxy-Session", "session_123")
    })

    c.Visit("https://target-website.com")
}

Conclusion

Colly's proxy support is robust and flexible, allowing you to implement everything from simple proxy usage to sophisticated proxy rotation systems. Whether you're dealing with IP blocks, need to distribute load, or require anonymity for your scraping operations, Colly's proxy capabilities can meet your needs.

For more advanced scraping scenarios involving JavaScript-heavy sites, you might also want to explore how to handle browser sessions in Puppeteer or learn about handling AJAX requests using Puppeteer for sites that require full browser automation.

Remember to always respect robots.txt files, implement appropriate delays, and follow the terms of service of the websites you're scraping, regardless of whether you're using proxies or not.

Try WebScraping.AI for Your Web Scraping Needs

Looking for a powerful web scraping solution? WebScraping.AI provides an LLM-powered API that combines Chromium JavaScript rendering with rotating proxies for reliable data extraction.

Key Features:

  • AI-powered extraction: Ask questions about web pages or extract structured data fields
  • JavaScript rendering: Full Chromium browser support for dynamic content
  • Rotating proxies: Datacenter and residential proxies from multiple countries
  • Easy integration: Simple REST API with SDKs for Python, Ruby, PHP, and more
  • Reliable & scalable: Built for developers who need consistent results

Getting Started:

Get page content with AI analysis:

curl "https://api.webscraping.ai/ai/question?url=https://example.com&question=What is the main topic?&api_key=YOUR_API_KEY"

Extract structured data:

curl "https://api.webscraping.ai/ai/fields?url=https://example.com&fields[title]=Page title&fields[price]=Product price&api_key=YOUR_API_KEY"
