Can I use proxies with Colly for web scraping?
Yes. Colly supports proxies through its SetProxy and SetProxyFunc methods as well as fully custom HTTP transports. You can configure HTTP, HTTPS, and SOCKS proxies to anonymize your scraping activity, bypass IP blocks, and distribute requests across multiple IP addresses.
Setting Up Basic Proxy Configuration
HTTP/HTTPS Proxy Setup
The most straightforward way to configure a proxy in Colly is by setting the proxy URL directly on the collector:
package main
import (
"fmt"
"net/url"
"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/debug"
)
func main() {
c := colly.NewCollector(
colly.Debugger(&debug.LogDebugger{}),
)
// Configure HTTP proxy
proxyURL, err := url.Parse("http://proxy-server:8080")
if err != nil {
panic(err)
}
c.SetProxy(proxyURL.String())
c.OnHTML("title", func(e *colly.HTMLElement) {
fmt.Println("Title:", e.Text)
})
c.Visit("https://httpbin.org/ip")
}
Proxy with Authentication
For proxies requiring authentication, include credentials in the proxy URL:
func setupAuthenticatedProxy() {
c := colly.NewCollector()
// Proxy with username and password
proxyURL := "http://username:password@proxy-server:8080"
c.SetProxy(proxyURL)
c.OnHTML("body", func(e *colly.HTMLElement) {
fmt.Println("Response received through authenticated proxy")
})
c.Visit("https://httpbin.org/ip")
}
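If the username or password contains special characters such as @ or :, build the proxy URL with the standard library's url.UserPassword helper so the credentials are percent-encoded correctly. A small sketch (host and credentials are placeholders):
func buildAuthenticatedProxyURL(username, password, host string) string {
    // url.UserPassword percent-encodes special characters in the credentials
    u := &url.URL{
        Scheme: "http",
        User:   url.UserPassword(username, password),
        Host:   host, // e.g. "proxy-server:8080"
    }
    return u.String()
}
The returned string can be passed directly to c.SetProxy.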
Advanced Proxy Configuration
Custom Transport with Proxy
For more control over proxy settings, configure a custom HTTP transport:
package main
import (
"crypto/tls"
"fmt"
"net/http"
"net/url"
"time"
"github.com/gocolly/colly/v2"
)
func setupCustomProxyTransport() {
c := colly.NewCollector()
// Parse proxy URL
proxyURL, _ := url.Parse("http://proxy-server:8080")
// Create custom transport with proxy
transport := &http.Transport{
Proxy: http.ProxyURL(proxyURL),
TLSClientConfig: &tls.Config{
InsecureSkipVerify: false, // Leave false; true disables certificate verification and should only be used with trusted self-signed endpoints
},
DisableKeepAlives: false,
IdleConnTimeout: 30 * time.Second,
DisableCompression: false,
}
// Set a custom User-Agent on every request
c.OnRequest(func(r *colly.Request) {
r.Headers.Set("User-Agent", "Custom Colly Bot")
})
// Apply transport to collector
c.SetClient(&http.Client{
Transport: transport,
Timeout: 60 * time.Second,
})
c.OnHTML("title", func(e *colly.HTMLElement) {
fmt.Println("Title:", e.Text)
})
c.Visit("https://example.com")
}
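If you only need to swap the transport and otherwise want to keep Colly's default client behavior (such as its cookie jar), the collector also exposes WithTransport, which accepts any http.RoundTripper. With the transport built above, the call is simply:
c.WithTransport(transport)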
SOCKS Proxy Support
Colly supports SOCKS proxies through Go's extended networking packages:
package main
import (
"context"
"fmt"
"net"
"net/http"
"time"
"github.com/gocolly/colly/v2"
"golang.org/x/net/proxy"
)
func setupSOCKSProxy() {
c := colly.NewCollector()
// Create SOCKS5 dialer
dialer, err := proxy.SOCKS5("tcp", "127.0.0.1:1080", nil, proxy.Direct)
if err != nil {
panic(err)
}
// Create custom transport with SOCKS proxy
transport := &http.Transport{
DialContext: func(ctx context.Context, network, addr string) (net.Conn, error) {
// The proxy.Dialer interface only exposes Dial, so the request context is
// not propagated here; cancellation relies on the client timeout below
return dialer.Dial(network, addr)
},
DisableKeepAlives: false,
IdleConnTimeout: 30 * time.Second,
}
// Set custom client with SOCKS transport
c.SetClient(&http.Client{
Transport: transport,
Timeout: 60 * time.Second,
})
c.OnHTML("body", func(e *colly.HTMLElement) {
fmt.Println("Content received through SOCKS proxy")
})
c.Visit("https://httpbin.org/ip")
}
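Depending on your Go and Colly versions, a custom transport may not be necessary at all: Go's http.Transport understands the socks5 scheme, so you can often point SetProxy at a socks5:// URL directly. A minimal sketch (the address is a placeholder):
func setupSOCKSProxySimple() {
    c := colly.NewCollector()
    // The default transport routes requests through the SOCKS5 proxy
    // when the proxy URL uses the socks5 scheme
    if err := c.SetProxy("socks5://127.0.0.1:1080"); err != nil {
        panic(err)
    }
    c.Visit("https://httpbin.org/ip")
}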
Proxy Rotation and Management
Simple Proxy Rotation
Implement basic proxy rotation by cycling through a list of proxy servers:
package main
import (
"fmt"
"math/rand"
"net/url"
"time"
"github.com/gocolly/colly/v2"
)
type ProxyRotator struct {
proxies []string
current int
}
func NewProxyRotator(proxies []string) *ProxyRotator {
rand.Seed(time.Now().UnixNano())
return &ProxyRotator{
proxies: proxies,
current: 0,
}
}
func (pr *ProxyRotator) GetNext() string {
if len(pr.proxies) == 0 {
return ""
}
proxy := pr.proxies[pr.current]
pr.current = (pr.current + 1) % len(pr.proxies)
return proxy
}
func (pr *ProxyRotator) GetRandom() string {
if len(pr.proxies) == 0 {
return ""
}
return pr.proxies[rand.Intn(len(pr.proxies))]
}
func proxyRotationExample() {
// List of proxy servers
proxies := []string{
"http://proxy1.example.com:8080",
"http://proxy2.example.com:8080",
"http://proxy3.example.com:8080",
}
rotator := NewProxyRotator(proxies)
c := colly.NewCollector()
// Rotate the proxy before each request. Note: SetProxy reconfigures the
// whole collector, so this pattern only rotates cleanly when requests run
// sequentially; see the built-in round-robin switcher after this example
c.OnRequest(func(r *colly.Request) {
proxyURL := rotator.GetNext()
if proxyURL != "" {
if parsedURL, err := url.Parse(proxyURL); err == nil {
c.SetProxy(parsedURL.String())
fmt.Printf("Using proxy: %s\n", proxyURL)
}
}
})
c.OnHTML("title", func(e *colly.HTMLElement) {
fmt.Printf("Title from %s: %s\n", e.Request.URL, e.Text)
})
// Make multiple requests with different proxies
urls := []string{
"https://httpbin.org/ip",
"https://httpbin.org/user-agent",
"https://httpbin.org/headers",
}
for _, targetURL := range urls {
c.Visit(targetURL)
}
}
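Colly also ships a round-robin proxy switcher in its proxy subpackage, which selects the next proxy per request without reconfiguring the collector from inside callbacks. A minimal sketch with placeholder proxy addresses:
package main

import (
    "fmt"

    "github.com/gocolly/colly/v2"
    "github.com/gocolly/colly/v2/proxy"
)

func builtInRotationExample() {
    c := colly.NewCollector()
    // RoundRobinProxySwitcher returns a ProxyFunc that cycles through the list
    rp, err := proxy.RoundRobinProxySwitcher(
        "http://proxy1.example.com:8080",
        "http://proxy2.example.com:8080",
        "socks5://127.0.0.1:1080",
    )
    if err != nil {
        panic(err)
    }
    c.SetProxyFunc(rp)

    c.OnResponse(func(r *colly.Response) {
        // Request.ProxyURL records which proxy handled the request
        fmt.Println("Fetched", r.Request.URL, "via", r.Request.ProxyURL)
    })

    c.Visit("https://httpbin.org/ip")
}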
Advanced Proxy Manager
Create a more sophisticated proxy manager with health checking and automatic failover:
package main
import (
"fmt"
"net/http"
"net/url"
"sync"
"time"
"github.com/gocolly/colly/v2"
)
type ProxyManager struct {
proxies []ProxyInfo
healthCheck string
mu sync.RWMutex
}
type ProxyInfo struct {
URL string
IsHealthy bool
LastCheck time.Time
FailCount int
}
func NewProxyManager(proxyURLs []string) *ProxyManager {
proxies := make([]ProxyInfo, len(proxyURLs))
for i, proxyURL := range proxyURLs {
proxies[i] = ProxyInfo{
URL: proxyURL,
IsHealthy: true,
LastCheck: time.Now(),
}
}
return &ProxyManager{
proxies: proxies,
healthCheck: "https://httpbin.org/ip",
}
}
func (pm *ProxyManager) GetHealthyProxy() string {
pm.mu.RLock()
defer pm.mu.RUnlock()
for _, proxy := range pm.proxies {
if proxy.IsHealthy && proxy.FailCount < 3 {
return proxy.URL
}
}
return ""
}
func (pm *ProxyManager) MarkProxyFailed(proxyURL string) {
pm.mu.Lock()
defer pm.mu.Unlock()
for i := range pm.proxies {
if pm.proxies[i].URL == proxyURL {
pm.proxies[i].FailCount++
if pm.proxies[i].FailCount >= 3 {
pm.proxies[i].IsHealthy = false
}
break
}
}
}
func (pm *ProxyManager) CheckProxyHealth(proxyURL string) bool {
client := &http.Client{
Timeout: 10 * time.Second,
}
parsedURL, err := url.Parse(proxyURL)
if err != nil {
return false
}
client.Transport = &http.Transport{
Proxy: http.ProxyURL(parsedURL),
}
resp, err := client.Get(pm.healthCheck)
if err != nil {
return false
}
defer resp.Body.Close()
return resp.StatusCode == 200
}
func advancedProxyExample() {
proxies := []string{
"http://proxy1.example.com:8080",
"http://proxy2.example.com:8080",
"http://proxy3.example.com:8080",
}
proxyManager := NewProxyManager(proxies)
c := colly.NewCollector()
c.OnRequest(func(r *colly.Request) {
proxyURL := proxyManager.GetHealthyProxy()
if proxyURL != "" {
c.SetProxy(proxyURL)
fmt.Printf("Using proxy: %s\n", proxyURL)
}
})
c.OnError(func(r *colly.Response, err error) {
// Mark proxy as failed if request fails
// Request.ProxyURL is a string in Colly v2; empty means no proxy was used
if r.Request.ProxyURL != "" {
proxyManager.MarkProxyFailed(r.Request.ProxyURL)
}
fmt.Printf("Request failed: %v\n", err)
})
c.OnHTML("title", func(e *colly.HTMLElement) {
fmt.Printf("Success: %s\n", e.Text)
})
c.Visit("https://example.com")
}
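The CheckProxyHealth method above only helps if something calls it periodically. A rough sketch of a background checker that refreshes the health flags, assuming it lives in the same package as ProxyManager:
// StartHealthChecks re-tests every proxy on a fixed interval and updates its
// health flags so recovered proxies re-enter the rotation
func (pm *ProxyManager) StartHealthChecks(interval time.Duration) {
    go func() {
        ticker := time.NewTicker(interval)
        defer ticker.Stop()
        for range ticker.C {
            for i := range pm.proxies {
                pm.mu.RLock()
                proxyURL := pm.proxies[i].URL
                pm.mu.RUnlock()

                healthy := pm.CheckProxyHealth(proxyURL)

                pm.mu.Lock()
                pm.proxies[i].IsHealthy = healthy
                pm.proxies[i].LastCheck = time.Now()
                if healthy {
                    pm.proxies[i].FailCount = 0
                }
                pm.mu.Unlock()
            }
        }
    }()
}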
Best Practices for Proxy Usage
1. Proxy Validation and Testing
Always test your proxies before using them in production:
func validateProxy(proxyURL string) bool {
client := &http.Client{
Timeout: 10 * time.Second,
}
if parsedURL, err := url.Parse(proxyURL); err == nil {
client.Transport = &http.Transport{
Proxy: http.ProxyURL(parsedURL),
}
} else {
return false
}
resp, err := client.Get("https://httpbin.org/ip")
if err != nil {
return false
}
defer resp.Body.Close()
return resp.StatusCode == 200
}
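For example, you can run validateProxy over a candidate list at startup and keep only the proxies that pass:
// keepWorkingProxies filters a candidate list down to the proxies that
// currently respond through the test endpoint
func keepWorkingProxies(candidates []string) []string {
    working := make([]string, 0, len(candidates))
    for _, p := range candidates {
        if validateProxy(p) {
            working = append(working, p)
        }
    }
    return working
}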
2. Handling Proxy Errors
Implement proper error handling for proxy-related issues:
c.OnError(func(r *colly.Response, err error) {
fmt.Printf("Request to %s failed: %v\n", r.Request.URL, err)
// Check if it's a proxy-related error
if r.StatusCode == 407 { // Proxy Authentication Required
fmt.Println("Proxy authentication failed")
} else if r.StatusCode == 503 { // Service Unavailable
fmt.Println("Proxy server unavailable")
}
// Implement retry logic or proxy switching here
})
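One way to act on these failures is a bounded retry, ideally combined with a proxy switcher so the retried request goes out through a different proxy. A sketch that counts attempts in the request context (the "retries" key is just an illustrative name):
c.OnError(func(r *colly.Response, err error) {
    attempts, _ := r.Ctx.GetAny("retries").(int)
    if attempts < 2 {
        r.Ctx.Put("retries", attempts+1)
        // Retry re-issues the same request; with SetProxyFunc-based rotation
        // it can be routed through another proxy
        if retryErr := r.Request.Retry(); retryErr != nil {
            fmt.Printf("retry failed: %v\n", retryErr)
        }
    }
})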
3. Rate Limiting with Proxies
Even when using proxies, implement rate limiting to avoid overwhelming target servers:
c.Limit(&colly.LimitRule{
DomainGlob: "*",
Parallelism: 2,
Delay: 1 * time.Second,
})
Security Considerations
When using proxies for web scraping, consider these security aspects:
- Proxy Trust: Only use trusted proxy providers to avoid data interception
- HTTPS Verification: Be cautious with InsecureSkipVerify settings that disable certificate checks
- Credential Management: Store proxy credentials securely, not in source code (see the sketch after this list)
- Traffic Monitoring: Be aware that proxy providers can see your traffic
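For the credential management point in particular, a common pattern is to keep the full proxy URL in an environment variable and read it at startup. A minimal sketch (the variable name SCRAPER_PROXY_URL is an example; it uses os and fmt):
func proxyFromEnv(c *colly.Collector) error {
    // The proxy URL, including credentials, stays outside the source tree
    proxyURL := os.Getenv("SCRAPER_PROXY_URL") // e.g. "http://user:pass@proxy-server:8080"
    if proxyURL == "" {
        return fmt.Errorf("SCRAPER_PROXY_URL is not set")
    }
    return c.SetProxy(proxyURL)
}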
Integration with Cloud Proxy Services
Many developers use cloud-based proxy services. Here's how to integrate them with Colly:
func setupCloudProxy() {
c := colly.NewCollector()
// Example for a cloud proxy service
proxyURL := "http://username:password@rotating-residential.example.com:8000"
c.SetProxy(proxyURL)
// Set appropriate headers for cloud proxy services
c.OnRequest(func(r *colly.Request) {
r.Headers.Set("User-Agent", "Mozilla/5.0 (compatible; Bot)")
// Some services require specific headers
r.Headers.Set("X-Proxy-Session", "session_123")
})
c.Visit("https://target-website.com")
}
Conclusion
Colly's proxy support is robust and flexible, allowing you to implement everything from simple proxy usage to sophisticated proxy rotation systems. Whether you're dealing with IP blocks, need to distribute load, or require anonymity for your scraping operations, Colly's proxy capabilities can meet your needs.
For more advanced scraping scenarios involving JavaScript-heavy sites, you might also want to explore how to handle browser sessions in Puppeteer or learn about handling AJAX requests using Puppeteer for sites that require full browser automation.
Remember to always respect robots.txt files, implement appropriate delays, and follow the terms of service of the websites you're scraping, regardless of whether you're using proxies or not.