How do I handle redirects in Colly?

Colly is a powerful web scraping framework for Go that provides built-in redirect handling capabilities. By default, Colly automatically follows up to 10 redirects (the default limit of Go's underlying http.Client), but you can customize this behavior for different scraping scenarios.

Default Redirect Behavior

Colly handles redirects automatically, following the standard HTTP redirect status codes (301, 302, 303, 307, and 308). Here's a basic example:

package main

import (
    "fmt"
    "github.com/gocolly/colly/v2"
)

func main() {
    c := colly.NewCollector()

    c.OnRequest(func(r *colly.Request) {
        fmt.Println("Visiting:", r.URL)
    })

    c.OnResponse(func(r *colly.Response) {
        fmt.Printf("Final URL: %s (Status: %d)\n", r.Request.URL, r.StatusCode)
    })

    c.Visit("https://httpbin.org/redirect/3")
}

1. Configure Maximum Redirects

Control the number of redirects Colly will follow to prevent excessive redirections:

package main

import (
    "fmt"
    "github.com/gocolly/colly/v2"
)

func main() {
    // Limit redirects to 3
    c := colly.NewCollector()
    c.Limit(&colly.LimitRule{
        DomainGlob:  "*",
        Parallelism: 2,
    })

    // Set maximum redirects
    c.SetRedirectHandler(func(req *http.Request, via []*http.Request) error {
        if len(via) >= 3 {
            return fmt.Errorf("too many redirects")
        }
        return nil
    })

    c.OnRequest(func(r *colly.Request) {
        fmt.Println("Requesting:", r.URL)
    })

    c.OnResponse(func(r *colly.Response) {
        fmt.Printf("Success: %s\n", r.Request.URL)
    })

    c.OnError(func(r *colly.Response, err error) {
        fmt.Printf("Error: %s - %v\n", r.Request.URL, err)
    })

    c.Visit("https://httpbin.org/redirect/5") // Will fail after 3 redirects
}

2. Disable Redirect Following

To capture redirect responses without following them:

package main

import (
    "fmt"
    "github.com/gocolly/colly/v2"
    "net/http"
)

func main() {
    c := colly.NewCollector()

    // Disable automatic redirects
    c.SetRedirectHandler(func(req *http.Request, via []*http.Request) error {
        return http.ErrUseLastResponse
    })

    c.OnResponse(func(r *colly.Response) {
        if r.StatusCode >= 300 && r.StatusCode < 400 {
            location := r.Headers.Get("Location")
            fmt.Printf("Redirect detected: %d -> %s\n", r.StatusCode, location)
        } else {
            fmt.Printf("Final response: %d\n", r.StatusCode)
        }
    })

    c.Visit("https://httpbin.org/redirect/1")
}

3. Manual Redirect Handling with Loop Prevention

For complete control over redirects with safety mechanisms:

package main

import (
    "fmt"
    "github.com/gocolly/colly/v2"
    "net/http"
    "net/url"
    "strings"
)

type RedirectTracker struct {
    visited     map[string]bool
    redirects   int
    maxRedirects int
}

func NewRedirectTracker(maxRedirects int) *RedirectTracker {
    return &RedirectTracker{
        visited:     make(map[string]bool),
        maxRedirects: maxRedirects,
    }
}

func (rt *RedirectTracker) ShouldFollow(currentURL, redirectURL string) bool {
    // Check for infinite loops
    if rt.visited[redirectURL] {
        fmt.Printf("Loop detected: %s already visited\n", redirectURL)
        return false
    }

    // Check redirect limit
    if rt.redirects >= rt.maxRedirects {
        fmt.Printf("Max redirects (%d) reached\n", rt.maxRedirects)
        return false
    }

    return true
}

func main() {
    c := colly.NewCollector()
    tracker := NewRedirectTracker(5)

    // Disable automatic redirects
    c.SetRedirectHandler(func(req *http.Request, via []*http.Request) error {
        return http.ErrUseLastResponse
    })

    c.OnResponse(func(r *colly.Response) {
        currentURL := r.Request.URL.String()
        tracker.visited[currentURL] = true

        if r.StatusCode >= 300 && r.StatusCode < 400 {
            location := r.Headers.Get("Location")
            if location == "" {
                fmt.Println("Redirect without Location header")
                return
            }

            // Resolve relative URLs
            redirectURL, err := url.Parse(location)
            if err != nil {
                fmt.Printf("Invalid redirect URL: %s\n", location)
                return
            }

            if !redirectURL.IsAbs() {
                redirectURL = r.Request.URL.ResolveReference(redirectURL)
            }

            finalURL := redirectURL.String()
            fmt.Printf("Redirect %d: %s -> %s\n", tracker.redirects+1, currentURL, finalURL)

            if tracker.ShouldFollow(currentURL, finalURL) {
                tracker.redirects++
                err := c.Visit(finalURL)
                if err != nil {
                    fmt.Printf("Failed to follow redirect: %v\n", err)
                }
            }
        } else {
            fmt.Printf("Final destination: %s (Status: %d)\n", currentURL, r.StatusCode)
            // Process the final page content here
            fmt.Printf("Content length: %d bytes\n", len(r.Body))
        }
    })

    c.OnError(func(r *colly.Response, err error) {
        fmt.Printf("Request failed: %s - %v\n", r.Request.URL, err)
    })

    c.Visit("https://httpbin.org/redirect/3")
}

4. Tracking Redirect Chain

Monitor the complete redirect path for debugging or analytics:

package main

import (
    "fmt"
    "github.com/gocolly/colly/v2"
    "net/http"
)

func main() {
    c := colly.NewCollector()
    var redirectChain []string

    c.SetRedirectHandler(func(req *http.Request, via []*http.Request) error {
        // Record each step in the redirect chain
        redirectChain = append(redirectChain, req.URL.String())

        if len(via) >= 10 {
            return fmt.Errorf("stopped after 10 redirects")
        }
        return nil
    })

    c.OnRequest(func(r *colly.Request) {
        // Reset chain for new requests
        redirectChain = []string{r.URL.String()}
    })

    c.OnResponse(func(r *colly.Response) {
        fmt.Println("Redirect chain:")
        for i, url := range redirectChain {
            fmt.Printf("  %d. %s\n", i+1, url)
        }
        fmt.Printf("Final status: %d\n", r.StatusCode)
    })

    c.Visit("https://httpbin.org/redirect/4")
}

Best Practices

  1. Set reasonable redirect limits to prevent infinite loops and excessive resource usage
  2. Handle relative URLs properly by resolving them against the current URL
  3. Track visited URLs to detect and prevent redirect loops
  4. Log redirect chains for debugging and monitoring
  5. Handle edge cases like missing Location headers or malformed URLs
  6. Consider performance when following many redirects in high-volume scraping

Common Use Cases

  • SEO analysis: Track redirect chains to identify broken or excessive redirects
  • Content scraping: Follow redirects to reach the final content URL
  • Link validation: Verify that URLs resolve to expected destinations
  • Security scanning: Detect suspicious redirect patterns that might indicate attacks

By implementing proper redirect handling, you can build more robust and reliable web scrapers that handle real-world scenarios effectively.

Related Questions

Get Started Now

WebScraping.AI provides rotating proxies, Chromium rendering, and a built-in HTML parser for web scraping.