How do I handle redirects in Colly?

Colly is a popular web scraping framework for Go. Handling redirects in Colly is straightforward, since Colly builds on Go's net/http client and inherits its built-in redirect handling.

By default, Colly follows up to 10 redirects before stopping (the default limit of Go's http.Client). However, you can customize this behavior according to your scraping needs. Here's how you can handle redirects in Colly:

1. Set the Maximum Number of Redirects

You can limit the number of redirects Colly will follow by assigning the Collector's RedirectHandler, a callback with the same signature as http.Client's CheckRedirect (colly v2 assigns it via c.SetRedirectHandler). Returning an error from the callback aborts the redirect chain:

package main

import (
    "fmt"
    "net/http"

    "github.com/gocolly/colly"
)

func main() {
    c := colly.NewCollector()

    // Stop following redirects once the chain reaches 5 hops.
    // via holds the requests already made in the current chain.
    c.RedirectHandler = func(req *http.Request, via []*http.Request) error {
        if len(via) >= 5 {
            return fmt.Errorf("too many redirects (%d)", len(via))
        }
        return nil
    }

    c.OnRequest(func(r *colly.Request) {
        fmt.Println("Visiting", r.URL)
    })

    c.OnResponse(func(r *colly.Response) {
        fmt.Println("Visited", r.Request.URL)
    })

    c.OnError(func(r *colly.Response, err error) {
        fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
    })

    // Start scraping
    c.Visit("http://httpbin.org/redirect/1")
}

2. Disable Redirect Handling

If you do not want Colly to follow redirects at all, you can disable redirect following by assigning a redirect handler that returns http.ErrUseLastResponse:

package main

import (
    "fmt"
    "net/http"

    "github.com/gocolly/colly"
)

func main() {
    c := colly.NewCollector(
        // Hand non-2xx responses (including the 3xx we now keep) to
        // OnResponse instead of treating them as errors.
        colly.ParseHTTPErrorResponse(),
    )

    // Returning http.ErrUseLastResponse tells the underlying HTTP
    // client to return the redirect response itself rather than
    // follow it.
    c.RedirectHandler = func(req *http.Request, via []*http.Request) error {
        return http.ErrUseLastResponse
    }

    c.OnRequest(func(r *colly.Request) {
        fmt.Println("Visiting", r.URL)
    })

    c.OnResponse(func(r *colly.Response) {
        fmt.Println("Visited", r.Request.URL)
    })

    // Start scraping
    c.Visit("http://httpbin.org/redirect/1")
}

In the code above, the redirect callback always returns http.ErrUseLastResponse, which tells the underlying HTTP client to stop and hand back the most recent response, i.e. the 3xx itself, with no error. Colly therefore makes the initial request and then stops, regardless of whether the server sends a redirect response. Because Colly normally routes non-2xx status codes to OnError, create the collector with colly.ParseHTTPErrorResponse() if you want the redirect response delivered to OnResponse.

3. Handling Redirects Manually

You can also handle redirects manually. Disable automatic following, then check the response status code in OnResponse: if it is in the 3xx range, resolve the Location header and issue a new request yourself:

package main

import (
    "fmt"
    "net/http"

    "github.com/gocolly/colly"
)

func main() {
    // Redirect responses only reach OnResponse if automatic following
    // is disabled and non-2xx responses are parsed rather than treated
    // as errors.
    c := colly.NewCollector(
        colly.ParseHTTPErrorResponse(),
    )
    c.RedirectHandler = func(req *http.Request, via []*http.Request) error {
        return http.ErrUseLastResponse
    }

    c.OnResponse(func(r *colly.Response) {
        if r.StatusCode >= 300 && r.StatusCode < 400 {
            // Get the redirect location from the headers
            location := r.Headers.Get("Location")
            fmt.Printf("Redirecting to: %s\n", location)
            // Visit the location
            err := c.Visit(r.Request.AbsoluteURL(location))
            if err != nil {
                fmt.Printf("Redirect failed: %s\n", err.Error())
            }
        } else {
            fmt.Println("Visited", r.Request.URL)
        }
    })

    // Start scraping
    c.Visit("http://httpbin.org/redirect/1")
}

Remember that when you manually handle redirects, you are responsible for preventing infinite redirect loops. You should keep track of the URLs visited and the number of redirects followed to avoid such issues.

By adjusting Colly's redirect handling settings, you can ensure that your web scraper behaves exactly as needed when encountering redirects during the scraping process.
