Colly is a powerful web scraping framework for Go that provides built-in redirect handling capabilities. By default, Colly automatically follows up to 10 redirects (the limit imposed by Go's underlying net/http client), but you can customize this behavior for different scraping scenarios.
Default Redirect Behavior
Colly handles redirects automatically, following the standard HTTP redirect status codes (301, 302, 303, 307, 308). Here's a basic example:
package main

import (
	"fmt"

	"github.com/gocolly/colly/v2"
)

// main demonstrates Colly's default redirect behavior: the underlying
// net/http client transparently follows up to 10 redirects, so OnResponse
// only fires for the final, non-redirect response.
func main() {
	c := colly.NewCollector()

	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting:", r.URL)
	})

	c.OnResponse(func(r *colly.Response) {
		// r.Request.URL is the URL after all redirects were followed.
		fmt.Printf("Final URL: %s (Status: %d)\n", r.Request.URL, r.StatusCode)
	})

	// Visit can fail (DNS error, filtered URL, ...); don't ignore the error.
	if err := c.Visit("https://httpbin.org/redirect/3"); err != nil {
		fmt.Println("Visit failed:", err)
	}
}
1. Configure Maximum Redirects
Control the number of redirects Colly will follow to prevent excessive redirections:
package main

import (
	"fmt"
	"net/http" // required: the redirect handler receives *http.Request values

	"github.com/gocolly/colly/v2"
)

// main caps redirect-following at 3 hops by installing a custom redirect
// handler; requests that redirect more often are aborted and reach OnError.
func main() {
	c := colly.NewCollector()

	// Throttle concurrency per domain. Note: this limits parallelism,
	// not redirects — the redirect cap is set below.
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Parallelism: 2,
	}); err != nil {
		fmt.Println("Limit rule error:", err)
	}

	// The handler is consulted before each redirect is followed; `via`
	// holds the requests made so far, so len(via) is the hop count.
	c.SetRedirectHandler(func(req *http.Request, via []*http.Request) error {
		if len(via) >= 3 {
			return fmt.Errorf("too many redirects")
		}
		return nil
	})

	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Requesting:", r.URL)
	})

	c.OnResponse(func(r *colly.Response) {
		fmt.Printf("Success: %s\n", r.Request.URL)
	})

	c.OnError(func(r *colly.Response, err error) {
		fmt.Printf("Error: %s - %v\n", r.Request.URL, err)
	})

	// httpbin issues 5 redirects, so this request stops after 3 and errors.
	if err := c.Visit("https://httpbin.org/redirect/5"); err != nil {
		fmt.Println("Visit failed:", err)
	}
}
2. Disable Redirect Following
To capture redirect responses without following them:
package main

import (
	"fmt"
	"github.com/gocolly/colly/v2"
	"net/http"
)

// main inspects redirect responses instead of following them: returning
// http.ErrUseLastResponse from the redirect handler makes the HTTP client
// hand the 3xx response itself back to Colly.
func main() {
	collector := colly.NewCollector()

	// Turn off automatic redirect following.
	collector.SetRedirectHandler(func(req *http.Request, via []*http.Request) error {
		return http.ErrUseLastResponse
	})

	collector.OnResponse(func(resp *colly.Response) {
		isRedirect := resp.StatusCode >= 300 && resp.StatusCode < 400
		if !isRedirect {
			fmt.Printf("Final response: %d\n", resp.StatusCode)
			return
		}
		// For 3xx responses, the Location header names the redirect target.
		target := resp.Headers.Get("Location")
		fmt.Printf("Redirect detected: %d -> %s\n", resp.StatusCode, target)
	})

	collector.Visit("https://httpbin.org/redirect/1")
}
3. Manual Redirect Handling with Loop Prevention
For complete control over redirects with safety mechanisms:
package main

import (
	"fmt"
	"net/http"
	"net/url"

	"github.com/gocolly/colly/v2"
)

// RedirectTracker records which URLs have been fetched and how many
// redirects have been followed, so manual redirect handling can bail out
// on loops and on overly long chains.
type RedirectTracker struct {
	visited      map[string]bool // URLs already fetched in this chain
	redirects    int             // redirects followed so far
	maxRedirects int             // hard cap on chain length
}

// NewRedirectTracker returns a tracker that allows at most maxRedirects hops.
func NewRedirectTracker(maxRedirects int) *RedirectTracker {
	return &RedirectTracker{
		visited:      make(map[string]bool),
		maxRedirects: maxRedirects,
	}
}

// ShouldFollow reports whether the redirect from currentURL to redirectURL
// should be followed. It refuses targets that were already visited (loop
// prevention) and chains longer than maxRedirects.
func (rt *RedirectTracker) ShouldFollow(currentURL, redirectURL string) bool {
	if rt.visited[redirectURL] {
		fmt.Printf("Loop detected: %s already visited\n", redirectURL)
		return false
	}
	if rt.redirects >= rt.maxRedirects {
		fmt.Printf("Max redirects (%d) reached\n", rt.maxRedirects)
		return false
	}
	return true
}

func main() {
	c := colly.NewCollector()
	tracker := NewRedirectTracker(5)

	// Disable automatic redirects so every 3xx response reaches OnResponse.
	c.SetRedirectHandler(func(req *http.Request, via []*http.Request) error {
		return http.ErrUseLastResponse
	})

	c.OnResponse(func(r *colly.Response) {
		currentURL := r.Request.URL.String()
		tracker.visited[currentURL] = true

		if r.StatusCode >= 300 && r.StatusCode < 400 {
			location := r.Headers.Get("Location")
			if location == "" {
				fmt.Println("Redirect without Location header")
				return
			}
			// Resolve relative Location values against the current URL.
			redirectURL, err := url.Parse(location)
			if err != nil {
				fmt.Printf("Invalid redirect URL: %s\n", location)
				return
			}
			if !redirectURL.IsAbs() {
				redirectURL = r.Request.URL.ResolveReference(redirectURL)
			}
			finalURL := redirectURL.String()
			fmt.Printf("Redirect %d: %s -> %s\n", tracker.redirects+1, currentURL, finalURL)

			if tracker.ShouldFollow(currentURL, finalURL) {
				tracker.redirects++
				if err := c.Visit(finalURL); err != nil {
					fmt.Printf("Failed to follow redirect: %v\n", err)
				}
			}
		} else {
			fmt.Printf("Final destination: %s (Status: %d)\n", currentURL, r.StatusCode)
			// Process the final page content here
			fmt.Printf("Content length: %d bytes\n", len(r.Body))
		}
	})

	c.OnError(func(r *colly.Response, err error) {
		fmt.Printf("Request failed: %s - %v\n", r.Request.URL, err)
	})

	if err := c.Visit("https://httpbin.org/redirect/3"); err != nil {
		fmt.Printf("Failed to follow redirect: %v\n", err)
	}
}
4. Tracking Redirect Chain
Monitor the complete redirect path for debugging or analytics:
package main

import (
	"fmt"
	"github.com/gocolly/colly/v2"
	"net/http"
)

// main records every hop of a redirect chain for debugging/analytics and
// prints the full path once the final response arrives.
func main() {
	collector := colly.NewCollector()

	var chain []string

	collector.SetRedirectHandler(func(req *http.Request, via []*http.Request) error {
		// Append each hop just before the client follows it.
		chain = append(chain, req.URL.String())
		if len(via) >= 10 {
			return fmt.Errorf("stopped after 10 redirects")
		}
		return nil
	})

	collector.OnRequest(func(r *colly.Request) {
		// A new top-level request starts a fresh chain.
		chain = []string{r.URL.String()}
	})

	collector.OnResponse(func(r *colly.Response) {
		fmt.Println("Redirect chain:")
		for idx, hop := range chain {
			fmt.Printf(" %d. %s\n", idx+1, hop)
		}
		fmt.Printf("Final status: %d\n", r.StatusCode)
	})

	collector.Visit("https://httpbin.org/redirect/4")
}
Best Practices
- Set reasonable redirect limits to prevent infinite loops and excessive resource usage
- Handle relative URLs properly by resolving them against the current URL
- Track visited URLs to detect and prevent redirect loops
- Log redirect chains for debugging and monitoring
- Handle edge cases like missing Location headers or malformed URLs
- Consider performance when following many redirects in high-volume scraping
Common Use Cases
- SEO analysis: Track redirect chains to identify broken or excessive redirects
- Content scraping: Follow redirects to reach the final content URL
- Link validation: Verify that URLs resolve to expected destinations
- Security scanning: Detect suspicious redirect patterns that might indicate attacks
By implementing proper redirect handling, you can build more robust and reliable web scrapers that handle real-world scenarios effectively.