Scraping content from behind a login wall with Colly requires simulating the authentication process that a real user would follow. This involves handling login forms, session management, and potentially dealing with security measures like CSRF tokens.
Understanding Authentication Flow
Before implementing login functionality, you need to analyze how the target website handles authentication:
- Form-based authentication - Most common, uses HTML forms with POST requests
- Token-based authentication - Uses CSRF tokens or other security measures
- Session management - Relies on cookies or session storage
- Multi-step authentication - May require captcha or two-factor authentication
Installation
First, ensure you have Colly installed:
go get -u github.com/gocolly/colly/v2
Basic Login Implementation
Here's a comprehensive example that handles form-based login:
package main
import (
"fmt"
"log"
"net/url"
"strings"
"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/debug"
)
// main demonstrates a full form-based login flow with Colly:
// fetch the login page, capture the form action and CSRF token,
// POST credentials, then scrape pages behind the login wall.
func main() {
	// Create a collector with a realistic browser user agent.
	c := colly.NewCollector(
		colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"),
	)

	// Enable debugging (optional).
	c.SetDebugger(&debug.LogDebugger{})

	// Log every failed request.
	c.OnError(func(r *colly.Response, err error) {
		log.Printf("Error: %s", err.Error())
	})

	// Captured from the login page before credentials are posted.
	var csrfToken string
	var loginURL string

	// Step 1: locate the login form and extract its action URL and CSRF token.
	c.OnHTML("form[action*='login']", func(e *colly.HTMLElement) {
		loginURL = e.Request.AbsoluteURL(e.Attr("action"))
		csrfToken = e.ChildAttr("input[name='_token']", "value")
		fmt.Printf("Found login form: %s\n", loginURL)
		fmt.Printf("CSRF Token: %s\n", csrfToken)
	})

	// Visit the login page first so the handler above can run.
	if err := c.Visit("https://example.com/login"); err != nil {
		log.Fatal("Failed to visit login page:", err)
	}

	// Guard: without a form action there is nowhere to POST; Post("")
	// would fail with a confusing URL-parse error.
	if loginURL == "" {
		log.Fatal("no login form found on the login page")
	}

	// Step 2: submit the login form.
	loginData := url.Values{
		"email":    {"your_email@example.com"},
		"password": {"your_password"},
		"_token":   {csrfToken}, // Include CSRF token if present
		"remember": {"1"},       // Optional remember me checkbox
	}
	if err := c.Post(loginURL, loginData); err != nil {
		log.Fatal("Login failed:", err)
	}

	// Step 3: set up scrapers for protected content. Session cookies from
	// the login POST are kept by the collector automatically.
	c.OnHTML(".protected-content", func(e *colly.HTMLElement) {
		fmt.Printf("Title: %s\n", strings.TrimSpace(e.ChildText("h1")))
		fmt.Printf("Content: %s\n", strings.TrimSpace(e.ChildText(".content")))
		// Extract links from the protected area.
		e.ForEach("a[href]", func(_ int, el *colly.HTMLElement) {
			link := el.Attr("href")
			fmt.Printf("Found link: %s\n", el.Request.AbsoluteURL(link))
		})
	})

	// Step 4: visit protected pages. The loop variable is named pageURL so
	// it does not shadow the imported net/url package.
	protectedURLs := []string{
		"https://example.com/dashboard",
		"https://example.com/profile",
		"https://example.com/settings",
	}
	for _, pageURL := range protectedURLs {
		if err := c.Visit(pageURL); err != nil {
			log.Printf("Failed to visit %s: %v", pageURL, err)
		}
	}
}
Advanced Authentication Handling
Handling Different Login Scenarios
For more complex authentication, you may need to handle various scenarios:
// handleComplexLogin discovers every field of the #login-form dynamically
// (including hidden CSRF inputs), fills in credentials, and submits it.
// Returns an error when the login page cannot be loaded, the form is not
// found, or the POST fails.
func handleComplexLogin(c *colly.Collector) error {
	var formData url.Values
	var actionURL string

	// Extract all form fields dynamically so hidden inputs (CSRF tokens,
	// nonces) are submitted exactly as the server rendered them.
	c.OnHTML("form#login-form", func(e *colly.HTMLElement) {
		actionURL = e.Request.AbsoluteURL(e.Attr("action"))
		formData = url.Values{}
		// Walk every input and decide what to submit based on its type.
		e.ForEach("input", func(_ int, input *colly.HTMLElement) {
			name := input.Attr("name")
			value := input.Attr("value")
			switch input.Attr("type") {
			case "email", "text":
				if strings.Contains(name, "email") || strings.Contains(name, "username") {
					formData.Set(name, "your_username")
				}
			case "password":
				formData.Set(name, "your_password")
			case "hidden":
				// Include hidden fields (like CSRF tokens) verbatim.
				formData.Set(name, value)
			case "checkbox":
				if strings.Contains(name, "remember") {
					formData.Set(name, "1")
				}
			}
		})
	})

	// Visit the login page first so the handler above can populate the form.
	if err := c.Visit("https://example.com/login"); err != nil {
		return fmt.Errorf("visiting login page: %w", err)
	}

	// Guard: if the selector never matched, Post("") would fail confusingly.
	if actionURL == "" {
		return fmt.Errorf("login form %q not found on login page", "form#login-form")
	}

	return c.Post(actionURL, formData)
}
Session Validation
Check if login was successful by looking for specific elements:
// validateLogin registers handlers that flag whether a login appears to
// have succeeded, based on elements found in subsequently visited pages.
//
// NOTE(review): OnHTML handlers only fire on pages visited *after* they are
// registered, yet this function returns loginSuccess immediately — so the
// returned value only reflects pages already scraped before the call.
// Register these handlers before visiting a post-login page, then check the
// flag after c.Visit returns; confirm intended usage with callers.
func validateLogin(c *colly.Collector) bool {
	loginSuccess := false
	// A logout link is a strong signal the session is authenticated.
	c.OnHTML("a[href*='logout']", func(e *colly.HTMLElement) {
		loginSuccess = true
		fmt.Println("Login successful - found logout link")
	})
	// Surface any visible error banners (wrong password, locked account, ...).
	c.OnHTML(".error, .alert-danger", func(e *colly.HTMLElement) {
		fmt.Printf("Login error: %s\n", e.Text)
	})
	// A welcome/user-info element also indicates an authenticated page.
	c.OnHTML(".welcome, .user-info", func(e *colly.HTMLElement) {
		fmt.Printf("Welcome message: %s\n", e.Text)
		loginSuccess = true
	})
	return loginSuccess
}
Handling Special Cases
CSRF Protection
Many modern websites use CSRF tokens. Here's how to handle them:
// extractCSRFToken registers a handler on formSelector that captures a CSRF
// token from the hidden-input names commonly used by Laravel, generic PHP
// apps, and Rails, falling back to the page's <meta name="csrf-token"> tag.
// The handler runs on pages the collector visits after this call; the
// returned string holds whatever has been captured so far.
func extractCSRFToken(c *colly.Collector, formSelector string) string {
	var token string
	c.OnHTML(formSelector, func(e *colly.HTMLElement) {
		// Probe each well-known hidden-input name in order.
		for _, field := range []string{"_token", "csrf_token", "authenticity_token"} {
			selector := fmt.Sprintf("input[name='%s']", field)
			if v := e.ChildAttr(selector, "value"); v != "" {
				token = v
				return
			}
		}
		// No hidden field matched: fall back to the meta tag, but never
		// clobber a token captured from an earlier page.
		if token == "" {
			token = e.ChildAttr("meta[name='csrf-token']", "content")
		}
	})
	return token
}
Custom Headers and User Agents
Some sites require specific headers:
// setupHeaders attaches browser-like headers to every outgoing request so
// the scraper is less likely to be rejected by simple bot filters.
func setupHeaders(c *colly.Collector) {
	// One place to maintain the header set; order of Set calls is irrelevant.
	browserHeaders := map[string]string{
		"User-Agent":      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
		"Accept":          "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
		"Accept-Language": "en-US,en;q=0.5",
		"Accept-Encoding": "gzip, deflate",
		"Referer":         "https://example.com/login",
	}
	c.OnRequest(func(r *colly.Request) {
		for name, value := range browserHeaders {
			r.Headers.Set(name, value)
		}
	})
}
Error Handling and Debugging
Implement robust error handling for authentication failures:
// setupErrorHandling wires up request-failure logging with actionable hints
// for the common authentication-related status codes, plus a per-response
// debug line.
func setupErrorHandling(c *colly.Collector) {
	// Hints keyed by the status codes that matter for authenticated scraping.
	statusHints := map[int]string{
		401: "Authentication failed - check credentials",
		403: "Access forbidden - check permissions",
		429: "Rate limited - implement delays",
	}

	// Handle HTTP errors.
	c.OnError(func(r *colly.Response, err error) {
		log.Printf("Request failed: %s, Status: %d", err.Error(), r.StatusCode)
		if hint, ok := statusHints[r.StatusCode]; ok {
			log.Println(hint)
		}
	})

	// Log every response for debugging.
	c.OnResponse(func(r *colly.Response) {
		fmt.Printf("Visited: %s, Status: %d\n", r.Request.URL.String(), r.StatusCode)
	})
}
Rate Limiting and Delays
Implement delays to avoid being blocked:
import "time"
// setupRateLimiting throttles the collector so the target site is not
// overwhelmed: one request at a time, a fixed 2s delay, and up to 3s of
// random jitter. Jitter uses colly's built-in RandomDelay rather than a
// hand-rolled rand.Intn sleep — the original also needed a math/rand
// import that was never declared.
func setupRateLimiting(c *colly.Collector) {
	// c.Limit validates the rule; don't silently discard its error.
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Parallelism: 1,
		Delay:       2 * time.Second,
		// RandomDelay adds up to this much extra delay on each request.
		RandomDelay: 3 * time.Second,
	}); err != nil {
		log.Printf("failed to configure rate limit: %v", err)
	}
}
Complete Working Example
Here's a complete example that combines all best practices:
package main
import (
	"fmt"
	"log"
	"net/url"
	"strings"
	"time"

	"github.com/gocolly/colly/v2"
	// Blank-imported: debug is not referenced in this example; keep the
	// import (for readers who enable c.SetDebugger) without breaking the build.
	_ "github.com/gocolly/colly/v2/debug"
)
// LoginScraper bundles a colly collector with the target site's base URL
// and the credentials used to log in before scraping protected pages.
// The collector carries the session cookies, so one instance should be
// reused for the login and all subsequent visits.
type LoginScraper struct {
	// collector performs all requests and keeps session cookies.
	collector *colly.Collector
	// baseURL is the site root, e.g. "https://example.com".
	baseURL string
	// credentials holds the "email" and "password" values posted at login.
	credentials map[string]string
}
// NewLoginScraper builds a LoginScraper for baseURL using the given
// credentials. The collector is rate-limited to one request per second
// so the login flow does not hammer the site.
func NewLoginScraper(baseURL string, email, password string) *LoginScraper {
	c := colly.NewCollector(
		colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"),
	)

	// Rate limiting. c.Limit validates the rule; surface (rather than
	// silently drop) any configuration error.
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Parallelism: 1,
		Delay:       1 * time.Second,
	}); err != nil {
		log.Printf("failed to apply rate limit: %v", err)
	}

	return &LoginScraper{
		collector: c,
		baseURL:   baseURL,
		credentials: map[string]string{
			"email":    email,
			"password": password,
		},
	}
}
// Login loads the site's login page, captures the form action URL and CSRF
// token, then POSTs the stored credentials. Session cookies are retained by
// the collector, so subsequent visits are authenticated. Returns an error
// when the page cannot be loaded, no form is found, or the POST fails.
func (ls *LoginScraper) Login() error {
	var csrfToken string
	var loginURL string

	// Capture the first form with a non-empty action, plus its CSRF token.
	ls.collector.OnHTML("form", func(e *colly.HTMLElement) {
		if action := e.Attr("action"); action != "" {
			loginURL = e.Request.AbsoluteURL(action)
			csrfToken = e.ChildAttr("input[name='_token']", "value")
		}
	})

	// Visit the login page so the handler above can run.
	if err := ls.collector.Visit(ls.baseURL + "/login"); err != nil {
		return fmt.Errorf("visiting login page: %w", err)
	}

	// Guard: POSTing to an empty URL would fail with a confusing error.
	if loginURL == "" {
		return fmt.Errorf("no login form found at %s/login", ls.baseURL)
	}

	// Prepare login data; only send the token field when one was found.
	loginData := url.Values{
		"email":    {ls.credentials["email"]},
		"password": {ls.credentials["password"]},
	}
	if csrfToken != "" {
		loginData.Set("_token", csrfToken)
	}

	return ls.collector.Post(loginURL, loginData)
}
// ScrapeProtectedContent visits each URL with the authenticated session and
// prints the page URL, title, and body text of elements matching ".content".
// Failures to load individual pages are logged and do not stop the loop.
func (ls *LoginScraper) ScrapeProtectedContent(urls []string) {
	ls.collector.OnHTML(".content", func(e *colly.HTMLElement) {
		fmt.Printf("Page: %s\n", e.Request.URL.String())
		fmt.Printf("Title: %s\n", e.ChildText("h1"))
		fmt.Printf("Content: %s\n", e.Text)
		// Visual separator between pages. (Requires the "strings" import,
		// which the original file omitted.)
		fmt.Println(strings.Repeat("-", 50))
	})

	// The loop variable is named pageURL so it does not shadow net/url.
	for _, pageURL := range urls {
		if err := ls.collector.Visit(pageURL); err != nil {
			log.Printf("Failed to visit %s: %v", pageURL, err)
		}
	}
}
// main wires the pieces together: construct the scraper, authenticate,
// then pull content from pages that require the session.
func main() {
	scraper := NewLoginScraper(
		"https://example.com",
		"your_email@example.com",
		"your_password",
	)

	// Authenticate first; there is no point scraping without a session.
	if err := scraper.Login(); err != nil {
		log.Fatal("Login failed:", err)
	}

	// Scrape the pages behind the login wall.
	scraper.ScrapeProtectedContent([]string{
		"https://example.com/dashboard",
		"https://example.com/profile",
	})
}
Best Practices and Legal Considerations
- Respect robots.txt - Always check the site's robots.txt file
- Rate limiting - Implement delays to avoid overwhelming servers
- User agent - Use realistic user agent strings
- Session management - Handle cookies and sessions properly
- Error handling - Implement robust error handling and recovery
- Legal compliance - Ensure you have permission to scrape the content
- Terms of service - Respect the website's terms of service
Common Issues and Solutions
Issue: Login appears successful but protected pages return 401/403
Solution: Check whether the site uses additional security measures, such as:
- Two-factor authentication
- Captcha verification
- IP-based restrictions
- Session timeouts
Issue: CSRF token validation fails
Solution: Ensure you're extracting the CSRF token from the correct form field and submitting it with the login request.
Issue: Cookies not persisting between requests
Solution: Colly handles cookies automatically, but you can manually manage them if needed:
c.OnResponse(func(r *colly.Response) {
	// Headers.Get("Set-Cookie") returns only the FIRST Set-Cookie header;
	// servers commonly send several. Values returns every cookie set on
	// this response.
	for _, cookie := range r.Headers.Values("Set-Cookie") {
		fmt.Printf("Cookies: %s\n", cookie)
	}
})
Remember to always scrape responsibly and ethically, respecting website terms of service and applicable laws.