How do I handle cookies within a Colly session?

Cookie handling in Colly is essential for scraping websites that require authentication, maintain session state, or track user behavior. Colly provides both automatic cookie management and manual control when needed.

Automatic Cookie Management

Colly automatically handles cookies for each domain by default. When the server sets cookies in responses, Colly stores them and includes them in subsequent requests to the same domain.

package main

import (
    "fmt"
    "github.com/gocolly/colly"
    "net/http"
)

// main demonstrates Colly's default cookie handling: the first visit asks the
// server to set a session_id cookie, and the second visit goes to the same
// domain so the stored cookie can be sent back.
func main() {
    c := colly.NewCollector(
        colly.AllowedDomains("httpbin.org"),
    )

    c.OnHTML("body", func(e *colly.HTMLElement) {
        fmt.Println("Page content scraped with cookies automatically handled")
    })

    // Colly will automatically handle any Set-Cookie headers
    if err := c.Visit("https://httpbin.org/cookies/set/session_id/12345"); err != nil {
        // Without the first response there is no cookie to demonstrate.
        fmt.Println("visit failed:", err)
        return
    }

    // Subsequent requests will include the session_id cookie
    if err := c.Visit("https://httpbin.org/cookies"); err != nil {
        fmt.Println("visit failed:", err)
    }
}

Manual Cookie Setting

Setting Cookies Before Requests

You can manually set cookies before making requests using the OnRequest callback:

// main shows two ways of attaching cookies to an outgoing request from inside
// the OnRequest callback: storing them in the collector's cookie jar, and
// writing the Cookie header directly.
func main() {
    c := colly.NewCollector()

    c.OnRequest(func(r *colly.Request) {
        // Hostname() drops any ":port" suffix. URL.Host keeps the port, and a
        // cookie Domain containing a port does not match the request host, so
        // the standard cookie jar would discard the cookie.
        host := r.URL.Hostname()

        // Method 1: Set cookies directly on the collector
        cookies := []*http.Cookie{
            {
                Name:   "auth_token",
                Value:  "abc123xyz",
                Domain: host,
                Path:   "/",
            },
            {
                Name:   "user_id",
                Value:  "user_12345",
                Domain: host,
                Path:   "/",
            },
        }
        if err := c.SetCookies(r.URL.String(), cookies); err != nil {
            fmt.Println("setting cookies failed:", err)
        }

        // Method 2: Set cookie header manually
        r.Headers.Set("Cookie", "session=active; preferences=dark_mode")
    })

    if err := c.Visit("https://example.com/protected"); err != nil {
        fmt.Println("visit failed:", err)
    }
}

Reading Cookies from Responses

Access cookies that were set by the server in the response:

// main prints the cookies the collector holds for the URL of each response.
func main() {
    c := colly.NewCollector()

    c.OnResponse(func(r *colly.Response) {
        // Get all cookies for this URL
        // NOTE(review): cookies read back from the standard library jar are
        // expected to carry only Name and Value, so Domain and Path will
        // probably print empty here — confirm against the jar in use.
        for _, cookie := range c.Cookies(r.Request.URL.String()) {
            fmt.Printf("Cookie: %s = %s (Domain: %s, Path: %s)\n",
                cookie.Name, cookie.Value, cookie.Domain, cookie.Path)
        }
    })

    if err := c.Visit("https://httpbin.org/cookies/set/test/value"); err != nil {
        fmt.Println("visit failed:", err)
    }
}

Custom Cookie Jar

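For advanced cookie management, you can provide a custom cookie jar. The example uses publicsuffix.List, so besides the two imports shown it also needs golang.org/x/net/publicsuffix (plus net/http and the colly package):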

import (
    "net/http/cookiejar"
    "net/url"
)

// main installs a caller-created cookie jar on the collector and seeds it with
// a cookie before the first request is made.
func main() {
    // Create a custom cookie jar with options
    // publicsuffix.List is provided by golang.org/x/net/publicsuffix, which
    // must be imported for this snippet to compile.
    jar, err := cookiejar.New(&cookiejar.Options{
        PublicSuffixList: publicsuffix.List,
    })
    if err != nil {
        panic(err)
    }

    c := colly.NewCollector()
    c.SetCookieJar(jar)

    // Pre-populate the jar with cookies
    u, err := url.Parse("https://example.com")
    if err != nil {
        panic(err)
    }
    jar.SetCookies(u, []*http.Cookie{
        {Name: "preloaded", Value: "cookie_value"},
    })

    if err := c.Visit("https://example.com"); err != nil {
        panic(err)
    }
}

Login Session Example

Here's a practical example of handling login sessions with cookies:

// main walks through a form login: it fetches the login page to capture the
// CSRF token, posts the credentials together with that token, and then
// requests a page behind the login using the same collector.
func main() {
    c := colly.NewCollector()

    // Step 1: Visit login page and extract CSRF token
    var csrfToken string
    c.OnHTML("input[name='csrf_token']", func(e *colly.HTMLElement) {
        csrfToken = e.Attr("value")
    })

    // If the login page cannot be fetched, csrfToken stays empty and the
    // POST below would be sent without it, so stop here.
    if err := c.Visit("https://example.com/login"); err != nil {
        panic(err)
    }

    // Step 2: Submit login form with credentials
    c.OnRequest(func(r *colly.Request) {
        if r.URL.Path == "/login" && r.Method == "POST" {
            r.Headers.Set("Content-Type", "application/x-www-form-urlencoded")
        }
    })

    // Login request - cookies will be automatically saved
    err := c.Post("https://example.com/login", map[string]string{
        "username":   "your_username",
        "password":   "your_password",
        "csrf_token": csrfToken,
    })
    if err != nil {
        panic(err)
    }

    // Step 3: Access protected pages (cookies automatically included)
    c.OnHTML("title", func(e *colly.HTMLElement) {
        fmt.Printf("Protected page title: %s\n", e.Text)
    })

    if err := c.Visit("https://example.com/dashboard"); err != nil {
        panic(err)
    }
}

Persisting Cookies

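To save cookies between program runs, implement cookie persistence. Note that the standard net/http/cookiejar jar returns only each cookie's name and value from Cookies, so the domain, path, expiry, and secure fields saved below will be empty for that jar: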

import (
    "encoding/json"
    "io/ioutil"
    "net/http"
    "net/url"
    "time"
)

// SaveCookies writes the cookies that jar holds for u to filename as a JSON
// array.
//
// Only what jar.Cookies(u) returns is persisted. NOTE(review): the standard
// net/http/cookiejar jar is expected to return just Name and Value, so Domain,
// Path, Expires and Secure will be zero values for that jar — confirm if those
// attributes matter to the caller.
//
// The file is created with mode 0600 because session cookies are credentials
// and should not be readable by other users. It returns any error from JSON
// encoding or from writing the file.
func SaveCookies(jar http.CookieJar, filename string, u *url.URL) error {
    // Convert to a serializable format
    type CookieData struct {
        Name    string    `json:"name"`
        Value   string    `json:"value"`
        Domain  string    `json:"domain"`
        Path    string    `json:"path"`
        Expires time.Time `json:"expires"`
        Secure  bool      `json:"secure"`
    }

    cookies := jar.Cookies(u)

    // Start from an empty, non-nil slice so that a jar with no cookies is
    // written as "[]" rather than "null".
    cookieData := make([]CookieData, 0, len(cookies))
    for _, cookie := range cookies {
        cookieData = append(cookieData, CookieData{
            Name:    cookie.Name,
            Value:   cookie.Value,
            Domain:  cookie.Domain,
            Path:    cookie.Path,
            Expires: cookie.Expires,
            Secure:  cookie.Secure,
        })
    }

    data, err := json.Marshal(cookieData)
    if err != nil {
        return err
    }

    return ioutil.WriteFile(filename, data, 0600)
}

// LoadCookies reads the JSON cookie array stored in filename and hands the
// decoded cookies to jar for URL u.
//
// It returns the error from reading the file or from decoding its JSON, and
// nil once the cookies have been passed to jar.SetCookies.
func LoadCookies(jar http.CookieJar, filename string, u *url.URL) error {
    // On-disk shape of a single cookie.
    type storedCookie struct {
        Name    string    `json:"name"`
        Value   string    `json:"value"`
        Domain  string    `json:"domain"`
        Path    string    `json:"path"`
        Expires time.Time `json:"expires"`
        Secure  bool      `json:"secure"`
    }

    raw, readErr := ioutil.ReadFile(filename)
    if readErr != nil {
        return readErr
    }

    var stored []storedCookie
    decodeErr := json.Unmarshal(raw, &stored)
    if decodeErr != nil {
        return decodeErr
    }

    // Rebuild one http.Cookie per stored record, in file order.
    var restored []*http.Cookie
    for i := range stored {
        entry := stored[i]
        ck := new(http.Cookie)
        ck.Name = entry.Name
        ck.Value = entry.Value
        ck.Domain = entry.Domain
        ck.Path = entry.Path
        ck.Expires = entry.Expires
        ck.Secure = entry.Secure
        restored = append(restored, ck)
    }

    jar.SetCookies(u, restored)
    return nil
}

// main restores cookies from cookies.json into a fresh jar, scrapes with a
// collector that uses that jar, and writes the jar's cookies back to the same
// file afterwards.
func main() {
    jar, err := cookiejar.New(nil)
    if err != nil {
        panic(err)
    }
    c := colly.NewCollector()
    c.SetCookieJar(jar)

    u, err := url.Parse("https://example.com")
    if err != nil {
        panic(err)
    }

    // Load existing cookies
    // Deliberately best-effort: on the first run cookies.json does not exist
    // yet, so a load error must not abort the program.
    _ = LoadCookies(jar, "cookies.json", u)

    // Perform scraping
    if err := c.Visit("https://example.com"); err != nil {
        panic(err)
    }

    // Save cookies for next time
    if err := SaveCookies(jar, "cookies.json", u); err != nil {
        panic(err)
    }
}

Best Practices

  1. Always respect robots.txt and website terms of service
  2. Use rate limiting to avoid overwhelming servers
  3. Handle cookie expiration gracefully in persistent scenarios
  4. Secure cookie storage when persisting sensitive session data
  5. Clear cookies when switching between different user sessions

Common Issues

  • Domain mismatch: Ensure cookie domains match the request domain
  • Path restrictions: Cookies are only sent to matching paths
  • Secure cookies: HTTPS-only cookies won't work with HTTP requests
  • Expired cookies: Check expiration dates when loading persisted cookies

Cookie handling in Colly is powerful and flexible, supporting both simple automatic management and complex authentication scenarios. Choose the approach that best fits your scraping requirements.

Related Questions

Get Started Now

WebScraping.AI provides rotating proxies, Chromium rendering, and a built-in HTML parser for web scraping
Icon