Managing cookies is essential for maintaining sessions during web scraping in Go. Cookies enable you to handle login states, track user sessions, and maintain context across multiple requests. Go provides built-in cookie management through the http.Client
type and the http.CookieJar
interface.
Basic Cookie Management Setup
Creating a Cookie Jar
Go's net/http/cookiejar
package provides a standard implementation of the http.CookieJar
interface:
package main
import (
	"fmt"
	"log"
	"net/http"
	"net/http/cookiejar"
	"net/url"
	"sync"
	"time"

	"golang.org/x/net/publicsuffix"
)
// createClient returns an http.Client backed by a cookie jar, so cookies set
// by responses are stored and replayed on later requests made through the
// same client.
//
// The jar is created with the public suffix list for proper domain handling.
// If the jar cannot be created the process exits via log.Fatalf.
func createClient() *http.Client {
	jar, err := cookiejar.New(&cookiejar.Options{
		PublicSuffixList: publicsuffix.List,
	})
	if err != nil {
		// Fatalf with an explicit verb: log.Fatal("msg:", err) would print
		// the message and the error with no space between them.
		log.Fatalf("Failed to create cookie jar: %v", err)
	}

	return &http.Client{
		Jar:     jar,
		Timeout: 30 * time.Second, // Add timeout for production use
	}
}
Basic Cookie-Enabled Scraping
// basicCookieExample issues two requests through one cookie-enabled client:
// the first hits an endpoint that sets a session_id cookie, the second hits
// an endpoint on the same host so the stored cookie is sent back. It prints
// the status of the second response and exits the process on request errors.
func basicCookieExample() {
	client := createClient()

	// First request - cookies will be stored automatically.
	resp, err := client.Get("https://httpbin.org/cookies/set/session_id/abc123")
	if err != nil {
		// Fatalf keeps a space between the message and the error text,
		// which log.Fatal would omit.
		log.Fatalf("Request failed: %v", err)
	}
	resp.Body.Close()

	// Second request - cookies will be sent automatically.
	resp, err = client.Get("https://httpbin.org/cookies")
	if err != nil {
		log.Fatalf("Request failed: %v", err)
	}
	defer resp.Body.Close()

	// The session_id cookie will be included in the second request.
	fmt.Println("Status:", resp.Status)
}
Advanced Cookie Operations
Inspecting and Modifying Cookies
// inspectAndModifyCookies fetches a page, lists the cookies the jar holds for
// that URL, adds two custom cookies to the jar, and prints the new total.
// Any URL or request error terminates the process.
func inspectAndModifyCookies() {
	client := createClient()

	targetURL, err := url.Parse("https://example.com")
	if err != nil {
		log.Fatalf("Invalid URL: %v", err)
	}

	// Make initial request to get cookies.
	resp, err := client.Get(targetURL.String())
	if err != nil {
		log.Fatalf("Request failed: %v", err)
	}
	resp.Body.Close()

	// Inspect stored cookies. Only name and value are printed: the standard
	// cookiejar returns the cookies to send for a URL with just those two
	// fields populated, so Domain and Path would always print empty here.
	cookies := client.Jar.Cookies(targetURL)
	fmt.Printf("Found %d cookies:\n", len(cookies))
	for _, cookie := range cookies {
		fmt.Printf(" %s = %s\n", cookie.Name, cookie.Value)
	}

	// Hostname() strips any port; a cookie Domain attribute must not carry one.
	host := targetURL.Hostname()

	// Add custom cookies.
	customCookies := []*http.Cookie{
		{
			Name:     "user_preference",
			Value:    "dark_mode",
			Domain:   host,
			Path:     "/",
			HttpOnly: false,
			Secure:   true,
		},
		{
			Name:   "tracking_id",
			Value:  "user_12345",
			Domain: host,
			Path:   "/",
		},
	}
	client.Jar.SetCookies(targetURL, customCookies)

	// Verify cookies were added.
	allCookies := client.Jar.Cookies(targetURL)
	fmt.Printf("Total cookies after adding custom ones: %d\n", len(allCookies))
}
Manual Cookie Creation and Management
// manualCookieManagement seeds the cookie jar with hand-built cookies and then
// issues a GET to /user/profile so those cookies accompany the request.
// It prints the response status and exits the process on any error.
func manualCookieManagement() {
	client := createClient()

	targetURL, err := url.Parse("https://api.example.com")
	if err != nil {
		log.Fatalf("Invalid URL: %v", err)
	}

	// Hostname() strips any port; a cookie Domain attribute must not carry one.
	host := targetURL.Hostname()

	// Create cookies manually with specific attributes.
	cookies := []*http.Cookie{
		{
			Name:   "api_token",
			Value:  "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...",
			Domain: host,
			// The cookie path must be a prefix of the request path for the
			// cookie to be sent. A "/api" scope would exclude the
			// /user/profile request below, so the token is scoped to "/".
			Path:     "/",
			Expires:  time.Now().Add(24 * time.Hour),
			HttpOnly: true,
			Secure:   true,
			SameSite: http.SameSiteStrictMode,
		},
		{
			Name:   "csrf_token",
			Value:  "random_csrf_token_here",
			Domain: host,
			Path:   "/",
		},
	}

	// Set cookies before making requests.
	client.Jar.SetCookies(targetURL, cookies)

	// Make authenticated request.
	req, err := http.NewRequest(http.MethodGet, targetURL.String()+"/user/profile", nil)
	if err != nil {
		log.Fatalf("Failed to build request: %v", err)
	}
	resp, err := client.Do(req)
	if err != nil {
		log.Fatalf("Request failed: %v", err)
	}
	defer resp.Body.Close()

	fmt.Println("Authenticated request status:", resp.Status)
}
Login Session Management
Handling Login with Cookie Persistence
// loginWithCookies walks through a form login using one cookie-enabled client:
// it fetches the login page (which may set cookies such as CSRF tokens), posts
// the credentials to the same URL, and on success requests a protected page.
// The client's jar carries the session cookies between the three requests.
// Transport errors terminate the process; a rejected login is only reported.
func loginWithCookies() {
	client := createClient()

	// Step 1: Get login page (may set CSRF tokens).
	loginPageURL := "https://example.com/login"
	resp, err := client.Get(loginPageURL)
	if err != nil {
		// Fatalf keeps a space between the message and the error text,
		// which log.Fatal would omit.
		log.Fatalf("Failed to get login page: %v", err)
	}
	resp.Body.Close()

	// Step 2: Parse login form if needed (extract CSRF tokens, etc.)
	// ... form parsing logic ...

	// Step 3: Submit login credentials.
	loginData := url.Values{
		"username":   {"your_username"},
		"password":   {"your_password"},
		"csrf_token": {"extracted_csrf_token"}, // if required
	}
	resp, err = client.PostForm(loginPageURL, loginData)
	if err != nil {
		log.Fatalf("Login failed: %v", err)
	}
	// Only the status is needed; it remains readable after the body is closed.
	resp.Body.Close()

	// Check if login was successful.
	if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusFound {
		fmt.Printf("Login failed with status: %s\n", resp.Status)
		return
	}
	fmt.Println("Login successful!")

	// Step 4: Access protected resources.
	// Session cookies are automatically maintained by the client.
	protectedResp, err := client.Get("https://example.com/dashboard")
	if err != nil {
		log.Fatalf("Failed to access protected resource: %v", err)
	}
	defer protectedResp.Body.Close()

	fmt.Println("Protected resource status:", protectedResp.Status)
}
Error Handling and Best Practices
Robust Cookie Management with Error Handling
// robustCookieManagement performs one cookie-aware GET against
// https://example.com and reports problems as errors instead of exiting.
//
// It returns an error when the jar cannot be created, the URL is invalid, the
// request fails, the status is outside 2xx, or a session_id cookie comes back
// with an empty value. Receiving no cookies at all only logs a warning.
func robustCookieManagement() error {
	jarOptions := &cookiejar.Options{PublicSuffixList: publicsuffix.List}
	jar, err := cookiejar.New(jarOptions)
	if err != nil {
		return fmt.Errorf("failed to create cookie jar: %w", err)
	}

	// Cap the redirect chain so a redirect loop cannot run forever.
	limitRedirects := func(req *http.Request, via []*http.Request) error {
		if len(via) < 10 {
			return nil
		}
		return fmt.Errorf("too many redirects")
	}

	client := &http.Client{
		Jar:           jar,
		Timeout:       30 * time.Second,
		CheckRedirect: limitRedirects,
	}

	targetURL, err := url.Parse("https://example.com")
	if err != nil {
		return fmt.Errorf("invalid URL: %w", err)
	}

	resp, err := client.Get(targetURL.String())
	if err != nil {
		return fmt.Errorf("request failed: %w", err)
	}
	defer resp.Body.Close()

	// Anything outside the 2xx range counts as a failure.
	if code := resp.StatusCode; code < 200 || code >= 300 {
		return fmt.Errorf("unexpected status code: %d", code)
	}

	stored := jar.Cookies(targetURL)
	if len(stored) == 0 {
		log.Println("Warning: No cookies received")
	}

	// A session cookie that is present but empty is treated as fatal.
	for _, c := range stored {
		if c.Name != "session_id" {
			continue
		}
		if c.Value == "" {
			return fmt.Errorf("empty session ID cookie")
		}
	}

	return nil
}
Performance and Memory Considerations
Cookie Jar Cleanup
// cookieCleanup shows two ways to drop session state once scraping is done:
// expiring the cookies held for one URL, or replacing the client (and with it
// the jar) entirely.
func cookieCleanup() {
	client := createClient()

	targetURL, err := url.Parse("https://example.com")
	if err != nil {
		log.Fatalf("Invalid URL: %v", err)
	}

	// Perform scraping operations...

	// Clear cookies for a specific URL when done. Passing an empty slice to
	// SetCookies stores nothing and removes nothing, so each cookie is
	// overwritten with an already-expired one (MaxAge < 0) instead.
	// NOTE(review): the jar only reports names and values, so this assumes the
	// cookies were scoped to this host with path "/"; cookies stored under
	// another path or a parent domain are left in place.
	stale := client.Jar.Cookies(targetURL)
	expired := make([]*http.Cookie, 0, len(stale))
	for _, c := range stale {
		expired = append(expired, &http.Cookie{Name: c.Name, Path: "/", MaxAge: -1})
	}
	client.Jar.SetCookies(targetURL, expired)

	// Or create a new client for a fresh session; this is the only way to be
	// certain that no cookie survives.
	client = createClient()
	_ = client // the fresh client would serve the next scraping session
}
Custom Cookie Jar Implementation
// CustomCookieJar wraps another http.CookieJar and filters the cookies that
// are written to it. Reads are passed straight through to the wrapped jar.
type CustomCookieJar struct {
	jar http.CookieJar
	mu  sync.RWMutex
}

// Compile-time check that *CustomCookieJar can be used as http.Client.Jar.
var _ http.CookieJar = (*CustomCookieJar)(nil)

// NewCustomCookieJar returns a CustomCookieJar that stores its cookies in
// inner. inner must be non-nil.
func NewCustomCookieJar(inner http.CookieJar) *CustomCookieJar {
	return &CustomCookieJar{jar: inner}
}

// SetCookies forwards cookies for u to the wrapped jar after filtering.
//
// Cookies without a name are dropped. Cookies without a value are dropped
// too, unless they are deletions: servers expire a cookie by re-sending it
// with an empty value and a negative Max-Age or a past Expires, and dropping
// those would keep a logged-out session alive in the jar.
func (c *CustomCookieJar) SetCookies(u *url.URL, cookies []*http.Cookie) {
	c.mu.Lock()
	defer c.mu.Unlock()

	now := time.Now()

	// Add custom logic (filtering, logging, etc.)
	var validCookies []*http.Cookie
	for _, cookie := range cookies {
		if cookie.Name == "" {
			continue
		}
		if cookie.Value == "" && !isDeletionCookie(cookie, now) {
			continue
		}
		validCookies = append(validCookies, cookie)
	}
	c.jar.SetCookies(u, validCookies)
}

// Cookies returns whatever the wrapped jar would send for u.
func (c *CustomCookieJar) Cookies(u *url.URL) []*http.Cookie {
	c.mu.RLock()
	defer c.mu.RUnlock()
	return c.jar.Cookies(u)
}

// isDeletionCookie reports whether storing c at time now asks the jar to
// remove the cookie: a negative MaxAge, or no MaxAge and an Expires that is
// not in the future. A positive MaxAge takes precedence over Expires.
func isDeletionCookie(c *http.Cookie, now time.Time) bool {
	switch {
	case c.MaxAge < 0:
		return true
	case c.MaxAge > 0:
		return false
	default:
		return !c.Expires.IsZero() && !c.Expires.After(now)
	}
}
Common Troubleshooting
Debug Cookie Issues
// debugCookies prints the cookies that client's jar would send to targetURL,
// one field per line, to standard output.
//
// A nil client, a client without a jar, or a nil targetURL is reported
// instead of causing a panic. Jars may return cookies with only name and
// value populated (the standard cookiejar does), in which case the remaining
// fields print as zero values.
func debugCookies(client *http.Client, targetURL *url.URL) {
	fmt.Printf("=== Cookie Debug Info ===\n")

	if client == nil || client.Jar == nil {
		fmt.Println("No cookie jar configured on client")
		return
	}
	if targetURL == nil {
		fmt.Println("No target URL given")
		return
	}

	cookies := client.Jar.Cookies(targetURL)
	fmt.Printf("URL: %s\n", targetURL.String())
	fmt.Printf("Cookie count: %d\n", len(cookies))

	for i, cookie := range cookies {
		// The zero time would format as year 1, which reads like a real
		// expiry; report it as absent instead.
		expires := "(not set)"
		if !cookie.Expires.IsZero() {
			expires = cookie.Expires.Format(time.RFC3339)
		}

		fmt.Printf("Cookie %d:\n", i+1)
		fmt.Printf(" Name: %s\n", cookie.Name)
		fmt.Printf(" Value: %s\n", cookie.Value)
		fmt.Printf(" Domain: %s\n", cookie.Domain)
		fmt.Printf(" Path: %s\n", cookie.Path)
		fmt.Printf(" Expires: %s\n", expires)
		fmt.Printf(" HttpOnly: %t\n", cookie.HttpOnly)
		fmt.Printf(" Secure: %t\n", cookie.Secure)
		fmt.Printf(" SameSite: %v\n", cookie.SameSite)
		fmt.Println()
	}
}
Important Considerations
- Domain Matching: Cookies are domain-specific. Ensure your cookie jar properly handles subdomain matching using the
PublicSuffixList.
- HTTPS Requirements: Secure cookies will only be sent over HTTPS connections.
- Session Persistence: Cookies are stored in memory by default. For persistent sessions across program restarts, implement custom cookie storage.
- Rate Limiting: Some websites track session activity. Implement delays between requests to avoid detection.
- Legal Compliance: Always respect robots.txt, terms of service, and applicable data protection laws when scraping websites.
Cookie management is crucial for effective web scraping in Go, especially when dealing with authenticated sessions or stateful web applications. The standard net/http/cookiejar implementation of the http.CookieJar
interface provides robust functionality for most use cases, while custom implementations offer flexibility for specialized requirements.