Table of contents

How do I debug Colly scrapers and log requests?

Debugging Colly scrapers is essential for identifying issues, monitoring performance, and ensuring your web scraping projects run smoothly. Colly provides several built-in debugging features and allows for custom logging implementations to help you track requests, responses, and potential errors.

Built-in Debugging Features

Enabling Debug Mode

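Colly ships with a ready-made debugger, debug.LogDebugger, that you attach when creating the collector. It logs every collector event (requests, responses, callback invocations, and errors) to standard error by default: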

package main

import (
    "fmt"
    "log"

    "github.com/gocolly/colly/v2"
    "github.com/gocolly/colly/v2/debug"
)

// main runs a collector with Colly's built-in LogDebugger attached, prints
// the page title and every visited URL, and exits with a non-zero status if
// the initial visit fails.
func main() {
    c := colly.NewCollector(
        colly.Debugger(&debug.LogDebugger{}),
    )

    c.OnHTML("title", func(e *colly.HTMLElement) {
        fmt.Println("Title:", e.Text)
    })

    c.OnRequest(func(r *colly.Request) {
        fmt.Println("Visiting", r.URL)
    })

    // Visit reports network and HTTP failures through its return value;
    // dropping it would make a failed run look like an empty page.
    if err := c.Visit("https://example.com"); err != nil {
        log.Fatal(err)
    }
}

Custom Debug Logger

For more control over debug output, you can implement a custom debugger:

package main

import (
    "fmt"
    "log"
    "os"

    "github.com/gocolly/colly/v2"
    "github.com/gocolly/colly/v2/debug"
)

// CustomDebugger is a debug.Debugger implementation that writes one line per
// collector event to standard output through its own prefixed logger.
type CustomDebugger struct {
    logger *log.Logger
}

// Compile-time check that CustomDebugger satisfies Colly's Debugger interface.
var _ debug.Debugger = (*CustomDebugger)(nil)

// Init creates the logger used by Event. Colly invokes it when the debugger
// is installed with colly.Debugger. It always returns nil.
func (d *CustomDebugger) Init() error {
    d.logger = log.New(os.Stdout, "[COLLY DEBUG] ", log.LstdFlags)
    return nil
}

// Event logs the type, request ID and URL of a single collector event.
func (d *CustomDebugger) Event(e *debug.Event) {
    // debug.Event carries no *colly.Request; event details are delivered as
    // strings in e.Values. Events that have no "url" entry log an empty URL.
    d.logger.Printf("Type: %s, RequestID: %d, URL: %s",
        e.Type, e.RequestID, e.Values["url"])
}

// main attaches a CustomDebugger to a new collector and visits the example
// site, exiting with status 1 if the visit fails.
func main() {
    c := colly.NewCollector(
        colly.Debugger(&CustomDebugger{}),
    )

    // Your scraping logic here

    // Report a failed visit on stderr so it is not mixed into the debugger's
    // stdout stream, then signal the failure through the exit status.
    if err := c.Visit("https://example.com"); err != nil {
        fmt.Fprintln(os.Stderr, "visit failed:", err)
        os.Exit(1)
    }
}

Request and Response Logging

Comprehensive Request Logging

Track all aspects of HTTP requests including headers, timing, and response codes:

package main

import (
    "fmt"
    "log"
    "time"

    "github.com/gocolly/colly/v2"
)

// main logs every request, response and error of a single visit, including
// the time elapsed between the OnRequest callback and the response.
func main() {
    const startURL = "https://example.com"

    c := colly.NewCollector()

    // Log before request is sent
    c.OnRequest(func(r *colly.Request) {
        log.Printf("Requesting: %s", r.URL)
        log.Printf("Method: %s", r.Method)
        log.Printf("Headers: %v", r.Headers)

        // Add timestamp to context for duration calculation
        r.Ctx.Put("start_time", time.Now())
    })

    // Log response details
    c.OnResponse(func(r *colly.Response) {
        log.Printf("Response from %s:", r.Request.URL)
        log.Printf("Status Code: %d", r.StatusCode)
        log.Printf("Content Length: %d bytes", len(r.Body))

        // Comma-ok assertion: a missing or mistyped "start_time" entry only
        // skips the duration line instead of panicking inside the callback.
        if startTime, ok := r.Ctx.GetAny("start_time").(time.Time); ok {
            log.Printf("Duration: %v", time.Since(startTime))
        }
        log.Printf("Response Headers: %v", r.Headers)
    })

    // Log errors
    c.OnError(func(r *colly.Response, err error) {
        log.Printf("Error on %s: %v", r.Request.URL, err)
        log.Printf("Status Code: %d", r.StatusCode)
    })

    if err := c.Visit(startURL); err != nil {
        log.Fatal(fmt.Errorf("visiting %s: %w", startURL, err))
    }
}

File-based Logging

Store logs in files for later analysis:

package main

import (
    "encoding/json"
    "log"
    "os"
    "time"

    "github.com/gocolly/colly/v2"
)

// RequestLog is one entry of the JSON-lines request log: each response or
// error is marshalled into a single JSON object using the tags below.
type RequestLog struct {
    // URL is the request URL rendered as a string.
    URL        string            `json:"url"`
    // Method is the HTTP method of the request.
    Method     string            `json:"method"`
    // StatusCode is the HTTP status code of the response.
    StatusCode int               `json:"status_code"`
    // Duration is the elapsed time since the request started. time.Duration
    // is an integer nanosecond count, so it is encoded as a JSON number.
    Duration   time.Duration     `json:"duration"`
    // Error holds the error message; it is omitted from the JSON when empty.
    Error      string            `json:"error,omitempty"`
    // Timestamp is the moment the entry was created.
    Timestamp  time.Time         `json:"timestamp"`
    // Headers maps a response header name to its first value.
    Headers    map[string]string `json:"headers"`
}

// main appends one JSON object per response or error to colly_requests.log.
func main() {
    // Create log file
    logFile, err := os.OpenFile("colly_requests.log",
        os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666)
    if err != nil {
        log.Fatal(err)
    }
    defer logFile.Close()

    logger := log.New(logFile, "", 0)
    c := colly.NewCollector()

    c.OnRequest(func(r *colly.Request) {
        r.Ctx.Put("start_time", time.Now())
    })

    c.OnResponse(func(r *colly.Response) {
        entry := RequestLog{
            URL:        r.Request.URL.String(),
            Method:     r.Request.Method,
            StatusCode: r.StatusCode,
            Duration:   elapsedSince(r.Ctx),
            Timestamp:  time.Now(),
            Headers:    make(map[string]string),
        }

        // r.Headers is a *http.Header, so it must be dereferenced before it
        // can be ranged over. Only the first value of each header is kept.
        if r.Headers != nil {
            for key, values := range *r.Headers {
                if len(values) > 0 {
                    entry.Headers[key] = values[0]
                }
            }
        }

        writeEntry(logger, entry)
    })

    c.OnError(func(r *colly.Response, err error) {
        // r is dereferenced for the URL and method, so no nil check is made
        // afterwards: it could never take effect.
        writeEntry(logger, RequestLog{
            URL:        r.Request.URL.String(),
            Method:     r.Request.Method,
            StatusCode: r.StatusCode,
            Duration:   elapsedSince(r.Ctx),
            Error:      err.Error(),
            Timestamp:  time.Now(),
        })
    })

    // Log instead of log.Fatal so the deferred logFile.Close still runs.
    if err := c.Visit("https://example.com"); err != nil {
        log.Printf("visit failed: %v", err)
    }
}

// elapsedSince returns the time passed since the "start_time" value stored in
// ctx, or 0 when that value is missing or is not a time.Time.
func elapsedSince(ctx *colly.Context) time.Duration {
    start, ok := ctx.GetAny("start_time").(time.Time)
    if !ok {
        return 0
    }
    return time.Since(start)
}

// writeEntry marshals entry to JSON and writes it as one line to logger.
// A marshalling failure is reported on the standard logger and the entry is
// skipped rather than written as an empty line.
func writeEntry(logger *log.Logger, entry RequestLog) {
    data, err := json.Marshal(entry)
    if err != nil {
        log.Printf("encoding request log for %s: %v", entry.URL, err)
        return
    }
    logger.Println(string(data))
}

Advanced Debugging Techniques

Network Request Monitoring

Monitor network-level details including DNS resolution and connection times:

package main

import (
    "crypto/tls"
    "fmt"
    "net/http"
    "net/http/httptrace"
    "time"

    "github.com/gocolly/colly/v2"
)

// main visits the example site through a transport that prints DNS,
// connection and TLS handshake events for every HTTP round trip.
func main() {
    c := colly.NewCollector()

    c.OnRequest(func(r *colly.Request) {
        r.Headers.Set("User-Agent", "Colly Debug Bot")
    })

    // The trace has to be attached to the *http.Request's context.Context.
    // colly.Request.Ctx is Colly's own key/value store, not a
    // context.Context, so the hook lives in a RoundTripper instead of an
    // OnRequest callback. The default transport keeps TLS certificate
    // verification enabled.
    c.WithTransport(&tracingTransport{base: http.DefaultTransport})

    if err := c.Visit("https://example.com"); err != nil {
        fmt.Printf("Visit failed: %v\n", err)
    }
}

// tracingTransport wraps another RoundTripper and attaches an
// httptrace.ClientTrace to every request that passes through it.
type tracingTransport struct {
    base http.RoundTripper
}

// RoundTrip performs req via the wrapped transport with tracing enabled and
// prints how long the round trip took. The caller's request is not modified:
// WithContext returns a shallow copy that carries the tracing context.
func (t *tracingTransport) RoundTrip(req *http.Request) (*http.Response, error) {
    start := time.Now()
    ctx := httptrace.WithClientTrace(req.Context(), newClientTrace())
    resp, err := t.base.RoundTrip(req.WithContext(ctx))
    fmt.Printf("Round trip to %s finished in %v\n", req.URL, time.Since(start))
    return resp, err
}

// newClientTrace builds the set of hooks that print network-level events.
// The DNS, connect and TLS hooks fire only when a new connection is
// established, not when a pooled connection is reused.
func newClientTrace() *httptrace.ClientTrace {
    return &httptrace.ClientTrace{
        DNSStart: func(info httptrace.DNSStartInfo) {
            fmt.Printf("DNS lookup started for %s\n", info.Host)
        },
        DNSDone: func(info httptrace.DNSDoneInfo) {
            fmt.Printf("DNS lookup completed: %v\n", info.Addrs)
        },
        ConnectStart: func(network, addr string) {
            fmt.Printf("Connection started to %s\n", addr)
        },
        ConnectDone: func(network, addr string, err error) {
            if err != nil {
                fmt.Printf("Connection failed to %s: %v\n", addr, err)
                return
            }
            fmt.Printf("Connection established to %s\n", addr)
        },
        TLSHandshakeStart: func() {
            fmt.Println("TLS handshake started")
        },
        TLSHandshakeDone: func(state tls.ConnectionState, err error) {
            if err != nil {
                fmt.Printf("TLS handshake failed: %v\n", err)
                return
            }
            fmt.Println("TLS handshake completed")
        },
    }
}

Memory and Performance Monitoring

Track memory usage and performance metrics:

package main

import (
    "fmt"
    "runtime"
    "time"

    "github.com/gocolly/colly/v2"
)

// main prints memory usage, goroutine count and request throughput while
// visiting the example site.
func main() {
    c := colly.NewCollector()

    // requestCount is a plain int shared by both callbacks.
    // NOTE(review): this assumes the collector runs callbacks sequentially;
    // switch to an atomic counter if the collector is made asynchronous.
    var requestCount int
    startTime := time.Now()

    c.OnRequest(func(r *colly.Request) {
        requestCount++

        var m runtime.MemStats
        runtime.ReadMemStats(&m)

        fmt.Printf("Request #%d to %s\n", requestCount, r.URL)
        fmt.Printf("Memory Usage: %.2f MB\n",
            float64(m.Alloc)/1024/1024)
        fmt.Printf("Goroutines: %d\n", runtime.NumGoroutine())
    })

    c.OnResponse(func(r *colly.Response) {
        elapsed := time.Since(startTime)
        rate := float64(requestCount) / elapsed.Seconds()

        fmt.Printf("Total requests: %d\n", requestCount)
        fmt.Printf("Rate: %.2f requests/second\n", rate)
        fmt.Printf("Total time: %v\n", elapsed)
    })

    // Without this check a failed visit prints no statistics and no reason.
    if err := c.Visit("https://example.com"); err != nil {
        fmt.Printf("Visit failed: %v\n", err)
    }
}

Debugging Common Issues

Handling Rate Limiting and Retries

Debug rate limiting issues and implement retry logic:

package main

import (
    "fmt"
    "log"
    "time"

    "github.com/gocolly/colly/v2"
)

// main visits the example site with rate limiting enabled and retries a
// failed request up to maxRetries times, waiting longer before each attempt.
func main() {
    const (
        startURL   = "https://example.com"
        maxRetries = 3
    )

    c := colly.NewCollector()

    // Add rate limiting. Limit returns an error for an invalid rule, so a bad
    // rule is reported instead of silently running without limits.
    if err := c.Limit(&colly.LimitRule{
        DomainGlob:  "*",
        Parallelism: 2,
        Delay:       1 * time.Second,
    }); err != nil {
        log.Fatal(fmt.Errorf("configuring rate limit: %w", err))
    }

    c.OnRequest(func(r *colly.Request) {
        log.Printf("Requesting: %s", r.URL)

        // Seed the retry counter on the first attempt only; the OnError
        // callback increments it before each retry.
        if r.Ctx.GetAny("retry_count") == nil {
            r.Ctx.Put("retry_count", 0)
        }
    })

    c.OnError(func(r *colly.Response, err error) {
        // Comma-ok assertion: a missing counter is treated as attempt 0
        // instead of panicking inside the error handler.
        retryCount, _ := r.Ctx.GetAny("retry_count").(int)

        log.Printf("Error on %s (attempt %d): %v",
            r.Request.URL, retryCount+1, err)

        if retryCount >= maxRetries {
            log.Printf("Max retries exceeded for %s", r.Request.URL)
            return
        }

        log.Printf("Retrying request to %s", r.Request.URL)
        r.Request.Ctx.Put("retry_count", retryCount+1)

        // Wait before retry: 1s, 2s, 3s for successive attempts.
        time.Sleep(time.Duration(retryCount+1) * time.Second)
        r.Request.Retry()
    })

    if err := c.Visit(startURL); err != nil {
        log.Fatal(fmt.Errorf("visiting %s: %w", startURL, err))
    }
}

Cookie and Session Debugging

Debug cookie handling and session management:

package main

import (
    "fmt"
    "net/http"
    "net/http/cookiejar"
    "net/url"

    "github.com/gocolly/colly/v2"
)

// main visits the example site with a cookie jar installed and prints the
// cookies sent with each request, the Set-Cookie headers received, and the
// jar contents after each response.
func main() {
    c := colly.NewCollector()

    // Enable cookie jar
    jar, err := cookiejar.New(nil)
    if err != nil {
        fmt.Printf("Creating cookie jar failed: %v\n", err)
        return
    }
    c.SetCookieJar(jar)

    c.OnRequest(func(r *colly.Request) {
        fmt.Printf("Request to: %s\n", r.URL)

        // Log cookies being sent
        printJarCookies("Sending cookies:", jar, r.URL)
    })

    c.OnResponse(func(r *colly.Response) {
        fmt.Printf("Response from: %s\n", r.Request.URL)

        // Log cookies received. r.Headers is a *http.Header, so it cannot be
        // indexed like a map; Values returns every Set-Cookie line.
        if r.Headers != nil {
            if cookies := r.Headers.Values("Set-Cookie"); len(cookies) > 0 {
                fmt.Println("Received cookies:")
                for _, cookie := range cookies {
                    fmt.Printf("  %s\n", cookie)
                }
            }
        }

        // Show current cookie jar state. r.Request.URL is already a
        // *url.URL, so it is passed to the jar without re-parsing.
        printJarCookies("Current cookie jar:", jar, r.Request.URL)
    })

    if err := c.Visit("https://example.com"); err != nil {
        fmt.Printf("Visit failed: %v\n", err)
    }
}

// printJarCookies prints label followed by the name and value of every cookie
// the jar would send to u; it prints nothing when there are none. Expiry is
// not printed because cookiejar returns request cookies with only the name
// and value populated.
func printJarCookies(label string, jar http.CookieJar, u *url.URL) {
    cookies := jar.Cookies(u)
    if len(cookies) == 0 {
        return
    }
    fmt.Println(label)
    for _, cookie := range cookies {
        fmt.Printf("  %s=%s\n", cookie.Name, cookie.Value)
    }
}

Integration with External Tools

Structured Logging with Logrus

Use structured logging for better log analysis:

package main

import (
    "time"

    "github.com/gocolly/colly/v2"
    "github.com/sirupsen/logrus"
)

// main emits one structured JSON log record per request start, completion
// and failure while visiting the example site.
func main() {
    // Configure logrus
    logger := logrus.New()
    logger.SetFormatter(&logrus.JSONFormatter{})

    c := colly.NewCollector()

    c.OnRequest(func(r *colly.Request) {
        logger.WithFields(logrus.Fields{
            "event":  "request_start",
            "url":    r.URL.String(),
            "method": r.Method,
        }).Info("Starting request")

        r.Ctx.Put("start_time", time.Now())
    })

    c.OnResponse(func(r *colly.Response) {
        fields := logrus.Fields{
            "event":          "request_complete",
            "url":            r.Request.URL.String(),
            "status_code":    r.StatusCode,
            "content_length": len(r.Body),
        }

        // Comma-ok assertion: when "start_time" is missing the record is
        // still written, just without a duration, instead of panicking.
        if startTime, ok := r.Ctx.GetAny("start_time").(time.Time); ok {
            fields["duration_ms"] = time.Since(startTime).Milliseconds()
        }

        logger.WithFields(fields).Info("Request completed")
    })

    c.OnError(func(r *colly.Response, err error) {
        logger.WithFields(logrus.Fields{
            "event":       "request_error",
            "url":         r.Request.URL.String(),
            "status_code": r.StatusCode,
            "error":       err.Error(),
        }).Error("Request failed")
    })

    // Fatal logs the record and then exits with a non-zero status.
    if err := c.Visit("https://example.com"); err != nil {
        logger.WithField("error", err.Error()).Fatal("Visit failed")
    }
}

Best Practices for Debugging

Environment-based Logging

Implement different logging levels for development and production:

package main

import (
    "os"
    "strings"

    "github.com/gocolly/colly/v2"
    "github.com/gocolly/colly/v2/debug"
    "github.com/sirupsen/logrus"
)

// setupCollector builds a collector configured for the current environment.
//
// The ENVIRONMENT variable is compared case-insensitively: "development" and
// "debug" produce a collector with Colly's LogDebugger attached and set the
// global logrus level to Debug. Any other value, including an unset variable,
// produces a plain collector and sets the logrus level to Warn.
func setupCollector() *colly.Collector {
    switch strings.ToLower(os.Getenv("ENVIRONMENT")) {
    case "development", "debug":
        // Verbose mode: every collector event is logged.
        verbose := colly.NewCollector(colly.Debugger(&debug.LogDebugger{}))
        logrus.SetLevel(logrus.DebugLevel)
        return verbose
    default:
        // Production mode: no debugger, warnings and above only.
        quiet := colly.NewCollector()
        logrus.SetLevel(logrus.WarnLevel)
        return quiet
    }
}

// main builds the environment-specific collector, logs each page title it
// finds, and exits with a non-zero status if the visit fails.
func main() {
    c := setupCollector()

    // Add your scraping logic
    c.OnHTML("title", func(e *colly.HTMLElement) {
        logrus.Info("Found title: ", e.Text)
    })

    // Fatal is logged at every configured level, so a failed visit is
    // reported in production (Warn level) as well as in development.
    if err := c.Visit("https://example.com"); err != nil {
        logrus.WithField("error", err.Error()).Fatal("Visit failed")
    }
}

Debugging Colly scrapers effectively requires a combination of built-in features, custom logging, and monitoring techniques. By implementing comprehensive logging and debugging strategies, you can identify issues quickly, optimize performance, and ensure reliable web scraping operations. Similar to how you monitor network requests in Puppeteer, monitoring HTTP requests in Colly provides valuable insights into your scraper's behavior and helps troubleshoot connectivity issues.

Remember to adjust your debugging level based on your environment and always implement proper error handling to make your scrapers more robust and maintainable.

Try WebScraping.AI for Your Web Scraping Needs

Looking for a powerful web scraping solution? WebScraping.AI provides an LLM-powered API that combines Chromium JavaScript rendering with rotating proxies for reliable data extraction.

Key Features:

  • AI-powered extraction: Ask questions about web pages or extract structured data fields
  • JavaScript rendering: Full Chromium browser support for dynamic content
  • Rotating proxies: Datacenter and residential proxies from multiple countries
  • Easy integration: Simple REST API with SDKs for Python, Ruby, PHP, and more
  • Reliable & scalable: Built for developers who need consistent results

Getting Started:

Get page content with AI analysis:

# -G sends the --data-urlencode pairs as a URL-encoded query string, so the
# spaces and "?" in the question do not produce a malformed URL.
curl -G "https://api.webscraping.ai/ai/question" \
  --data-urlencode "url=https://example.com" \
  --data-urlencode "question=What is the main topic?" \
  --data-urlencode "api_key=YOUR_API_KEY"

Extract structured data:

# Passing the fields[...] parameters with -G/--data-urlencode keeps the square
# brackets out of the URL argument (where curl would treat them as a glob
# range) and URL-encodes the spaces in the field descriptions.
curl -G "https://api.webscraping.ai/ai/fields" \
  --data-urlencode "url=https://example.com" \
  --data-urlencode "fields[title]=Page title" \
  --data-urlencode "fields[price]=Product price" \
  --data-urlencode "api_key=YOUR_API_KEY"

Try in request builder

Related Questions

Get Started Now

WebScraping.AI provides rotating proxies, Chromium rendering and built-in HTML parser for web scraping
Icon