How do I debug Colly scrapers and log requests?
Debugging Colly scrapers is essential for identifying issues, monitoring performance, and ensuring your web scraping projects run smoothly. Colly provides several built-in debugging features and allows for custom logging implementations to help you track requests, responses, and potential errors.
Built-in Debugging Features
Enabling Debug Mode
Colly offers a simple way to enable debug mode, which automatically logs all requests and responses:
package main
import (
"fmt"
"log"
"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/debug"
)
// main runs a collector with Colly's built-in LogDebugger attached, prints
// every requested URL and every <title> text it finds, and exits with a
// non-zero status when the start page cannot be visited.
func main() {
	const startURL = "https://example.com"

	c := colly.NewCollector(
		colly.Debugger(&debug.LogDebugger{}),
	)

	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL)
	})

	c.OnHTML("title", func(e *colly.HTMLElement) {
		fmt.Println("Title:", e.Text)
	})

	// Visit returns an error for a failed start request; report it instead of
	// exiting silently.
	if err := c.Visit(startURL); err != nil {
		log.Fatalf("visiting %s: %v", startURL, err)
	}
}
Custom Debug Logger
For more control over debug output, you can implement a custom debugger:
package main
import (
"fmt"
"log"
"os"
"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/debug"
)
// CustomDebugger is a debugger that writes its output through a dedicated
// standard-library logger.
type CustomDebugger struct {
	logger *log.Logger // created by Init
}

// Init creates the logger: it writes to standard output, prefixes each line
// with "[COLLY DEBUG] " and stamps it with date and time. It always returns
// nil.
func (d *CustomDebugger) Init() error {
	const prefix = "[COLLY DEBUG] "

	stdout := log.New(os.Stdout, prefix, log.Ldate|log.Ltime)
	d.logger = stdout

	return nil
}
// Event logs one collector event: its type, the ID of the request it belongs
// to, and the URL when the event carries one.
//
// debug.Event has no Request field; per-event details are delivered in the
// Values map, where the URL is stored under the "url" key. An event without
// that key is logged with an empty URL.
func (d *CustomDebugger) Event(e *debug.Event) {
	d.logger.Printf("Type: %s, RequestID: %d, URL: %s",
		e.Type, e.RequestID, e.Values["url"])
}
// main attaches a CustomDebugger to a new collector and visits the start
// page, exiting with a non-zero status if that visit fails.
func main() {
	const startURL = "https://example.com"

	c := colly.NewCollector(
		colly.Debugger(&CustomDebugger{}),
	)

	// Your scraping logic here

	// Wrap the error with the URL so the failure message says what was being
	// fetched.
	if err := c.Visit(startURL); err != nil {
		log.Fatal(fmt.Errorf("visiting %s: %w", startURL, err))
	}
}
Request and Response Logging
Comprehensive Request Logging
Track all aspects of HTTP requests including headers, timing, and response codes:
package main
import (
"fmt"
"log"
"time"
"github.com/gocolly/colly/v2"
)
// main logs every request (URL, method, headers), every response (status,
// body size, duration, headers) and every error of a single visit.
//
// The request start time travels from OnRequest to OnResponse through the
// request context under the "start_time" key.
func main() {
	const startURL = "https://example.com"

	c := colly.NewCollector()

	// Log before request is sent
	c.OnRequest(func(r *colly.Request) {
		log.Printf("Requesting: %s", r.URL)
		log.Printf("Method: %s", r.Method)
		log.Printf("Headers: %v", r.Headers)
		// Add timestamp to context for duration calculation
		r.Ctx.Put("start_time", time.Now())
	})

	// Log response details
	c.OnResponse(func(r *colly.Response) {
		// Checked assertion: a missing or mistyped context value must not
		// panic inside the callback.
		startTime, timed := r.Ctx.GetAny("start_time").(time.Time)
		duration := time.Since(startTime)

		log.Printf("Response from %s:", r.Request.URL)
		log.Printf("Status Code: %d", r.StatusCode)
		log.Printf("Content Length: %d bytes", len(r.Body))
		if timed {
			log.Printf("Duration: %v", duration)
		} else {
			log.Printf("Duration: unknown (no start_time in context)")
		}
		log.Printf("Response Headers: %v", r.Headers)
	})

	// Log errors
	c.OnError(func(r *colly.Response, err error) {
		log.Printf("Error on %s: %v", r.Request.URL, err)
		log.Printf("Status Code: %d", r.StatusCode)
	})

	if err := c.Visit(startURL); err != nil {
		log.Fatal(fmt.Errorf("visiting %s: %w", startURL, err))
	}
}
File-based Logging
Store logs in files for later analysis:
package main
import (
"encoding/json"
"log"
"os"
"time"
"github.com/gocolly/colly/v2"
)
// RequestLog is one JSON log record describing a single request and its
// outcome.
type RequestLog struct {
	// URL is the address that was requested.
	URL string `json:"url"`
	// Method is the HTTP method of the request.
	Method string `json:"method"`
	// StatusCode is the HTTP status of the response.
	StatusCode int `json:"status_code"`
	// Duration is the elapsed time of the request; encoding/json writes it
	// as an integer number of nanoseconds.
	Duration time.Duration `json:"duration"`
	// Error is the failure message; it is left out of the JSON when empty.
	Error string `json:"error,omitempty"`
	// Timestamp is the moment the record was created.
	Timestamp time.Time `json:"timestamp"`
	// Headers holds one value per response header name.
	Headers map[string]string `json:"headers"`
}
// main appends one JSON line per finished request to colly_requests.log.
//
// Successful responses are recorded with status, duration and the first value
// of every response header; failures are recorded with the error text. The
// request start time travels through the request context under the
// "start_time" key.
func main() {
	const startURL = "https://example.com"

	// Create log file
	logFile, err := os.OpenFile("colly_requests.log",
		os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666)
	if err != nil {
		log.Fatal(err)
	}
	defer logFile.Close()

	logger := log.New(logFile, "", 0)

	// elapsed returns the time since the request started, or 0 when the
	// context is missing or holds no usable start time (checked assertion, so
	// the callbacks cannot panic on it).
	elapsed := func(ctx *colly.Context) time.Duration {
		if ctx == nil {
			return 0
		}
		start, ok := ctx.GetAny("start_time").(time.Time)
		if !ok {
			return 0
		}
		return time.Since(start)
	}

	// writeEntry encodes entry as JSON and appends it to the log file; an
	// encoding failure is reported on the default logger instead of writing
	// an empty line.
	writeEntry := func(entry RequestLog) {
		line, encErr := json.Marshal(entry)
		if encErr != nil {
			log.Printf("encoding log entry for %s: %v", entry.URL, encErr)
			return
		}
		logger.Println(string(line))
	}

	c := colly.NewCollector()

	c.OnRequest(func(r *colly.Request) {
		r.Ctx.Put("start_time", time.Now())
	})

	c.OnResponse(func(r *colly.Response) {
		entry := RequestLog{
			URL:        r.Request.URL.String(),
			Method:     r.Request.Method,
			StatusCode: r.StatusCode,
			Duration:   elapsed(r.Ctx),
			Timestamp:  time.Now(),
			Headers:    make(map[string]string),
		}
		// Convert headers to map. Response.Headers is a *http.Header, so it
		// has to be dereferenced before it can be ranged over.
		if r.Headers != nil {
			for key, values := range *r.Headers {
				if len(values) > 0 {
					entry.Headers[key] = values[0]
				}
			}
		}
		writeEntry(entry)
	})

	c.OnError(func(r *colly.Response, err error) {
		// Check the response before touching any of its fields.
		if r == nil || r.Request == nil {
			writeEntry(RequestLog{
				Error:     err.Error(),
				Timestamp: time.Now(),
			})
			return
		}
		writeEntry(RequestLog{
			URL:        r.Request.URL.String(),
			Method:     r.Request.Method,
			StatusCode: r.StatusCode,
			Duration:   elapsed(r.Ctx),
			Error:      err.Error(),
			Timestamp:  time.Now(),
		})
	})

	// Printf rather than Fatal so the deferred Close above still runs.
	if err := c.Visit(startURL); err != nil {
		log.Printf("visiting %s: %v", startURL, err)
	}
}
Advanced Debugging Techniques
Network Request Monitoring
Monitor network-level details including DNS resolution and connection times:
package main
import (
"crypto/tls"
"fmt"
"net/http"
"net/http/httptrace"
"time"
"github.com/gocolly/colly/v2"
)
// tracingTransport is an http.RoundTripper that attaches a logging
// httptrace.ClientTrace to every request before handing it to next.
type tracingTransport struct {
	next http.RoundTripper
}

// RoundTrip implements http.RoundTripper. The trace is attached to a copy of
// the request (req.WithContext), so the caller's request is not modified.
func (t *tracingTransport) RoundTrip(req *http.Request) (*http.Response, error) {
	ctx := httptrace.WithClientTrace(req.Context(), newLoggingTrace())
	return t.next.RoundTrip(req.WithContext(ctx))
}

// newLoggingTrace returns a ClientTrace that prints DNS, connection and TLS
// handshake events to standard output.
func newLoggingTrace() *httptrace.ClientTrace {
	return &httptrace.ClientTrace{
		DNSStart: func(info httptrace.DNSStartInfo) {
			fmt.Printf("DNS lookup started for %s\n", info.Host)
		},
		DNSDone: func(info httptrace.DNSDoneInfo) {
			fmt.Printf("DNS lookup completed: %v\n", info.Addrs)
		},
		ConnectStart: func(network, addr string) {
			fmt.Printf("Connection started to %s\n", addr)
		},
		ConnectDone: func(network, addr string, err error) {
			if err != nil {
				fmt.Printf("Connection failed to %s: %v\n", addr, err)
				return
			}
			fmt.Printf("Connection established to %s\n", addr)
		},
		TLSHandshakeStart: func() {
			fmt.Println("TLS handshake started")
		},
		TLSHandshakeDone: func(state tls.ConnectionState, err error) {
			if err != nil {
				fmt.Printf("TLS handshake failed: %v\n", err)
				return
			}
			fmt.Println("TLS handshake completed")
		},
	}
}

// main visits the start page through a transport that prints network-level
// trace events (DNS lookup, TCP connect, TLS handshake) for every request.
//
// httptrace hooks live on the *http.Request's context.Context. Colly's
// Request.Ctx is its own key/value store, not a context.Context, so the trace
// cannot be attached in OnRequest; it is attached in a RoundTripper wrapper
// that is installed with Collector.WithTransport.
func main() {
	const startURL = "https://example.com"

	c := colly.NewCollector()

	// Certificate verification stays enabled: tracing does not need
	// InsecureSkipVerify, and turning it off would hide the TLS failures this
	// example is meant to help debug. The timeouts keep a stalled handshake
	// or a silent server from hanging the run.
	base := &http.Transport{
		TLSHandshakeTimeout:   10 * time.Second,
		ResponseHeaderTimeout: 30 * time.Second,
	}
	c.WithTransport(&tracingTransport{next: base})

	c.OnRequest(func(r *colly.Request) {
		r.Headers.Set("User-Agent", "Colly Debug Bot")
	})

	if err := c.Visit(startURL); err != nil {
		fmt.Printf("Visit to %s failed: %v\n", startURL, err)
	}
}
Memory and Performance Monitoring
Track memory usage and performance metrics:
package main
import (
"fmt"
"runtime"
"time"
"github.com/gocolly/colly/v2"
)
// main prints memory usage and goroutine count before every request, and the
// running request total, request rate and elapsed time after every response.
func main() {
	const startURL = "https://example.com"

	c := colly.NewCollector()

	var requestCount int
	startTime := time.Now()

	c.OnRequest(func(r *colly.Request) {
		requestCount++

		var m runtime.MemStats
		runtime.ReadMemStats(&m)
		allocMB := float64(m.Alloc) / 1024 / 1024

		fmt.Printf("Request #%d to %s\n", requestCount, r.URL)
		fmt.Printf("Memory Usage: %.2f MB\n", allocMB)
		fmt.Printf("Goroutines: %d\n", runtime.NumGoroutine())
	})

	c.OnResponse(func(r *colly.Response) {
		elapsed := time.Since(startTime)
		rate := float64(requestCount) / elapsed.Seconds()

		fmt.Printf("Total requests: %d\n", requestCount)
		fmt.Printf("Rate: %.2f requests/second\n", rate)
		fmt.Printf("Total time: %v\n", elapsed)
	})

	// Report a failed start request instead of ending without any output.
	if err := c.Visit(startURL); err != nil {
		fmt.Printf("Visit to %s failed: %v\n", startURL, err)
	}
}
Debugging Common Issues
Handling Rate Limiting and Retries
Debug rate limiting issues and implement retry logic:
package main
import (
"fmt"
"log"
"time"
"github.com/gocolly/colly/v2"
)
// main visits the start page with a rate limit in place and retries failed
// requests up to maxRetries times, waiting one second longer before each
// further attempt.
//
// The number of retries already made travels with the request in its context
// under the "retry_count" key.
func main() {
	const (
		startURL   = "https://example.com"
		maxRetries = 3
	)

	c := colly.NewCollector()

	// Add rate limiting
	limitErr := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Parallelism: 2,
		Delay:       1 * time.Second,
	})
	if limitErr != nil {
		log.Fatal(fmt.Errorf("installing limit rule: %w", limitErr))
	}

	c.OnRequest(func(r *colly.Request) {
		log.Printf("Requesting: %s", r.URL)
		// Add retry counter to context on the first attempt only, so a retry
		// keeps the count written by OnError.
		if r.Ctx.GetAny("retry_count") == nil {
			r.Ctx.Put("retry_count", 0)
		}
	})

	c.OnError(func(r *colly.Response, err error) {
		// Checked assertion: a missing counter is treated as zero retries
		// instead of panicking inside the callback.
		retryCount, _ := r.Ctx.GetAny("retry_count").(int)

		log.Printf("Error on %s (attempt %d): %v",
			r.Request.URL, retryCount+1, err)

		if retryCount >= maxRetries {
			log.Printf("Max retries exceeded for %s", r.Request.URL)
			return
		}

		log.Printf("Retrying request to %s", r.Request.URL)
		r.Request.Ctx.Put("retry_count", retryCount+1)
		// Wait before retry
		time.Sleep(time.Duration(retryCount+1) * time.Second)
		// NOTE(review): a failed retry appears to re-enter this handler, so
		// the returned error is not logged a second time here; confirm
		// against colly's Request.Retry implementation.
		_ = r.Request.Retry()
	})

	if err := c.Visit(startURL); err != nil {
		log.Fatal(fmt.Errorf("visiting %s: %w", startURL, err))
	}
}
Cookie and Session Debugging
Debug cookie handling and session management:
package main
import (
"fmt"
"net/http"
"net/http/cookiejar"
"net/url"
"github.com/gocolly/colly/v2"
)
// printJarCookies prints heading followed by one " name=value" line for every
// cookie the jar holds for u. It prints nothing when the jar has no cookie
// for u.
func printJarCookies(heading string, jar http.CookieJar, u *url.URL) {
	cookies := jar.Cookies(u)
	if len(cookies) == 0 {
		return
	}
	fmt.Println(heading)
	for _, cookie := range cookies {
		fmt.Printf(" %s=%s\n", cookie.Name, cookie.Value)
	}
}

// main visits the start page with an explicit cookie jar and prints the
// cookies sent with each request, the Set-Cookie headers of each response and
// the jar contents afterwards.
func main() {
	const startURL = "https://example.com"

	c := colly.NewCollector()

	// Enable cookie jar
	jar, err := cookiejar.New(nil)
	if err != nil {
		fmt.Printf("Creating cookie jar failed: %v\n", err)
		return
	}
	c.SetCookieJar(jar)

	c.OnRequest(func(r *colly.Request) {
		fmt.Printf("Request to: %s\n", r.URL)
		// Log cookies being sent
		printJarCookies("Sending cookies:", jar, r.URL)
	})

	c.OnResponse(func(r *colly.Response) {
		fmt.Printf("Response from: %s\n", r.Request.URL)

		// Log cookies received. Response.Headers is a *http.Header, so the
		// values are read through its Values method rather than by indexing.
		if cookies := r.Headers.Values("Set-Cookie"); len(cookies) > 0 {
			fmt.Println("Received cookies:")
			for _, cookie := range cookies {
				fmt.Printf(" %s\n", cookie)
			}
		}

		// Show current cookie jar state. The request URL is already a
		// *url.URL, so it is used directly. Cookies returned by the jar only
		// carry name and value, so no expiry is printed here; the raw
		// Set-Cookie lines above show it.
		printJarCookies("Current cookie jar:", jar, r.Request.URL)
	})

	if err := c.Visit(startURL); err != nil {
		fmt.Printf("Visit to %s failed: %v\n", startURL, err)
	}
}
Integration with External Tools
Structured Logging with Logrus
Use structured logging for better log analysis:
package main
import (
"time"
"github.com/gocolly/colly/v2"
"github.com/sirupsen/logrus"
)
// main emits one JSON log record when a request starts, one when it completes
// and one when it fails.
//
// The request start time travels from OnRequest to OnResponse through the
// request context under the "start_time" key.
func main() {
	const startURL = "https://example.com"

	// Configure logrus
	logger := logrus.New()
	logger.SetFormatter(&logrus.JSONFormatter{})

	c := colly.NewCollector()

	c.OnRequest(func(r *colly.Request) {
		logger.WithFields(logrus.Fields{
			"event":  "request_start",
			"url":    r.URL.String(),
			"method": r.Method,
		}).Info("Starting request")
		r.Ctx.Put("start_time", time.Now())
	})

	c.OnResponse(func(r *colly.Response) {
		fields := logrus.Fields{
			"event":          "request_complete",
			"url":            r.Request.URL.String(),
			"status_code":    r.StatusCode,
			"content_length": len(r.Body),
		}
		// Checked assertion: the duration is only reported when a start time
		// was recorded, instead of panicking on a missing context value.
		if startTime, ok := r.Ctx.GetAny("start_time").(time.Time); ok {
			fields["duration_ms"] = time.Since(startTime).Milliseconds()
		}
		logger.WithFields(fields).Info("Request completed")
	})

	c.OnError(func(r *colly.Response, err error) {
		logger.WithFields(logrus.Fields{
			"event": "request_error",
			"url":   r.Request.URL.String(),
			"error": err.Error(),
		}).Error("Request failed")
	})

	if err := c.Visit(startURL); err != nil {
		logger.WithFields(logrus.Fields{
			"event": "visit_error",
			"url":   startURL,
			"error": err.Error(),
		}).Error("Visit failed")
	}
}
Best Practices for Debugging
Environment-based Logging
Implement different logging levels for development and production:
package main
import (
"os"
"strings"
"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/debug"
"github.com/sirupsen/logrus"
)
// setupCollector builds a collector configured for the environment named by
// the ENVIRONMENT variable (compared case-insensitively).
//
// For "development" or "debug" the collector gets Colly's LogDebugger and the
// global logrus level is set to Debug; for any other value, including an
// unset variable, a plain collector is returned and the level is set to Warn.
func setupCollector() *colly.Collector {
	switch strings.ToLower(os.Getenv("ENVIRONMENT")) {
	case "development", "debug":
		// Enable verbose debugging in development
		verbose := colly.NewCollector(
			colly.Debugger(&debug.LogDebugger{}),
		)
		logrus.SetLevel(logrus.DebugLevel)
		return verbose
	default:
		// Production setup with minimal logging
		quiet := colly.NewCollector()
		logrus.SetLevel(logrus.WarnLevel)
		return quiet
	}
}
// main builds the environment-specific collector, logs every <title> text at
// Info level and visits the start page.
//
// Outside development the global logrus level is Warn, so the title lines are
// suppressed while a failed visit is still reported.
func main() {
	const startURL = "https://example.com"

	c := setupCollector()

	// Add your scraping logic
	c.OnHTML("title", func(e *colly.HTMLElement) {
		logrus.Info("Found title: ", e.Text)
	})

	if err := c.Visit(startURL); err != nil {
		logrus.WithError(err).Error("Visit failed: ", startURL)
	}
}
Debugging Colly scrapers effectively requires a combination of built-in features, custom logging, and monitoring techniques. By implementing comprehensive logging and debugging strategies, you can identify issues quickly, optimize performance, and ensure reliable web scraping operations. Similar to how you monitor network requests in Puppeteer, monitoring HTTP requests in Colly provides valuable insights into your scraper's behavior and helps troubleshoot connectivity issues.
Remember to adjust your debugging level based on your environment and always implement proper error handling to make your scrapers more robust and maintainable.