How Do I Handle Pagination in Go Web Scraping?
Pagination is one of the most common challenges in web scraping. Websites split content across multiple pages to improve performance and user experience, but this adds complexity for scrapers. In Go, there are several effective strategies for handling the different pagination patterns you will encounter.
Understanding Pagination Types
Before diving into implementation, it's important to understand the main pagination patterns you'll encounter:
- Numbered pagination - Traditional page numbers (1, 2, 3...)
- Next/Previous buttons - Links to navigate between pages
- Infinite scroll - Dynamic loading of content as you scroll
- Load more buttons - Click-based content loading
- Offset-based pagination - URL parameters like ?offset=40&limit=20
Basic Pagination Setup in Go
Let's start with a foundational structure for handling pagination:
package main

import (
    "fmt"
    "log"
    "net/http"
    "net/url"
    "time"

    "github.com/PuerkitoBio/goquery"
)

type Scraper struct {
    client      *http.Client
    baseURL     string
    currentPage int
    maxPages    int
    delay       time.Duration
}

func NewScraper(baseURL string, maxPages int) *Scraper {
    return &Scraper{
        client: &http.Client{
            Timeout: 30 * time.Second,
        },
        baseURL:     baseURL,
        currentPage: 1,
        maxPages:    maxPages,
        delay:       1 * time.Second,
    }
}

func (s *Scraper) fetchPage(url string) (*goquery.Document, error) {
    // Add delay to respect rate limits
    time.Sleep(s.delay)

    resp, err := s.client.Get(url)
    if err != nil {
        return nil, fmt.Errorf("failed to fetch page: %w", err)
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
    }

    doc, err := goquery.NewDocumentFromReader(resp.Body)
    if err != nil {
        return nil, fmt.Errorf("failed to parse HTML: %w", err)
    }
    return doc, nil
}
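Before adding any pagination logic, it helps to confirm that fetching and parsing work on a single page. Below is a minimal usage sketch, assuming it lives in the same package as the code above; the URL and the h2.title selector are placeholders for your target site.
func runBasicFetch() {
    s := NewScraper("https://example.com/products", 5) // placeholder URL
    doc, err := s.fetchPage(s.baseURL)
    if err != nil {
        log.Fatal(err)
    }
    // Print the text of each matching element; adjust the selector to the real markup.
    doc.Find("h2.title").Each(func(i int, sel *goquery.Selection) {
        fmt.Println(sel.Text())
    })
}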
Handling Numbered Pagination
This is the most straightforward pagination pattern where pages are accessed via URL parameters:
func (s *Scraper) scrapeNumberedPagination() error {
    for page := 1; page <= s.maxPages; page++ {
        url := fmt.Sprintf("%s?page=%d", s.baseURL, page)

        doc, err := s.fetchPage(url)
        if err != nil {
            log.Printf("Error fetching page %d: %v", page, err)
            continue
        }

        // Extract data from the current page
        data := s.extractData(doc)
        if len(data) == 0 {
            log.Printf("No data found on page %d, stopping", page)
            break
        }

        // Process the extracted data
        s.processData(data)
        log.Printf("Successfully scraped page %d", page)
    }
    return nil
}

func (s *Scraper) extractData(doc *goquery.Document) []string {
    var items []string
    doc.Find(".item-selector").Each(func(i int, sel *goquery.Selection) {
        text := sel.Text()
        if text != "" {
            items = append(items, text)
        }
    })
    return items
}

func (s *Scraper) processData(data []string) {
    for _, item := range data {
        fmt.Printf("Found item: %s\n", item)
    }
}
Dynamic Next Page Detection
For more robust pagination handling, implement dynamic next page detection:
func (s *Scraper) scrapeWithNextPageDetection() error {
    currentURL := s.baseURL
    pageCount := 0

    for pageCount < s.maxPages {
        doc, err := s.fetchPage(currentURL)
        if err != nil {
            return fmt.Errorf("failed to fetch page: %w", err)
        }

        // Extract data from current page
        data := s.extractData(doc)
        if len(data) == 0 {
            log.Println("No more data found, stopping pagination")
            break
        }

        s.processData(data)
        pageCount++

        // Find next page URL
        nextURL, exists := s.findNextPageURL(doc)
        if !exists {
            log.Println("No next page found, pagination complete")
            break
        }

        currentURL = nextURL
        log.Printf("Moving to next page: %s", currentURL)
    }
    return nil
}

func (s *Scraper) findNextPageURL(doc *goquery.Document) (string, bool) {
    // Look for common next page selectors
    selectors := []string{
        "a[rel='next']",
        ".next-page",
        ".pagination .next",
        "a:contains('Next')",
        "a:contains('→')",
    }

    for _, selector := range selectors {
        nextLink := doc.Find(selector).First()
        if nextLink.Length() > 0 {
            href, exists := nextLink.Attr("href")
            if exists && href != "" {
                return s.resolveURL(href), true
            }
        }
    }
    return "", false
}

// resolveURL turns relative hrefs into absolute URLs against the base URL.
func (s *Scraper) resolveURL(href string) string {
    base, err := url.Parse(s.baseURL)
    if err != nil {
        return href
    }
    ref, err := url.Parse(href)
    if err != nil {
        return href
    }
    return base.ResolveReference(ref).String()
}
Advanced Pagination with Context and Cancellation
For production applications, implement proper context handling and cancellation:
// Add these to the existing import block:
import (
    "context"
    "sync"
)

func (s *Scraper) scrapeWithContext(ctx context.Context, results chan<- []string) error {
    defer close(results)

    currentURL := s.baseURL
    pageCount := 0

    for pageCount < s.maxPages {
        select {
        case <-ctx.Done():
            return ctx.Err()
        default:
        }

        doc, err := s.fetchPageWithContext(ctx, currentURL)
        if err != nil {
            return err
        }

        data := s.extractData(doc)
        if len(data) == 0 {
            break
        }

        // Send data to channel
        select {
        case results <- data:
        case <-ctx.Done():
            return ctx.Err()
        }

        nextURL, exists := s.findNextPageURL(doc)
        if !exists {
            break
        }
        currentURL = nextURL
        pageCount++
    }
    return nil
}

func (s *Scraper) fetchPageWithContext(ctx context.Context, url string) (*goquery.Document, error) {
    req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
    if err != nil {
        return nil, err
    }

    // Respect the delay without ignoring cancellation
    select {
    case <-time.After(s.delay):
    case <-ctx.Done():
        return nil, ctx.Err()
    }

    resp, err := s.client.Do(req)
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
    }
    return goquery.NewDocumentFromReader(resp.Body)
}
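A usage sketch for the context-aware version, assuming the same package as above; the two-minute deadline and channel buffer size are arbitrary choices.
func runWithDeadline(s *Scraper) error {
    ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
    defer cancel()

    results := make(chan []string, 10)
    errCh := make(chan error, 1)

    // scrapeWithContext closes the results channel when it returns,
    // so the range loop below always terminates.
    go func() {
        errCh <- s.scrapeWithContext(ctx, results)
    }()

    for data := range results {
        s.processData(data)
    }
    return <-errCh
}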
Handling Complex Pagination Patterns
Some websites use more complex pagination patterns. Here's how to handle offset-based pagination:
type OffsetPaginator struct {
    baseURL string
    client  *http.Client
    limit   int
    offset  int
}

func (op *OffsetPaginator) scrapeAllPages() error {
    for {
        url := fmt.Sprintf("%s?limit=%d&offset=%d", op.baseURL, op.limit, op.offset)

        doc, err := op.fetchPage(url)
        if err != nil {
            return err
        }

        items := op.extractItems(doc)
        if len(items) == 0 {
            break // No more items
        }

        op.processItems(items)
        op.offset += op.limit

        // Check if we've reached the end
        if len(items) < op.limit {
            break
        }
    }
    return nil
}
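The fetchPage, extractItems, and processItems helpers are not defined above. Here is a minimal sketch that mirrors the earlier Scraper methods; the constructor defaults and the .item-selector selector are assumptions to adapt to your site.
func NewOffsetPaginator(baseURL string, limit int) *OffsetPaginator {
    return &OffsetPaginator{
        baseURL: baseURL,
        client:  &http.Client{Timeout: 30 * time.Second},
        limit:   limit,
    }
}

func (op *OffsetPaginator) fetchPage(url string) (*goquery.Document, error) {
    resp, err := op.client.Get(url)
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
    }
    return goquery.NewDocumentFromReader(resp.Body)
}

func (op *OffsetPaginator) extractItems(doc *goquery.Document) []string {
    var items []string
    doc.Find(".item-selector").Each(func(i int, sel *goquery.Selection) {
        items = append(items, sel.Text())
    })
    return items
}

func (op *OffsetPaginator) processItems(items []string) {
    for _, item := range items {
        fmt.Println(item)
    }
}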
Concurrent Pagination Processing
For better performance, implement concurrent page processing:
func (s *Scraper) scrapeConcurrently(maxWorkers int) error {
    urls := make(chan string, 100)
    results := make(chan []string, 100)
    var wg sync.WaitGroup

    // Start workers
    for i := 0; i < maxWorkers; i++ {
        wg.Add(1)
        go func() {
            defer wg.Done()
            for url := range urls {
                doc, err := s.fetchPage(url)
                if err != nil {
                    log.Printf("Error fetching %s: %v", url, err)
                    continue
                }
                data := s.extractData(doc)
                if len(data) > 0 {
                    results <- data
                }
            }
        }()
    }

    // Generate URLs
    go func() {
        defer close(urls)
        for page := 1; page <= s.maxPages; page++ {
            url := fmt.Sprintf("%s?page=%d", s.baseURL, page)
            urls <- url
        }
    }()

    // Close the results channel once all workers have finished
    go func() {
        wg.Wait()
        close(results)
    }()

    // Collect results (note: pages may arrive out of order)
    for data := range results {
        s.processData(data)
    }
    return nil
}
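Note that the fixed time.Sleep inside fetchPage now runs independently in each worker, so the effective request rate is roughly maxWorkers requests per delay interval, and results arrive out of page order. A short usage sketch with assumed values:
func runConcurrent() {
    s := NewScraper("https://example.com/products", 50) // placeholder URL and page count
    s.delay = 500 * time.Millisecond                     // applied per worker
    if err := s.scrapeConcurrently(5); err != nil {
        log.Fatal(err)
    }
}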
Best Practices and Tips
1. Respect Rate Limits
Always implement delays between requests to avoid overwhelming the server:
type RateLimiter struct {
    ticker *time.Ticker
}

func NewRateLimiter(requestsPerSecond float64) *RateLimiter {
    interval := time.Duration(float64(time.Second) / requestsPerSecond)
    return &RateLimiter{
        ticker: time.NewTicker(interval),
    }
}

// Wait blocks until the next tick, capping the overall request rate.
func (rl *RateLimiter) Wait() {
    <-rl.ticker.C
}

// Stop releases the ticker's resources when the limiter is no longer needed.
func (rl *RateLimiter) Stop() {
    rl.ticker.Stop()
}
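To make the cap global across concurrent workers, share one limiter and call Wait before every request. The sketch below assumes a limiter field is added to the Scraper struct; it is not part of the code shown earlier.
// Assumes Scraper gains a `limiter *RateLimiter` field.
func (s *Scraper) fetchPageRateLimited(url string) (*goquery.Document, error) {
    s.limiter.Wait() // blocks until the next tick

    resp, err := s.client.Get(url)
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
    }
    return goquery.NewDocumentFromReader(resp.Body)
}
For example, NewRateLimiter(2) allows roughly two requests per second no matter how many workers are running.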
2. Handle Errors Gracefully
Implement retry logic for failed requests:
func (s *Scraper) fetchWithRetry(url string, maxRetries int) (*goquery.Document, error) {
    var lastErr error

    for attempt := 0; attempt <= maxRetries; attempt++ {
        doc, err := s.fetchPage(url)
        if err == nil {
            return doc, nil
        }
        lastErr = err

        if attempt < maxRetries {
            waitTime := time.Duration(attempt+1) * time.Second
            log.Printf("Retry %d/%d for %s after %v", attempt+1, maxRetries, url, waitTime)
            time.Sleep(waitTime)
        }
    }
    return nil, fmt.Errorf("failed after %d retries: %w", maxRetries, lastErr)
}
3. Monitor Progress
Implement progress tracking for long-running scraping jobs:
type ProgressTracker struct {
    total     int
    current   int
    startTime time.Time
    mu        sync.Mutex
}

func (pt *ProgressTracker) Update() {
    pt.mu.Lock()
    defer pt.mu.Unlock()

    pt.current++
    elapsed := time.Since(pt.startTime)
    rate := float64(pt.current) / elapsed.Seconds()
    remaining := time.Duration(float64(pt.total-pt.current) / rate * float64(time.Second))

    fmt.Printf("Progress: %d/%d (%.1f%%) - Rate: %.1f pages/sec - ETA: %v\n",
        pt.current, pt.total, float64(pt.current)/float64(pt.total)*100, rate, remaining)
}
Using WebScraping.AI for JavaScript-Heavy Sites
For complex pagination scenarios involving JavaScript-rendered content, a plain HTTP client is not enough: the items only exist after scripts run in the browser. This is particularly common with single-page applications and infinite scroll implementations, which require browser automation or a rendering service such as WebScraping.AI to load the content before you parse it.
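One browser-automation option in Go is chromedp (github.com/chromedp/chromedp). Below is a minimal standalone sketch for an infinite-scroll page; the URL, the five scroll rounds, and the fixed two-second waits are assumptions to adapt to the real page.
package main

import (
    "context"
    "log"
    "strings"
    "time"

    "github.com/PuerkitoBio/goquery"
    "github.com/chromedp/chromedp"
)

func main() {
    ctx, cancel := chromedp.NewContext(context.Background())
    defer cancel()
    ctx, cancel = context.WithTimeout(ctx, 2*time.Minute)
    defer cancel()

    var html string
    tasks := chromedp.Tasks{
        chromedp.Navigate("https://example.com/feed"), // hypothetical infinite-scroll page
    }
    // Scroll a few times to trigger lazy loading; five rounds is an arbitrary choice.
    for i := 0; i < 5; i++ {
        tasks = append(tasks,
            chromedp.Evaluate(`window.scrollTo(0, document.body.scrollHeight)`, nil),
            chromedp.Sleep(2*time.Second), // crude wait; prefer waiting for a selector when possible
        )
    }
    tasks = append(tasks, chromedp.OuterHTML("html", &html))

    if err := chromedp.Run(ctx, tasks); err != nil {
        log.Fatal(err)
    }

    // Parse the fully rendered HTML with goquery, exactly as in the earlier examples.
    doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
    if err != nil {
        log.Fatal(err)
    }
    log.Printf("rendered items found: %d", doc.Find(".item-selector").Length())
}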
Testing Your Pagination Logic
Before deploying your scraper, test it thoroughly:
# Test with a small page limit first
go run scraper.go -pages=3 -delay=2s
# Monitor network usage
go run scraper.go -verbose -pages=10
# Test error handling with unreliable network
go run scraper.go -retry=3 -timeout=30s
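None of the flags above are defined by the code in this article; they are placeholders for however you wire up your own CLI. A sketch using the standard flag package (add "flag" to the imports), with the same assumed flag names:
func main() {
    pages := flag.Int("pages", 3, "maximum number of pages to scrape")
    delay := flag.Duration("delay", 1*time.Second, "delay between requests")
    timeout := flag.Duration("timeout", 30*time.Second, "HTTP client timeout")
    retries := flag.Int("retry", 3, "maximum retries per page")
    verbose := flag.Bool("verbose", false, "log extra detail")
    flag.Parse()

    s := NewScraper("https://example.com/products", *pages) // placeholder URL
    s.delay = *delay
    s.client.Timeout = *timeout

    if *verbose {
        log.Printf("scraping up to %d pages with up to %d retries each", *pages, *retries)
    }
    if err := s.scrapeNumberedPagination(); err != nil {
        log.Fatal(err)
    }
}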
Conclusion
Handling pagination in Go web scraping requires a systematic approach that adapts to different pagination patterns. By implementing robust error handling, rate limiting, and concurrent processing, you can build scalable scrapers that efficiently navigate through paginated content. Remember to always respect the website's terms of service and implement appropriate delays to avoid overwhelming the server.
The key is to start with a simple approach and gradually add complexity as needed. Monitor your scraper's performance and adjust the concurrency and delay parameters based on the target website's capabilities and your infrastructure resources.