How Do I Handle Pagination in Go Web Scraping?

Pagination is one of the most common challenges in web scraping. Websites split content across multiple pages to improve performance and user experience, but this creates complexity for scrapers. In Go, there are several effective strategies to handle different types of pagination patterns.

Understanding Pagination Types

Before diving into implementation, it's important to understand the main pagination patterns you'll encounter:

  1. Numbered pagination - Traditional page numbers (1, 2, 3...)
  2. Next/Previous buttons - Links to navigate between pages
  3. Infinite scroll - Dynamic loading of content as you scroll (see the JSON-endpoint sketch after this list)
  4. Load more buttons - Click-based content loading
  5. Offset-based pagination - URL parameters like ?page=2&limit=20
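
Infinite scroll and load-more patterns usually fetch their data from a JSON endpoint behind the scenes; if you can find that endpoint in your browser's network tab, you can often page through it directly with plain HTTP requests instead of automating a browser. Below is a minimal sketch, assuming a hypothetical /api/items endpoint and a response shaped like {"items": [...], "has_more": true} — the real URL and field names will vary by site.

import (
    "encoding/json"
    "fmt"
    "net/http"
)

// item and apiPage model a hypothetical JSON response backing an
// infinite-scroll or load-more page; real field names will differ.
type item struct {
    Title string `json:"title"`
}

type apiPage struct {
    Items   []item `json:"items"`
    HasMore bool   `json:"has_more"`
}

func scrapeInfiniteScroll(baseURL string) error {
    for page := 1; ; page++ {
        // Hypothetical endpoint and parameter names; find the real ones in
        // the browser's network tab while scrolling the page.
        endpoint := fmt.Sprintf("%s/api/items?page=%d", baseURL, page)

        resp, err := http.Get(endpoint)
        if err != nil {
            return err
        }

        var data apiPage
        err = json.NewDecoder(resp.Body).Decode(&data)
        resp.Body.Close()
        if err != nil {
            return err
        }

        for _, it := range data.Items {
            fmt.Println(it.Title)
        }

        if !data.HasMore {
            return nil // the API reports no further pages
        }
    }
}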

Basic Pagination Setup in Go

Let's start with a foundational structure for handling pagination:

package main

import (
    "fmt"
    "log"
    "net/http"
    "net/url"
    "time"

    "github.com/PuerkitoBio/goquery"
)

type Scraper struct {
    client      *http.Client
    baseURL     string
    currentPage int
    maxPages    int
    delay       time.Duration
}

func NewScraper(baseURL string, maxPages int) *Scraper {
    return &Scraper{
        client: &http.Client{
            Timeout: 30 * time.Second,
        },
        baseURL:     baseURL,
        currentPage: 1,
        maxPages:    maxPages,
        delay:       1 * time.Second,
    }
}

func (s *Scraper) fetchPage(url string) (*goquery.Document, error) {
    // Add delay to respect rate limits
    time.Sleep(s.delay)

    resp, err := s.client.Get(url)
    if err != nil {
        return nil, fmt.Errorf("failed to fetch page: %w", err)
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
    }

    doc, err := goquery.NewDocumentFromReader(resp.Body)
    if err != nil {
        return nil, fmt.Errorf("failed to parse HTML: %w", err)
    }

    return doc, nil
}

Handling Numbered Pagination

This is the most straightforward pagination pattern where pages are accessed via URL parameters:

func (s *Scraper) scrapeNumberedPagination() error {
    for page := 1; page <= s.maxPages; page++ {
        url := fmt.Sprintf("%s?page=%d", s.baseURL, page)

        doc, err := s.fetchPage(url)
        if err != nil {
            log.Printf("Error fetching page %d: %v", page, err)
            continue
        }

        // Extract data from the current page
        data := s.extractData(doc)
        if len(data) == 0 {
            log.Printf("No data found on page %d, stopping", page)
            break
        }

        // Process the extracted data
        s.processData(data)

        log.Printf("Successfully scraped page %d", page)
    }

    return nil
}

func (s *Scraper) extractData(doc *goquery.Document) []string {
    var items []string

    doc.Find(".item-selector").Each(func(i int, sel *goquery.Selection) {
        text := sel.Text()
        if text != "" {
            items = append(items, text)
        }
    })

    return items
}

func (s *Scraper) processData(data []string) {
    for _, item := range data {
        fmt.Printf("Found item: %s\n", item)
    }
}
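
Putting these pieces together, a minimal main function might look like the sketch below. It assumes the Scraper code above lives in the same package and uses a placeholder URL; replace the URL and the .item-selector placeholder in extractData with real values before running.

func main() {
    // Placeholder listing URL and page limit for a first test run.
    scraper := NewScraper("https://example.com/products", 5)

    if err := scraper.scrapeNumberedPagination(); err != nil {
        log.Fatalf("scraping failed: %v", err)
    }
}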

Dynamic Next Page Detection

For more robust pagination handling, implement dynamic next page detection:

func (s *Scraper) scrapeWithNextPageDetection() error {
    currentURL := s.baseURL
    pageCount := 0

    for pageCount < s.maxPages {
        doc, err := s.fetchPage(currentURL)
        if err != nil {
            return fmt.Errorf("failed to fetch page: %w", err)
        }

        // Extract data from current page
        data := s.extractData(doc)
        if len(data) == 0 {
            log.Println("No more data found, stopping pagination")
            break
        }

        s.processData(data)
        pageCount++

        // Find next page URL
        nextURL, exists := s.findNextPageURL(doc)
        if !exists {
            log.Println("No next page found, pagination complete")
            break
        }

        currentURL = nextURL
        log.Printf("Moving to next page: %s", currentURL)
    }

    return nil
}

func (s *Scraper) findNextPageURL(doc *goquery.Document) (string, bool) {
    // Look for common next page selectors
    selectors := []string{
        "a[rel='next']",
        ".next-page",
        ".pagination .next",
        "a:contains('Next')",
        "a:contains('→')",
    }

    for _, selector := range selectors {
        nextLink := doc.Find(selector).First()
        if nextLink.Length() > 0 {
            href, exists := nextLink.Attr("href")
            if exists {
                return s.resolveURL(href), true
            }
        }
    }

    return "", false
}

func (s *Scraper) resolveURL(href string) string {
    // Resolve relative links (e.g. "/page/2" or "?page=2") against the base URL
    base, err := url.Parse(s.baseURL)
    if err != nil {
        return href
    }
    ref, err := url.Parse(href)
    if err != nil {
        return href
    }
    return base.ResolveReference(ref).String()
}

Advanced Pagination with Context and Cancellation

For production applications, implement proper context handling and cancellation:

import (
    "context"
    "sync"
)

func (s *Scraper) scrapeWithContext(ctx context.Context, results chan<- []string) error {
    defer close(results)

    currentURL := s.baseURL
    pageCount := 0

    for pageCount < s.maxPages {
        select {
        case <-ctx.Done():
            return ctx.Err()
        default:
        }

        doc, err := s.fetchPageWithContext(ctx, currentURL)
        if err != nil {
            return err
        }

        data := s.extractData(doc)
        if len(data) == 0 {
            break
        }

        // Send data to channel
        select {
        case results <- data:
        case <-ctx.Done():
            return ctx.Err()
        }

        nextURL, exists := s.findNextPageURL(doc)
        if !exists {
            break
        }

        currentURL = nextURL
        pageCount++
    }

    return nil
}

func (s *Scraper) fetchPageWithContext(ctx context.Context, url string) (*goquery.Document, error) {
    req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
    if err != nil {
        return nil, err
    }

    time.Sleep(s.delay)

    resp, err := s.client.Do(req)
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
    }

    return goquery.NewDocumentFromReader(resp.Body)
}
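
A typical way to drive this version is with a deadline and a consumer goroutine reading from the results channel. The sketch below assumes the Scraper defined earlier and an arbitrary five-minute limit:

func runWithTimeout(s *Scraper) error {
    // Stop the whole crawl after 5 minutes, even if pages remain.
    ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
    defer cancel()

    results := make(chan []string)

    // Consume pages as they arrive so the scraper never blocks on send.
    done := make(chan struct{})
    go func() {
        defer close(done)
        for data := range results {
            s.processData(data)
        }
    }()

    err := s.scrapeWithContext(ctx, results)
    <-done // wait for the consumer to drain the channel
    return err
}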

Handling Complex Pagination Patterns

Some websites use more complex pagination patterns. Here's how to handle offset-based pagination (the fetchPage, extractItems, and processItems helpers mirror the Scraper methods shown earlier):

type OffsetPaginator struct {
    baseURL string
    client  *http.Client
    limit   int
    offset  int
}

func (op *OffsetPaginator) scrapeAllPages() error {
    for {
        url := fmt.Sprintf("%s?limit=%d&offset=%d", op.baseURL, op.limit, op.offset)

        doc, err := op.fetchPage(url)
        if err != nil {
            return err
        }

        items := op.extractItems(doc)
        if len(items) == 0 {
            break // No more items
        }

        op.processItems(items)
        op.offset += op.limit

        // Check if we've reached the end
        if len(items) < op.limit {
            break
        }
    }

    return nil
}

Concurrent Pagination Processing

For better performance, implement concurrent page processing:

func (s *Scraper) scrapeConcurrently(maxWorkers int) error {
    urls := make(chan string, 100)
    results := make(chan []string, 100)
    var wg sync.WaitGroup

    // Start workers
    for i := 0; i < maxWorkers; i++ {
        wg.Add(1)
        go func() {
            defer wg.Done()
            for url := range urls {
                doc, err := s.fetchPage(url)
                if err != nil {
                    log.Printf("Error fetching %s: %v", url, err)
                    continue
                }

                data := s.extractData(doc)
                if len(data) > 0 {
                    results <- data
                }
            }
        }()
    }

    // Generate URLs
    go func() {
        defer close(urls)
        for page := 1; page <= s.maxPages; page++ {
            url := fmt.Sprintf("%s?page=%d", s.baseURL, page)
            urls <- url
        }
    }()

    // Collect results
    go func() {
        wg.Wait()
        close(results)
    }()

    for data := range results {
        s.processData(data)
    }

    return nil
}
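
One thing to keep in mind with this pattern: each worker sleeps independently inside fetchPage, so the aggregate request rate is roughly maxWorkers pages per delay interval. If the target site is sensitive, lower maxWorkers or share a single rate limiter (such as the ticker-based one shown in the next section) across all workers so the total rate stays bounded.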

Best Practices and Tips

1. Respect Rate Limits

Always implement delays between requests to avoid overwhelming the server:

type RateLimiter struct {
    ticker *time.Ticker
}

func NewRateLimiter(requestsPerSecond float64) *RateLimiter {
    interval := time.Duration(float64(time.Second) / requestsPerSecond)
    return &RateLimiter{
        ticker: time.NewTicker(interval),
    }
}

func (rl *RateLimiter) Wait() {
    <-rl.ticker.C
}
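
To use this limiter with the Scraper from earlier sections, call Wait() before each request instead of relying on the fixed sleep; a minimal sketch (set s.delay to 0 so the two mechanisms don't stack):

// fetchPageRateLimited blocks on the limiter's ticker instead of the
// Scraper's fixed delay.
func (s *Scraper) fetchPageRateLimited(rl *RateLimiter, url string) (*goquery.Document, error) {
    rl.Wait()
    return s.fetchPage(url)
}

Since time.NewTicker keeps running until stopped, it is also worth adding a Stop method that calls rl.ticker.Stop() once the scrape finishes.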

2. Handle Errors Gracefully

Implement retry logic for failed requests:

func (s *Scraper) fetchWithRetry(url string, maxRetries int) (*goquery.Document, error) {
    var lastErr error

    for attempt := 0; attempt <= maxRetries; attempt++ {
        doc, err := s.fetchPage(url)
        if err == nil {
            return doc, nil
        }

        lastErr = err
        if attempt < maxRetries {
            waitTime := time.Duration(attempt+1) * time.Second
            log.Printf("Retry %d/%d for %s after %v", attempt+1, maxRetries, url, waitTime)
            time.Sleep(waitTime)
        }
    }

    return nil, fmt.Errorf("failed after %d retries: %w", maxRetries, lastErr)
}

3. Monitor Progress

Implement progress tracking for long-running scraping jobs:

type ProgressTracker struct {
    total     int
    current   int
    startTime time.Time
    mu        sync.Mutex
}

func (pt *ProgressTracker) Update() {
    pt.mu.Lock()
    defer pt.mu.Unlock()

    pt.current++
    elapsed := time.Since(pt.startTime)
    rate := float64(pt.current) / elapsed.Seconds()
    remaining := time.Duration(float64(pt.total-pt.current) / rate * float64(time.Second))

    fmt.Printf("Progress: %d/%d (%.1f%%) - Rate: %.1f pages/sec - ETA: %v\n",
        pt.current, pt.total, float64(pt.current)/float64(pt.total)*100, rate, remaining)
}
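
Update assumes total and startTime are already set, so construct the tracker before fetching the first page; a small constructor, assuming the struct above:

func NewProgressTracker(total int) *ProgressTracker {
    return &ProgressTracker{
        total:     total,
        startTime: time.Now(),
    }
}

Create it with NewProgressTracker(s.maxPages) and call tracker.Update() after each successfully scraped page.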

Using WebScraping.AI for JavaScript-Heavy Sites

For complex pagination scenarios involving JavaScript-rendered content, such as infinite scroll or single-page applications, a plain HTTP client never sees the items at all because they are loaded after the initial page load. In those cases you need browser automation or a rendering API such as WebScraping.AI, which returns the fully rendered HTML that your existing parsing code can consume.
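
As a sketch of that approach, the function below fetches rendered HTML through the API and then applies the same goquery parsing used throughout this article. It assumes the api.webscraping.ai /html endpoint with url and api_key query parameters (check the API documentation for the exact parameter names and rendering defaults), and it reuses the fmt, net/http, net/url, and goquery imports from the setup section.

func fetchRenderedPage(targetURL, apiKey string) (*goquery.Document, error) {
    // Assumed endpoint shape; verify parameter names against the API docs.
    apiURL := fmt.Sprintf(
        "https://api.webscraping.ai/html?url=%s&api_key=%s",
        url.QueryEscape(targetURL), apiKey,
    )

    resp, err := http.Get(apiURL)
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
    }

    // The response body is the JavaScript-rendered HTML, so the pagination
    // helpers above (findNextPageURL, extractData) work on it unchanged.
    return goquery.NewDocumentFromReader(resp.Body)
}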

Testing Your Pagination Logic

Before deploying your scraper, test it thoroughly:

# Test with a small page limit first
go run scraper.go -pages=3 -delay=2s

# Monitor network usage
go run scraper.go -verbose -pages=10

# Test error handling with unreliable network
go run scraper.go -retry=3 -timeout=30s
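
The commands above assume your scraper exposes command-line flags. A minimal sketch of wiring the -pages and -delay flags with the standard flag package is shown below; the flag names are illustrative, and it expands the earlier main function (add "flag" to the import list).

func main() {
    // Illustrative flags matching the test commands above.
    pages := flag.Int("pages", 3, "maximum number of pages to scrape")
    delay := flag.Duration("delay", 1*time.Second, "delay between requests")
    flag.Parse()

    scraper := NewScraper("https://example.com/products", *pages)
    scraper.delay = *delay

    if err := scraper.scrapeNumberedPagination(); err != nil {
        log.Fatal(err)
    }
}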

Conclusion

Handling pagination in Go web scraping requires a systematic approach that adapts to different pagination patterns. By implementing robust error handling, rate limiting, and concurrent processing, you can build scalable scrapers that efficiently navigate through paginated content. Remember to always respect the website's terms of service and implement appropriate delays to avoid overwhelming the server.

The key is to start with a simple approach and gradually add complexity as needed. Monitor your scraper's performance and adjust the concurrency and delay parameters based on the target website's capabilities and your infrastructure resources.

Try WebScraping.AI for Your Web Scraping Needs

Looking for a powerful web scraping solution? WebScraping.AI provides an LLM-powered API that combines Chromium JavaScript rendering with rotating proxies for reliable data extraction.

Key Features:

  • AI-powered extraction: Ask questions about web pages or extract structured data fields
  • JavaScript rendering: Full Chromium browser support for dynamic content
  • Rotating proxies: Datacenter and residential proxies from multiple countries
  • Easy integration: Simple REST API with SDKs for Python, Ruby, PHP, and more
  • Reliable & scalable: Built for developers who need consistent results

Getting Started:

Get page content with AI analysis:

curl "https://api.webscraping.ai/ai/question?url=https://example.com&question=What is the main topic?&api_key=YOUR_API_KEY"

Extract structured data:

curl "https://api.webscraping.ai/ai/fields?url=https://example.com&fields[title]=Page title&fields[price]=Product price&api_key=YOUR_API_KEY"
