How to Scrape Data from Websites with Complex Pagination
Pagination is one of the most common challenges in web scraping. Websites implement various pagination patterns to improve user experience and manage large datasets. This comprehensive guide covers techniques for handling different types of pagination using JavaScript tools like Puppeteer, Playwright, and traditional HTTP clients.
Understanding Pagination Types
Before diving into implementation, it's crucial to understand the different pagination patterns you'll encounter:
1. Traditional Numbered Pagination
Classic pagination with numbered page links (1, 2, 3, Next, Previous).
2. Infinite Scroll Pagination
Content loads automatically as users scroll down, common on social media platforms.
3. "Load More" Button Pagination
Users click a button to load additional content dynamically.
4. AJAX-Based Pagination
Pages load content asynchronously without full page refreshes.
5. URL Parameter Pagination
Pagination controlled through URL parameters like ?page=2 or ?offset=20.
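When a site uses URL parameter pagination, the page URLs can be generated up front instead of discovered by clicking through the site. The sketch below is illustrative; the base URL and parameter names are placeholders you would replace with the target site's actual scheme:

```javascript
// Build a list of page URLs for either page-number or offset-based pagination.
function buildPageUrls(baseUrl, { param = 'page', start = 1, pages = 3, step = 1 } = {}) {
  const urls = [];
  for (let i = 0; i < pages; i++) {
    const url = new URL(baseUrl);
    url.searchParams.set(param, String(start + i * step));
    urls.push(url.toString());
  }
  return urls;
}
```

For example, `buildPageUrls('https://example.com/products', { param: 'offset', start: 0, pages: 3, step: 20 })` produces URLs with offsets 0, 20, and 40, ready to be fetched in a loop.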
Handling Traditional Numbered Pagination
For websites with numbered pagination, you can iterate through pages systematically:
Using Puppeteer
const puppeteer = require('puppeteer');

async function scrapeNumberedPagination() {
  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage();

  let currentPage = 1;
  let hasNextPage = true;
  const allData = [];

  while (hasNextPage) {
    console.log(`Scraping page ${currentPage}...`);

    // Navigate to current page
    await page.goto(`https://example.com/products?page=${currentPage}`);

    // Wait for content to load
    await page.waitForSelector('.product-item');

    // Extract data from current page
    const pageData = await page.evaluate(() => {
      return Array.from(document.querySelectorAll('.product-item')).map(item => ({
        title: item.querySelector('.title')?.textContent?.trim(),
        price: item.querySelector('.price')?.textContent?.trim(),
        url: item.querySelector('a')?.href
      }));
    });

    allData.push(...pageData);

    // Check if next page exists
    const nextButton = await page.$('.pagination .next:not(.disabled)');
    if (!nextButton) {
      hasNextPage = false;
    } else {
      currentPage++;
      // Add delay to be respectful (page.waitForTimeout was removed in newer Puppeteer versions)
      await new Promise(resolve => setTimeout(resolve, 2000));
    }
  }

  await browser.close();
  return allData;
}
Using Axios for API-Based Pagination
const axios = require('axios');

async function scrapeApiPagination() {
  let currentPage = 1;
  let hasNextPage = true;
  const allData = [];

  while (hasNextPage) {
    try {
      const response = await axios.get('https://api.example.com/products', {
        params: {
          page: currentPage,
          limit: 20
        },
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
      });

      const { data, pagination } = response.data;
      allData.push(...data);

      // Check if there are more pages
      hasNextPage = currentPage < pagination.totalPages;
      currentPage++;

      // Rate limiting
      await new Promise(resolve => setTimeout(resolve, 1000));
    } catch (error) {
      console.error(`Error on page ${currentPage}:`, error.message);
      break;
    }
  }

  return allData;
}
Handling Infinite Scroll Pagination
Infinite scroll requires simulating user scrolling behavior to trigger content loading:
async function scrapeInfiniteScroll() {
  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage();

  await page.goto('https://example.com/infinite-scroll');

  let previousHeight;
  let currentHeight = await page.evaluate('document.body.scrollHeight');

  while (previousHeight !== currentHeight) {
    previousHeight = currentHeight;

    // Scroll to bottom
    await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');

    // Wait for new content to load
    await page.waitForFunction(
      `document.body.scrollHeight > ${currentHeight}`,
      { timeout: 10000 }
    ).catch(() => {
      console.log('No more content to load');
    });

    currentHeight = await page.evaluate('document.body.scrollHeight');

    // Give lazy-loaded elements time to render
    await new Promise(resolve => setTimeout(resolve, 2000));
  }

  // Extract all data after loading everything
  const allData = await page.evaluate(() => {
    return Array.from(document.querySelectorAll('.item')).map(item => ({
      text: item.textContent.trim(),
      link: item.querySelector('a')?.href
    }));
  });

  await browser.close();
  return allData;
}
Handling "Load More" Button Pagination
Many websites use "Load More" buttons instead of infinite scroll:
async function scrapeLoadMorePagination() {
  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage();

  await page.goto('https://example.com/load-more');
  await page.waitForSelector('.item');

  let loadMoreButton;
  do {
    // Wait for load more button to be available
    loadMoreButton = await page.$('.load-more-btn:not(.disabled)');

    if (loadMoreButton) {
      // Click the load more button
      await loadMoreButton.click();

      // Wait for the loading indicator to disappear
      await page.waitForFunction(() => {
        const loader = document.querySelector('.loading');
        return !loader || loader.style.display === 'none';
      }, { timeout: 10000 });

      // Give new items time to render
      await new Promise(resolve => setTimeout(resolve, 1000));
    }
  } while (loadMoreButton);

  // Extract all loaded data
  const allData = await page.evaluate(() => {
    return Array.from(document.querySelectorAll('.item')).map(item => ({
      title: item.querySelector('.title')?.textContent?.trim(),
      description: item.querySelector('.description')?.textContent?.trim()
    }));
  });

  await browser.close();
  return allData;
}
Handling AJAX Pagination
For AJAX-based pagination, you can intercept network requests to understand the pagination API:
async function scrapeAjaxPagination() {
  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage();

  // Enable request interception
  await page.setRequestInterception(true);
  const apiRequests = [];

  page.on('request', request => {
    if (request.url().includes('/api/') && request.method() === 'GET') {
      apiRequests.push({
        url: request.url(),
        headers: request.headers()
      });
    }
    request.continue();
  });

  await page.goto('https://example.com/ajax-pagination');

  // Click through several pages to understand the pattern
  for (let i = 1; i <= 3; i++) {
    await page.click(`[data-page="${i}"]`);
    await new Promise(resolve => setTimeout(resolve, 2000));
  }

  await browser.close();

  // Now use the discovered API endpoints directly
  return await scrapeUsingDiscoveredApi(apiRequests);
}
async function scrapeUsingDiscoveredApi(apiRequests) {
  const axios = require('axios');
  const allData = [];

  // Nothing was intercepted, so there is no pattern to replay
  if (apiRequests.length === 0) return allData;

  // Extract pagination pattern from intercepted requests
  const baseUrl = apiRequests[0].url.split('?')[0];
  const sampleHeaders = apiRequests[0].headers;

  let page = 1;
  let hasMore = true;

  while (hasMore) {
    try {
      const response = await axios.get(baseUrl, {
        params: { page, limit: 20 },
        headers: sampleHeaders
      });

      const data = response.data;
      if (data.items && data.items.length > 0) {
        allData.push(...data.items);
        page++;
      } else {
        hasMore = false;
      }

      await new Promise(resolve => setTimeout(resolve, 1000));
    } catch (error) {
      console.error('API request failed:', error.message);
      break;
    }
  }

  return allData;
}
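The hard-coded `page` parameter above assumes the API's naming. To generalize the pattern discovery, a small helper (hypothetical, names are illustrative) can inspect the intercepted URLs and guess which query parameter carries the page number, by finding the numeric parameter that changes on every request:

```javascript
// Guess which query parameter is the page index by checking which
// numeric parameter takes a different value on every intercepted request.
function findPageParam(urls) {
  const parsed = urls.map(u => new URL(u));
  const keys = new Set(parsed.flatMap(u => [...u.searchParams.keys()]));
  for (const key of keys) {
    const values = parsed.map(u => u.searchParams.get(key));
    const unique = new Set(values);
    if (unique.size === urls.length && values.every(v => /^\d+$/.test(v))) {
      return key;
    }
  }
  return null; // No obvious page parameter found
}
```

Feeding it the `url` fields collected by `scrapeAjaxPagination` would, for example, distinguish a changing `page=1,2,3` from a constant `limit=20`.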
Advanced Pagination Strategies
Handling Dynamic Pagination with State Management
Some websites use complex state management for pagination. Here's how to handle them:
async function scrapeStatefulPagination() {
  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage();

  await page.goto('https://example.com/complex-pagination');

  // Wait for initial load
  await page.waitForSelector('[data-testid="pagination-container"]');

  const allData = [];
  let pageCount = 0;

  while (true) {
    // Wait for content to stabilize
    await page.waitForFunction(() => {
      const loader = document.querySelector('.loading-spinner');
      return !loader || getComputedStyle(loader).display === 'none';
    });

    // Extract current page data
    const pageData = await page.evaluate(() => {
      return Array.from(document.querySelectorAll('.data-row')).map(row => ({
        id: row.dataset.id,
        content: row.textContent.trim()
      }));
    });

    if (pageData.length === 0) break;

    allData.push(...pageData);
    pageCount++;

    // Try to navigate to next page
    const nextButton = await page.$('[data-testid="next-page"]:not([disabled])');
    if (!nextButton) break;

    await nextButton.click();

    // Wait for URL or state change
    await page.waitForFunction(
      (expectedPage) => {
        return window.location.search.includes(`page=${expectedPage}`) ||
          document.querySelector('[data-current-page]')?.dataset.currentPage === expectedPage.toString();
      },
      {},
      pageCount + 1
    );
  }

  await browser.close();
  return allData;
}
Error Handling and Retry Logic
Robust pagination scraping requires comprehensive error handling:
async function scrapeWithErrorHandling() {
  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage();

  const maxRetries = 3;
  const allData = [];
  let currentPage = 1;
  let done = false;

  while (!done) {
    let success = false;
    let retries = 0;

    while (!success && retries < maxRetries) {
      try {
        await page.goto(`https://example.com/data?page=${currentPage}`, {
          waitUntil: 'networkidle2',
          timeout: 30000
        });

        // Check if page has data
        await page.waitForSelector('.data-container', { timeout: 10000 });

        const pageData = await page.evaluate(() => {
          const items = document.querySelectorAll('.data-item');
          if (items.length === 0) return null; // No more data
          return Array.from(items).map(item => ({
            title: item.querySelector('.title')?.textContent?.trim(),
            url: item.querySelector('a')?.href
          }));
        });

        if (!pageData) {
          // Reached the end of the data; this is not a failure
          done = true;
          break;
        }

        allData.push(...pageData);
        success = true;
        currentPage++;
      } catch (error) {
        retries++;
        console.log(`Retry ${retries}/${maxRetries} for page ${currentPage}:`, error.message);
        if (retries < maxRetries) {
          // Exponential backoff: 2s, 4s, 8s...
          await new Promise(resolve => setTimeout(resolve, Math.pow(2, retries) * 1000));
        }
      }
    }

    if (done) break;

    if (!success) {
      console.error(`Failed to scrape page ${currentPage} after ${maxRetries} retries`);
      break;
    }

    // Respectful delay between pages
    await new Promise(resolve => setTimeout(resolve, 2000));
  }

  await browser.close();
  return allData;
}
Integration with WebScraping.AI
For complex pagination scenarios, you can leverage WebScraping.AI's capabilities to handle JavaScript-heavy pagination:
const axios = require('axios');
const cheerio = require('cheerio');

async function scrapeWithWebScrapingAI(url, pageParam = 'page') {
  const allData = [];
  let currentPage = 1;

  while (true) {
    const response = await axios.get('https://api.webscraping.ai/html', {
      params: {
        url: `${url}?${pageParam}=${currentPage}`,
        js: 'true',
        js_timeout: 5000
      },
      headers: {
        'Api-Key': 'your-api-key'
      }
    });

    // Parse the HTML response
    const $ = cheerio.load(response.data);
    const items = $('.data-item').map((i, el) => ({
      title: $(el).find('.title').text().trim(),
      content: $(el).find('.content').text().trim()
    })).get();

    if (items.length === 0) break;
    allData.push(...items);
    currentPage++;

    // Rate limiting
    await new Promise(resolve => setTimeout(resolve, 1000));
  }

  return allData;
}
Best Practices and Performance Tips
1. Implement Rate Limiting
Always add delays between requests to avoid overwhelming the server:
const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
await delay(1000); // Wait 1 second between requests
2. Use Connection Pooling
For API-based pagination, reuse HTTP connections:
const axios = require('axios');
const http = require('http');
const https = require('https');

const axiosInstance = axios.create({
  timeout: 10000,
  // keepAlive agents reuse TCP connections across requests
  httpAgent: new http.Agent({ keepAlive: true }),
  httpsAgent: new https.Agent({ keepAlive: true }),
  headers: {
    'User-Agent': 'Mozilla/5.0 (compatible; DataScraper/1.0)'
  }
});
3. Monitor Memory Usage
For large datasets, process data in chunks to avoid memory issues:
async function processInChunks(allData, chunkSize = 1000) {
  while (allData.length > 0) {
    // splice removes the chunk from the array so processed items can be garbage-collected
    const chunk = allData.splice(0, chunkSize);
    await processChunk(chunk);
  }
}
4. Handle Network Failures Gracefully
Implement exponential backoff for network errors:
async function fetchWithRetry(url, options, maxRetries = 3) {
  for (let i = 0; i < maxRetries; i++) {
    try {
      return await fetch(url, options);
    } catch (error) {
      if (i === maxRetries - 1) throw error;
      await delay(Math.pow(2, i) * 1000); // Exponential backoff: 1s, 2s, 4s (uses the delay helper above)
    }
  }
}
Conclusion
Scraping websites with complex pagination requires understanding the specific pagination implementation and choosing the right approach. Whether dealing with traditional navigation patterns, infinite scroll, or AJAX-based loading, the key is to identify the pagination mechanism and implement appropriate waiting strategies.
For dynamic content that loads asynchronously, mastering AJAX request handling becomes crucial for successful data extraction. Remember to always implement proper error handling, respect rate limits, and test your scraping logic thoroughly across different pagination scenarios.
The examples provided in this guide should cover most pagination patterns you'll encounter. Adapt these techniques based on your specific use case and always monitor your scraping performance to ensure efficient and respectful data collection.