How to Scrape Data from Paginated Websites Using Python
Pagination is one of the most common challenges when scraping websites that display large datasets across multiple pages. Whether you're dealing with e-commerce product listings, search results, or blog archives, understanding how to navigate and extract data from paginated content is essential for comprehensive web scraping projects.
Understanding Pagination Types
Before diving into implementation, it's important to understand the different types of pagination you'll encounter:
1. URL-Based Pagination
The most common type, in which the page number is reflected in the URL structure:
- https://example.com/products?page=1
- https://example.com/blog/page/2/
- https://example.com/search?q=python&offset=20
2. JavaScript-Based Pagination
Pages that load content dynamically using AJAX requests or infinite scroll mechanisms.
3. Form-Based Pagination
Pagination controlled through form submissions with hidden fields or POST requests.
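Of the three, form-based pagination is the only one without a dedicated method below, so here is a minimal sketch of how it might look with requests. The endpoint, field names, and selector are placeholders you would replace after inspecting the real form in your browser's developer tools.
import requests
from bs4 import BeautifulSoup
def scrape_form_pagination(form_url, max_pages=5):
    """
    Sketch: paginate by re-submitting a form with a page field via POST
    """
    session = requests.Session()
    all_rows = []
    for page in range(1, max_pages + 1):
        # 'page' and 'per_page' are placeholder field names; real forms often
        # carry hidden inputs that must be echoed back as well
        payload = {'page': page, 'per_page': 50}
        response = session.post(form_url, data=payload, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        rows = soup.select('.result-row')  # placeholder selector
        if not rows:
            break  # no results returned, assume we ran past the last page
        all_rows.extend(row.get_text(strip=True) for row in rows)
    return all_rows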
Method 1: Scraping URL-Based Pagination with Requests and BeautifulSoup
This is the most straightforward approach for websites with predictable URL patterns.
Basic Implementation
import requests
from bs4 import BeautifulSoup
import time
import csv
def scrape_paginated_site(base_url, max_pages=None):
"""
Scrape data from a paginated website with URL-based pagination
"""
all_data = []
page = 1
session = requests.Session()
# Set headers to avoid blocking
session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
})
while True:
# Construct URL for current page
url = f"{base_url}?page={page}"
try:
response = session.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Extract data from current page
items = extract_page_data(soup)
# Break if no items found (end of pagination)
if not items:
print(f"No more items found on page {page}")
break
all_data.extend(items)
print(f"Scraped page {page}: {len(items)} items")
# Check if we've reached max pages
if max_pages and page >= max_pages:
break
# Check for "Next" button or last page indicator
if not has_next_page(soup):
print("Reached last page")
break
page += 1
# Rate limiting
time.sleep(1)
except requests.RequestException as e:
print(f"Error scraping page {page}: {e}")
break
return all_data
def extract_page_data(soup):
"""
Extract data from a single page
"""
items = []
# Adjust selector based on target website structure
for item in soup.select('.product-item'): # Example selector
        # select_one returns None when an element is missing, so guard each lookup
        title_el = item.select_one('.title')
        price_el = item.select_one('.price')
        link_el = item.select_one('a')
        data = {
            'title': title_el.get_text(strip=True) if title_el else None,
            'price': price_el.get_text(strip=True) if price_el else None,
            'url': link_el.get('href') if link_el else None
        }
# Only add if essential data exists
if data['title'] and data['price']:
items.append(data)
return items
def has_next_page(soup):
"""
Check if there's a next page available
"""
# Method 1: Check for "Next" button
next_button = soup.select_one('a.next, .pagination .next')
if next_button and 'disabled' not in next_button.get('class', []):
return True
    # Fallback: no usable "Next" link was found, so assume this is the last page
    return False
# Usage example
if __name__ == "__main__":
base_url = "https://example-store.com/products"
data = scrape_paginated_site(base_url, max_pages=10)
# Save to CSV
if data:
with open('scraped_data.csv', 'w', newline='', encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=data[0].keys())
writer.writeheader()
writer.writerows(data)
print(f"Saved {len(data)} items to scraped_data.csv")
Advanced URL Pattern Detection
For more complex pagination patterns, you can implement automatic URL pattern detection:
import re
from urllib.parse import urljoin, urlparse, parse_qs
def detect_pagination_pattern(soup, current_url):
"""
Automatically detect pagination pattern from current page
"""
patterns = []
# Look for pagination links
pagination_links = soup.select('a[href*="page"], a[href*="p="], .pagination a')
for link in pagination_links:
href = link.get('href')
if href:
full_url = urljoin(current_url, href)
# Extract potential page parameters
parsed = urlparse(full_url)
params = parse_qs(parsed.query)
for param, values in params.items():
if any(keyword in param.lower() for keyword in ['page', 'offset', 'start']):
try:
page_num = int(values[0])
patterns.append({
'param': param,
'value': page_num,
'url_template': full_url.replace(f"{param}={page_num}", f"{param}={{page}}")
})
except ValueError:
continue
return patterns
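One way to use the detected patterns is to fetch the first page, take a detected url_template, and substitute page numbers into it. A hedged sketch (the URL is a placeholder, and naively taking the first detected pattern is a simplifying assumption):
import requests
from bs4 import BeautifulSoup
start_url = "https://example.com/products?page=1"  # placeholder URL
response = requests.get(start_url, timeout=10)
soup = BeautifulSoup(response.content, 'html.parser')
patterns = detect_pagination_pattern(soup, start_url)
if patterns:
    template = patterns[0]['url_template']  # take the first match for simplicity
    for page in range(2, 6):
        print("Would fetch:", template.format(page=page))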
Method 2: Handling JavaScript-Based Pagination with Selenium
For websites that use JavaScript to load content dynamically, Selenium WebDriver is the preferred approach.
Basic Selenium Implementation
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import time
def scrape_js_pagination(base_url, max_pages=None):
"""
Scrape JavaScript-based paginated content using Selenium
"""
# Configure Chrome options
chrome_options = Options()
chrome_options.add_argument('--headless') # Run in background
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
driver = webdriver.Chrome(options=chrome_options)
wait = WebDriverWait(driver, 10)
all_data = []
page = 1
try:
driver.get(base_url)
while True:
# Wait for content to load
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "product-item")))
# Extract data from current page
items = extract_js_page_data(driver)
if not items:
print(f"No items found on page {page}")
break
all_data.extend(items)
print(f"Scraped page {page}: {len(items)} items")
if max_pages and page >= max_pages:
break
# Try to navigate to next page
if not click_next_page(driver, wait):
print("No more pages available")
break
page += 1
except Exception as e:
print(f"Error during scraping: {e}")
finally:
driver.quit()
return all_data
def extract_js_page_data(driver):
"""
Extract data from current page using Selenium
"""
items = []
# Find all product elements
product_elements = driver.find_elements(By.CLASS_NAME, "product-item")
for element in product_elements:
try:
data = {
'title': element.find_element(By.CLASS_NAME, "title").text,
'price': element.find_element(By.CLASS_NAME, "price").text,
'url': element.find_element(By.TAG_NAME, "a").get_attribute('href')
}
items.append(data)
except NoSuchElementException:
continue # Skip items with missing data
return items
def click_next_page(driver, wait):
"""
Click the next page button and wait for new content
"""
try:
# Method 1: Look for next button
next_button = driver.find_element(By.CSS_SELECTOR, '.next, .pagination-next, [aria-label="Next"]')
        # get_attribute('class') can return None, so normalize before checking
        button_classes = next_button.get_attribute('class') or ''
        if 'disabled' in button_classes:
            return False
# Scroll to button if needed
driver.execute_script("arguments[0].scrollIntoView();", next_button)
        # Grab a reference to the current content before clicking so we can
        # detect when the old page has been replaced
        first_item = driver.find_element(By.CLASS_NAME, "product-item")
        next_button.click()
        # Wait for the old content to go stale (adjust to the target site)
        wait.until(EC.staleness_of(first_item))
return True
except (NoSuchElementException, TimeoutException):
return False
Handling Infinite Scroll
For infinite scroll pagination, you need a different approach:
def scrape_infinite_scroll(base_url, scroll_pause=2, max_scrolls=10):
"""
Scrape content with infinite scroll pagination
"""
chrome_options = Options()
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(options=chrome_options)
try:
driver.get(base_url)
last_height = driver.execute_script("return document.body.scrollHeight")
scroll_count = 0
while scroll_count < max_scrolls:
# Scroll to bottom
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# Wait for new content to load
time.sleep(scroll_pause)
# Check if new content loaded
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
print("No more content to load")
break
last_height = new_height
scroll_count += 1
print(f"Scroll {scroll_count}: Page height is now {new_height}")
# Extract all loaded data
return extract_js_page_data(driver)
finally:
driver.quit()
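Usage mirrors the other Selenium helpers. For example (the URL is a placeholder, and the dedupe step is just a precaution in case the feed repeats entries between loads):
if __name__ == "__main__":
    items = scrape_infinite_scroll("https://example.com/feed", scroll_pause=2, max_scrolls=15)
    # Deduplicate by URL in case the feed repeated entries between loads
    unique_items = {item['url']: item for item in items if item.get('url')}
    print(f"Collected {len(unique_items)} unique items after scrolling")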
Method 3: API-Based Pagination
Many modern websites load content through API calls. You can intercept these calls for more efficient scraping:
import requests
import json
import time
def scrape_api_pagination(api_endpoint, params=None, max_pages=None):
"""
Scrape paginated API endpoints
"""
if params is None:
params = {}
all_data = []
page = 1
session = requests.Session()
session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
'Accept': 'application/json',
})
while True:
# Update page parameter
current_params = params.copy()
current_params['page'] = page
try:
response = session.get(api_endpoint, params=current_params, timeout=10)
response.raise_for_status()
data = response.json()
# Extract items (adjust based on API structure)
items = data.get('items', []) or data.get('results', []) or data.get('data', [])
if not items:
print(f"No items found on page {page}")
break
all_data.extend(items)
print(f"API page {page}: {len(items)} items")
# Check pagination info
pagination = data.get('pagination', {})
if not pagination.get('has_next', True) or page >= pagination.get('total_pages', float('inf')):
break
if max_pages and page >= max_pages:
break
page += 1
time.sleep(0.5) # Rate limiting
except requests.RequestException as e:
print(f"API error on page {page}: {e}")
break
return all_data
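To find such an endpoint in the first place, watch the XHR/Fetch requests in your browser's developer tools while clicking through a few pages, then replay the same request with this function. A hypothetical usage (the endpoint and query parameters are placeholders, not a real API):
if __name__ == "__main__":
    results = scrape_api_pagination(
        "https://example.com/api/products",              # placeholder endpoint
        params={'category': 'laptops', 'per_page': 50},  # placeholder parameters
        max_pages=20
    )
    if results:
        with open('api_results.json', 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        print(f"Saved {len(results)} records to api_results.json")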
Best Practices and Error Handling
Robust Error Handling
def robust_scraper(base_url, max_retries=3):
"""
Scraper with comprehensive error handling
"""
for attempt in range(max_retries):
try:
return scrape_paginated_site(base_url)
except requests.ConnectionError:
print(f"Connection error, attempt {attempt + 1}/{max_retries}")
time.sleep(2 ** attempt) # Exponential backoff
except requests.Timeout:
print(f"Timeout error, attempt {attempt + 1}/{max_retries}")
time.sleep(1)
except Exception as e:
print(f"Unexpected error: {e}")
break
return []
Rate Limiting and Respectful Scraping
import random
from time import sleep
def respectful_delay(min_delay=1, max_delay=3):
"""
Add random delay between requests
"""
delay = random.uniform(min_delay, max_delay)
sleep(delay)
# Use inside your scraping loop, for example:
for page in range(1, max_pages + 1):  # max_pages defined by your scraper
    # ... scraping logic ...
    respectful_delay(1, 2)  # random delay of 1-2 seconds between requests
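Respectful scraping also means checking robots.txt before you start. The standard library's urllib.robotparser can tell you whether a given URL is allowed for your user agent; a small sketch:
from urllib.robotparser import RobotFileParser
from urllib.parse import urljoin
def is_allowed(url, user_agent='*'):
    """
    Check the site's robots.txt before requesting a URL
    """
    parser = RobotFileParser()
    parser.set_url(urljoin(url, '/robots.txt'))
    try:
        parser.read()
    except OSError:
        return True  # robots.txt unreachable; decide your own policy here
    return parser.can_fetch(user_agent, url)
# Skip pages the site asks crawlers not to fetch
if not is_allowed("https://example.com/products?page=1"):
    print("Disallowed by robots.txt; skipping")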
Advanced Techniques
Concurrent Scraping
For better performance when scraping multiple independent pages:
import concurrent.futures
def scrape_page_worker(page_url):
"""
Worker function for concurrent scraping
"""
session = requests.Session()
try:
response = session.get(page_url, timeout=10)
soup = BeautifulSoup(response.content, 'html.parser')
return extract_page_data(soup)
except Exception as e:
print(f"Error scraping {page_url}: {e}")
return []
def concurrent_pagination_scraper(base_url, max_pages=10, max_workers=5):
"""
Scrape multiple pages concurrently
"""
urls = [f"{base_url}?page={page}" for page in range(1, max_pages + 1)]
all_data = []
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
future_to_url = {executor.submit(scrape_page_worker, url): url for url in urls}
for future in concurrent.futures.as_completed(future_to_url):
url = future_to_url[future]
try:
data = future.result()
all_data.extend(data)
print(f"Completed {url}: {len(data)} items")
except Exception as e:
print(f"Error with {url}: {e}")
return all_data
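Concurrent fetching multiplies your request rate and only makes sense when the page URLs are independent and known up front, so keep max_workers small and combine it with the delay helper above. A usage sketch (the URL is a placeholder):
if __name__ == "__main__":
    results = concurrent_pagination_scraper(
        "https://example-store.com/products",  # placeholder URL
        max_pages=10,
        max_workers=3  # keep this low to avoid overloading the site
    )
    print(f"Collected {len(results)} items across all pages")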
JavaScript Framework Support
Working with React/Vue.js Applications
Many modern web applications use JavaScript frameworks that render content client-side. Here's how to handle them:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def wait_for_react_load(driver):
"""
Wait for React application to fully load
"""
    # Wait for a global React object (only works if the page exposes one;
    # many production bundles do not, so the element check below is the safer signal)
WebDriverWait(driver, 10).until(
lambda driver: driver.execute_script("return typeof React !== 'undefined'")
)
# Wait for specific elements to be rendered
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, "[data-testid='product-list']"))
)
Handling Dynamic Loading
Detecting When Content Finishes Loading
def wait_for_content_load(driver, timeout=30):
"""
Wait for dynamic content to finish loading
"""
    # Method 1: Wait for network activity to settle (this counts only
    # XMLHttpRequests started after injection; fetch() calls are not tracked)
driver.execute_script("""
window.pendingRequests = 0;
const originalXHR = window.XMLHttpRequest;
window.XMLHttpRequest = function() {
const xhr = new originalXHR();
window.pendingRequests++;
xhr.addEventListener('loadend', () => window.pendingRequests--);
return xhr;
};
""")
# Wait for all requests to complete
WebDriverWait(driver, timeout).until(
lambda driver: driver.execute_script("return window.pendingRequests === 0")
)
# Method 2: Wait for specific loading indicators to disappear
try:
WebDriverWait(driver, 5).until(
EC.invisibility_of_element_located((By.CLASS_NAME, "loading-spinner"))
)
except TimeoutException:
pass # No loading spinner found
Command Line Interface
Create a command-line tool for pagination scraping:
# Install required packages
pip install requests beautifulsoup4 selenium click
# Run the scraper
python pagination_scraper.py --url "https://example.com/products" --max-pages 10
import csv
import click

def save_to_csv(data, filename):
    """
    Write scraped dictionaries to a CSV file (same logic as in Method 1)
    """
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=data[0].keys())
        writer.writeheader()
        writer.writerows(data)

@click.command()
@click.option('--url', required=True, help='Base URL to scrape')
@click.option('--max-pages', default=None, type=int, help='Maximum pages to scrape')
@click.option('--output', default='output.csv', help='Output file name')
@click.option('--delay', default=1, help='Delay between requests (seconds)')
def scrape_cli(url, max_pages, output, delay):
"""
Command line interface for pagination scraping
"""
click.echo(f"Scraping {url} with max pages: {max_pages}")
    # Note: this global only takes effect if scrape_paginated_site is updated
    # to read REQUEST_DELAY instead of its fixed 1-second sleep
    global REQUEST_DELAY
    REQUEST_DELAY = delay
# Run scraper
data = scrape_paginated_site(url, max_pages)
# Save results
if data:
save_to_csv(data, output)
click.echo(f"Saved {len(data)} items to {output}")
else:
click.echo("No data found")
if __name__ == '__main__':
scrape_cli()
Conclusion
Scraping paginated websites requires different approaches depending on the pagination implementation. Start with simple URL-based methods using requests and BeautifulSoup for static content. For JavaScript-heavy sites, leverage Selenium WebDriver to handle dynamic content loading. When possible, identify and use underlying API endpoints for the most efficient data extraction.
Remember to always respect websites' terms of service, implement proper rate limiting, and handle errors gracefully. For complex scenarios involving JavaScript-heavy websites with dynamic content loading, consider using browser automation tools that can handle sophisticated pagination mechanisms. When dealing with multiple browser contexts or sessions, understanding how to handle browser sessions in Puppeteer can provide additional insights for managing state across paginated requests.
The key to successful pagination scraping is understanding the specific implementation patterns of your target website and choosing the appropriate technique accordingly.