How Do I Handle Pagination in Scrapy?
Pagination is one of the most common challenges in web scraping, especially when dealing with websites that split content across multiple pages. Scrapy provides several powerful mechanisms to handle pagination efficiently, from simple link following to complex form-based navigation.
Understanding Pagination Types
Before diving into implementation, it's important to understand the different types of pagination you might encounter:
- Link-based pagination: Traditional "Next" buttons or numbered page links
- Form-based pagination: Pagination through form submissions
- JavaScript-based pagination: AJAX or infinite scroll (requires special handling)
- API-based pagination: RESTful APIs with offset/limit parameters
Method 1: Following Pagination Links
The most straightforward approach is following pagination links using Scrapy's `response.follow()` method or by yielding new `Request` objects.
Basic Link Following
```python
import scrapy


class PaginationSpider(scrapy.Spider):
    name = 'pagination_spider'
    start_urls = ['https://example.com/products']

    def parse(self, response):
        # Extract data from current page
        for product in response.css('.product-item'):
            yield {
                'name': product.css('.product-name::text').get(),
                'price': product.css('.price::text').get(),
                'description': product.css('.description::text').get(),
            }

        # Follow pagination link
        next_page = response.css('.pagination .next::attr(href)').get()
        if next_page:
            yield response.follow(next_page, self.parse)
```
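If the pager exposes links to every page at once rather than just a "Next" button, `response.follow_all()` (available since Scrapy 2.0) can queue a request for each matched link in one call. A minimal sketch, assuming the same `.pagination` markup as above:

```python
def parse(self, response):
    for product in response.css('.product-item'):
        yield {'name': product.css('.product-name::text').get()}

    # Queue every page linked from the pager; Scrapy's duplicate filter
    # drops any page that has already been requested
    yield from response.follow_all(css='.pagination a::attr(href)', callback=self.parse)
```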
Using XPath for Complex Pagination
```python
def parse(self, response):
    # Extract items
    for item in response.xpath('//div[@class="item"]'):
        yield {
            'title': item.xpath('.//h2/text()').get(),
            'content': item.xpath('.//p/text()').get(),
        }

    # Follow next page using XPath
    next_page = response.xpath('//a[contains(text(), "Next")]/@href').get()
    if next_page:
        yield response.follow(next_page, callback=self.parse)
```
Method 2: Numbered Page Navigation
When dealing with numbered pagination, you can iterate through page numbers systematically:
```python
import scrapy


class NumberedPaginationSpider(scrapy.Spider):
    name = 'numbered_pagination'
    start_urls = ['https://example.com/articles?page=1']

    def parse(self, response):
        # Extract data
        for article in response.css('.article'):
            yield {
                'title': article.css('h2::text').get(),
                'author': article.css('.author::text').get(),
                'date': article.css('.date::text').get(),
            }

        # Check if there are more pages
        current_page = response.meta.get('page', 1)
        max_page = self.get_max_page(response)

        if current_page < max_page:
            next_page = current_page + 1
            next_url = f'https://example.com/articles?page={next_page}'
            yield scrapy.Request(
                url=next_url,
                callback=self.parse,
                meta={'page': next_page}
            )

    def get_max_page(self, response):
        # Extract the maximum page number from the pagination widget
        last_page = response.css('.pagination a:last-child::text').get()
        try:
            return int(last_page)
        except (ValueError, TypeError):
            return 1
```
Method 3: Form-Based Pagination
Some websites paginate through form submissions. Here's how to handle the POST requests involved:
```python
import scrapy


class FormPaginationSpider(scrapy.Spider):
    name = 'form_pagination'
    start_urls = ['https://example.com/search']

    def parse(self, response):
        # Extract data from current page
        for result in response.css('.search-result'):
            yield {
                'title': result.css('.title::text').get(),
                'url': result.css('a::attr(href)').get(),
            }

        # Handle form-based pagination; from_response pre-fills hidden
        # inputs from the form, and explicit formdata values override them
        form_data = {
            'page': str(response.meta.get('page', 1) + 1),
            'query': 'your_search_term',
            'csrf_token': response.css('input[name="csrf_token"]::attr(value)').get()
        }

        # Check if next page exists
        if self.has_next_page(response):
            yield scrapy.FormRequest.from_response(
                response,
                formdata=form_data,
                callback=self.parse,
                meta={'page': response.meta.get('page', 1) + 1}
            )

    def has_next_page(self, response):
        # Check if the "Next" button is disabled or doesn't exist
        next_button = response.css('.pagination .next')
        return bool(next_button) and not next_button.css('.disabled')
```
Method 4: Handling Infinite Scroll with Splash
For JavaScript-heavy pagination like infinite scroll, you'll need to use Scrapy-Splash or similar tools:
```python
import scrapy
from scrapy_splash import SplashRequest


class InfiniteScrollSpider(scrapy.Spider):
    name = 'infinite_scroll'

    def start_requests(self):
        splash_args = {
            'wait': 2,
            'lua_source': '''
                function main(splash, args)
                    splash:go(args.url)
                    splash:wait(2)
                    -- Scroll down repeatedly to trigger lazy loading
                    for i = 1, 5 do
                        splash:runjs("window.scrollTo(0, document.body.scrollHeight);")
                        splash:wait(2)
                    end
                    return splash:html()
                end
            '''
        }

        yield SplashRequest(
            url='https://example.com/infinite-scroll',
            callback=self.parse,
            endpoint='execute',  # the 'execute' endpoint is required to run the Lua script
            args=splash_args
        )

    def parse(self, response):
        for item in response.css('.item'):
            yield {
                'title': item.css('.title::text').get(),
                'content': item.css('.content::text').get(),
            }
```
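Note that SplashRequest only works once scrapy-splash is wired into the project settings and a Splash instance is reachable. A minimal configuration sketch, assuming Splash runs locally on port 8050 (for example via the official Docker image) and using the middleware ordering from the scrapy-splash README:

```python
# settings.py (sketch; adjust SPLASH_URL to wherever your Splash instance runs)
SPLASH_URL = 'http://localhost:8050'

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
```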
Advanced Pagination Techniques
Using URL Patterns
When you understand the URL pattern, you can generate requests programmatically:
```python
import scrapy


class PatternPaginationSpider(scrapy.Spider):
    name = 'pattern_pagination'

    def start_requests(self):
        base_url = 'https://api.example.com/data'

        # Generate requests for multiple pages up front
        for page in range(1, 101):  # Pages 1-100
            url = f'{base_url}?page={page}&limit=20'
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                meta={'page': page}
            )

    def parse(self, response):
        data = response.json()

        # Process each item
        for item in data.get('results', []):
            yield {
                'id': item.get('id'),
                'name': item.get('name'),
                'created_at': item.get('created_at'),
            }

        # All 100 requests were already scheduled in start_requests, so an
        # empty page can only be logged here; it does not cancel later requests
        if not data.get('results'):
            self.logger.info(f"No more data on page {response.meta['page']}")
```
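If you don't know the total page count in advance, a sequential variant avoids scheduling pages that may not exist: request page N+1 only after page N returned results. A sketch under the same assumed API:

```python
def start_requests(self):
    # Start from page 1 only; later pages are requested on demand
    yield scrapy.Request(
        'https://api.example.com/data?page=1&limit=20',
        callback=self.parse,
        meta={'page': 1},
    )

def parse(self, response):
    data = response.json()
    results = data.get('results', [])
    for item in results:
        yield item

    # Request the next page only while the current one still has data
    if results:
        next_page = response.meta['page'] + 1
        yield scrapy.Request(
            f'https://api.example.com/data?page={next_page}&limit=20',
            callback=self.parse,
            meta={'page': next_page},
        )
```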
Handling Rate Limiting and Delays
When scraping paginated content, implement proper delays to avoid being blocked:
```python
import scrapy


class RateLimitedPaginationSpider(scrapy.Spider):
    name = 'rate_limited_pagination'

    custom_settings = {
        'DOWNLOAD_DELAY': 2,
        'RANDOMIZE_DOWNLOAD_DELAY': True,  # waits 0.5x to 1.5x of DOWNLOAD_DELAY
        'CONCURRENT_REQUESTS': 1,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
    }

    def parse(self, response):
        # Your parsing logic here
        for item in response.css('.item'):
            yield {'data': item.css('::text').get()}

        # Follow pagination with built-in delays
        next_page = response.css('.next::attr(href)').get()
        if next_page:
            yield response.follow(next_page, self.parse)
```
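As an alternative to fixed delays, Scrapy's built-in AutoThrottle extension adapts the delay to the server's response times. A minimal sketch of the relevant settings:

```python
custom_settings = {
    'AUTOTHROTTLE_ENABLED': True,
    'AUTOTHROTTLE_START_DELAY': 1,           # initial delay in seconds
    'AUTOTHROTTLE_MAX_DELAY': 30,            # upper bound when the server slows down
    'AUTOTHROTTLE_TARGET_CONCURRENCY': 1.0,  # average parallel requests per remote site
}
```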
Best Practices for Pagination
1. Implement Robust Error Handling
```python
def parse(self, response):
    try:
        # Your parsing logic
        items = response.css('.item')
        if not items:
            self.logger.warning(f"No items found on {response.url}")
            return

        for item in items:
            yield self.extract_item(item)  # extract_item(): your own helper method

        # Safe pagination
        next_url = self.get_next_page_url(response)  # your own helper method
        if next_url:
            yield response.follow(next_url, self.parse)

    except Exception as e:
        self.logger.error(f"Error parsing {response.url}: {e}")
```
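The try/except above only covers errors raised while parsing a response; failed requests (timeouts, DNS errors, retries exhausted) never reach parse(). For those, attach an errback to the request. A short sketch:

```python
def start_requests(self):
    yield scrapy.Request(
        'https://example.com/products',
        callback=self.parse,
        errback=self.handle_error,  # called when the request itself fails
    )

def handle_error(self, failure):
    # failure is a Twisted Failure wrapping the underlying exception
    self.logger.error(f"Request failed: {failure.request.url} ({failure.value!r})")
```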
2. Use Duplicate Filtering
```python
import scrapy


class PaginationSpider(scrapy.Spider):
    name = 'pagination_with_filter'

    custom_settings = {
        # RFPDupeFilter is already Scrapy's default; set it explicitly only to
        # make the choice visible or to swap in a custom filter class
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.RFPDupeFilter',
    }

    def parse(self, response):
        # Scrapy automatically filters duplicate request URLs
        for item in response.css('.item'):
            yield {'url': item.css('a::attr(href)').get()}

        next_page = response.css('.next::attr(href)').get()
        if next_page:
            yield response.follow(next_page, self.parse)
```
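Conversely, when a pagination URL legitimately needs to be fetched more than once (for example, re-checking page 1 for new items), the filter can be bypassed per request with dont_filter:

```python
# Re-fetch a URL even if the duplicate filter has already seen it
yield scrapy.Request(
    'https://example.com/products?page=1',
    callback=self.parse,
    dont_filter=True,
)
```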
3. Monitor Progress
```python
def parse(self, response):
    page_num = response.meta.get('page', 1)
    self.logger.info(f"Processing page {page_num}")

    # Your extraction logic
    items_count = len(response.css('.item'))
    self.logger.info(f"Found {items_count} items on page {page_num}")

    # Continue pagination
    if items_count > 0:  # Only continue if items were found
        next_page = response.css('.next::attr(href)').get()
        if next_page:
            yield response.follow(
                next_page,
                self.parse,
                meta={'page': page_num + 1}
            )
```
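During development it also helps to cap a paginated crawl with the built-in CloseSpider extension instead of editing the spider itself. A sketch of the relevant settings:

```python
custom_settings = {
    'CLOSESPIDER_PAGECOUNT': 10,   # stop after 10 responses have been crawled
    'CLOSESPIDER_ITEMCOUNT': 200,  # or after 200 items, whichever limit is hit first
}
```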
Conclusion
Handling pagination in Scrapy requires understanding the website's pagination mechanism and choosing the appropriate method. Whether you're dealing with simple link-based pagination or complex JavaScript-driven infinite scroll, Scrapy provides the tools to handle these scenarios effectively.
The key is to:
- Identify the pagination pattern
- Implement proper error handling
- Use appropriate delays to avoid rate limiting
- Monitor your scraping progress
- Test thoroughly with different page ranges
For more complex scenarios involving JavaScript-heavy pagination, consider integrating Scrapy with browser automation tools or using Scrapy-Splash for dynamic content rendering.