How to Handle Redirects in Scrapy
Scrapy automatically handles HTTP redirects through its built-in RedirectMiddleware. This guide covers default behavior, configuration options, and custom redirect handling techniques.
Default Redirect Behavior
Scrapy follows HTTP 3xx redirects automatically with these default settings:
- REDIRECT_ENABLED: True (enables automatic redirect following)
- REDIRECT_MAX_TIMES: 20 (maximum consecutive redirects allowed)
- REDIRECT_PRIORITY_ADJUST: +2 (priority adjustment for redirected requests)
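These defaults can also be overridden for individual requests through Request.meta. For example, the documented dont_redirect key disables redirect handling for a single request; pairing it with handle_httpstatus_list lets the raw 3xx response reach your callback:

# Opt out of automatic redirect handling for this request only
yield scrapy.Request(
    'https://example.com/some-page',  # illustrative URL
    callback=self.parse,
    meta={
        'dont_redirect': True,
        'handle_httpstatus_list': [301, 302],
    },
)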
Basic Configuration
Configure redirect behavior in your settings.py:
# settings.py
REDIRECT_ENABLED = True
REDIRECT_MAX_TIMES = 10 # Reduce from default 20
REDIRECT_PRIORITY_ADJUST = 0 # No priority adjustment
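The same settings can be scoped to a single spider via the standard custom_settings attribute, which takes precedence over settings.py:

class MySpider(scrapy.Spider):  # illustrative spider
    name = 'my_spider'
    custom_settings = {
        'REDIRECT_MAX_TIMES': 10,
    }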
Handling Specific Redirect Status Codes
To handle specific redirect status codes yourself instead of letting the middleware follow them, list those codes in handle_httpstatus_list; RedirectMiddleware passes such responses straight to your callback:
import scrapy


class RedirectSpider(scrapy.Spider):
    name = 'redirect_spider'
    # These statuses reach the callback instead of being auto-followed
    handle_httpstatus_list = [301, 302, 303, 307, 308]

    def start_requests(self):
        urls = ['https://example.com/redirect-url']
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        if response.status in self.handle_httpstatus_list:
            # Handle the redirect manually; Location may be relative,
            # so resolve it against the current URL
            redirect_url = response.urljoin(
                response.headers['Location'].decode())
            self.logger.info(f'Redirect from {response.url} to {redirect_url}')
            # Follow the redirect (or handle it differently)
            yield scrapy.Request(redirect_url, callback=self.parse_final)
        else:
            # Normal response processing
            yield from self.parse_final(response)

    def parse_final(self, response):
        self.logger.info(f'Final URL: {response.url}')
        # Extract data from the final page
        yield {
            'url': response.url,
            'title': response.css('title::text').get(),
        }
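One subtlety when following redirects manually: Scrapy's own middleware preserves the HTTP method for 301, 307, and 308 responses but switches to GET for 302 and 303. If the original request was a POST, mirror that behavior rather than blindly re-requesting with the same method.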
Tracking Redirect Chains
Access redirect history using the response.meta dictionary:
def parse(self, response):
    # URLs of the redirect hops that led to the current response
    redirect_urls = response.meta.get('redirect_urls', [])
    if redirect_urls:
        self.logger.info(f'Redirect chain: {" -> ".join(redirect_urls)} -> {response.url}')

    # Check redirect reasons
    redirect_reasons = response.meta.get('redirect_reasons', [])
    self.logger.info(f'Redirect reasons: {redirect_reasons}')

    yield {
        'final_url': response.url,
        'redirect_chain': redirect_urls,
        'redirect_count': len(redirect_urls),
    }
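Each entry in redirect_reasons is the HTTP status code of that hop, or the string 'meta refresh' when the hop came from Scrapy's MetaRefreshMiddleware.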
Custom Redirect Middleware
Create custom redirect handling by overriding the default middleware:
# middlewares.py
from scrapy.downloadermiddlewares.redirect import RedirectMiddleware


class CustomRedirectMiddleware(RedirectMiddleware):
    def _redirect_request_using_get(self, request, redirect_url):
        """Override to customize how redirect requests are created."""
        redirected_request = super()._redirect_request_using_get(request, redirect_url)
        # Add custom headers or meta to redirect requests
        redirected_request.meta['redirect_custom'] = True
        return redirected_request

    def process_response(self, request, response, spider):
        # Custom logic for specific redirect scenarios
        if response.status == 302 and 'special-redirect' in response.url:
            # Handle special case redirects differently
            return self._handle_special_redirect(request, response, spider)
        return super().process_response(request, response, spider)

    def _handle_special_redirect(self, request, response, spider):
        # Custom redirect handling logic; resolve a possibly relative Location
        redirect_url = response.urljoin(response.headers['Location'].decode())
        spider.logger.info(f'Special redirect handling: {redirect_url}')
        return request.replace(url=redirect_url, dont_filter=True)
Enable the custom middleware in settings.py:
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
    'myproject.middlewares.CustomRedirectMiddleware': 600,
}
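Priority 600 is the slot the built-in RedirectMiddleware occupies in DOWNLOADER_MIDDLEWARES_BASE, so the replacement runs at the same point in the middleware chain.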
Preventing Infinite Redirect Loops
Protect against redirect loops with proper configuration:
class SafeRedirectSpider(scrapy.Spider):
    name = 'safe_redirect_spider'

    custom_settings = {
        'REDIRECT_MAX_TIMES': 5,  # Lower than the default of 20
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.RFPDupeFilter',  # Scrapy's default dupefilter
    }

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse,
                # redirect_ttl is the middleware's internal (undocumented)
                # per-request counter; seeding it caps this request's hops
                meta={'redirect_ttl': 3},
                dont_filter=False,  # Keep duplicate filtering enabled
            )
Conditional Redirect Following
Follow redirects only when they meet specific conditions; as before, this assumes the relevant status codes are in handle_httpstatus_list so they reach the callback:
from urllib.parse import urlparse

def parse(self, response):
    # Only follow redirects that stay on the same domain
    if response.status in [301, 302]:
        redirect_url = response.urljoin(response.headers['Location'].decode())
        current_domain = urlparse(response.url).netloc
        redirect_domain = urlparse(redirect_url).netloc

        if current_domain == redirect_domain:
            yield scrapy.Request(redirect_url, callback=self.parse)
        else:
            self.logger.warning(f'Skipping cross-domain redirect: {redirect_url}')
    else:
        # Process the current page
        yield self.extract_data(response)  # extract_data() is your item-building helper
Common Redirect Scenarios
JavaScript Redirects
Scrapy does not execute JavaScript, so handle client-side redirects with a rendering service such as Splash or a browser driver such as Selenium:
# Using scrapy-splash
yield scrapy.Request(
    url,
    self.parse,
    meta={
        'splash': {
            'args': {'wait': 2, 'html': 1},
        }
    }
)
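Equivalently, scrapy-splash ships a SplashRequest helper that fills in this meta for you:

from scrapy_splash import SplashRequest

# Same render request via the helper; render.html is SplashRequest's
# default endpoint, so the browser-rendered page (after any JS redirect
# fired during the wait) is what reaches the callback
yield SplashRequest(url, self.parse, args={'wait': 2})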
Form-Based Redirects
Handle POST request redirects:
def parse_form(self, response):
    return scrapy.FormRequest.from_response(
        response,
        formdata={'username': 'user', 'password': 'pass'},
        callback=self.after_login,
        dont_filter=True,  # Allow the redirect to reach previously visited URLs
    )
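After the POST, RedirectMiddleware re-issues 302/303 responses as GET requests to the new location, and the hops are recorded in meta as usual, so the callback can sanity-check where it landed:

def after_login(self, response):
    chain = response.meta.get('redirect_urls', [])
    self.logger.info(f'Landed on {response.url} after {len(chain)} redirect(s)')
    # Hypothetical check: still being on a login URL suggests the login failed
    if 'login' in response.url:
        self.logger.warning('Still on the login page - authentication may have failed')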
Meta Refresh Redirects
Handle HTML meta refresh redirects manually, for example when the built-in MetaRefreshMiddleware is disabled or you need custom logic:
import re

def parse(self, response):
    # Check for a meta refresh tag
    meta_refresh = response.css('meta[http-equiv="refresh"]::attr(content)').get()
    if meta_refresh:
        # The content attribute looks like "5; url=/new-page"
        match = re.search(r'url=(.+)', meta_refresh, re.IGNORECASE)
        if match:
            redirect_url = match.group(1).strip('"\'')
            yield scrapy.Request(
                response.urljoin(redirect_url),
                callback=self.parse,
            )
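The built-in MetaRefreshMiddleware's behavior is controlled by two settings:

# settings.py
METAREFRESH_ENABLED = True  # default; set to False to handle meta refresh manually
METAREFRESH_MAXDELAY = 100  # default; refreshes delayed longer than this (seconds) are ignored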
Best Practices
- Set reasonable redirect limits to prevent infinite loops
- Log redirect chains for debugging and monitoring
- Handle cross-domain redirects carefully for security
- Use dont_filter=True when following redirects to previously visited URLs
- Test redirect handling with various redirect types and scenarios
- Monitor redirect patterns to identify potential issues or changes in target sites (see the stats sketch below)
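As a starting point for that monitoring, Scrapy's default stats collector already counts responses per status code; a minimal sketch that reports 3xx counts when the spider closes (the closed hook and the downloader/response_status_count stat keys are standard Scrapy):

def closed(self, reason):
    # downloader/response_status_count/<code> is collected by default
    stats = self.crawler.stats.get_stats()
    for code in (301, 302, 303, 307, 308):
        count = stats.get(f'downloader/response_status_count/{code}', 0)
        if count:
            self.logger.info(f'Saw {count} responses with status {code}')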
Debugging Redirect Issues
Enable detailed redirect logging:
# settings.py
LOG_LEVEL = 'DEBUG'
# Or target only the redirect middleware from spider code
import logging
logging.getLogger('scrapy.downloadermiddlewares.redirect').setLevel(logging.DEBUG)
This will show detailed information about redirect processing, helping identify issues with redirect handling in your spiders.
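With DEBUG enabled, each hop the middleware follows is logged with a line of this form (URLs illustrative):

[scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET https://example.com/new> from <GET https://example.com/old>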