How to Handle HTTP Errors in Scrapy
Handling HTTP errors is essential for building robust web scrapers. Scrapy provides several mechanisms to handle HTTP errors gracefully, including error middlewares, callback functions, and retry logic.
Understanding HTTP Error Handling in Scrapy
By default, Scrapy only passes responses with HTTP status codes in the 200-299 range (successful responses) to your spider and filters out the rest. The HttpErrorMiddleware performs this filtering: for an unsuccessful response it raises an HttpError, which you can catch in an errback, or you can whitelist specific status codes so they reach your callbacks anyway.
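To see the default behaviour in action, here is a minimal sketch (the httpbin URL is just a test endpoint): the 404 response is silently dropped and parse is never called unless you opt in with one of the techniques below.
import scrapy

class DefaultFilterSpider(scrapy.Spider):
    name = 'default_filter'
    start_urls = ['https://httpbin.org/status/404']

    def parse(self, response):
        # Never reached for the 404 above: HttpErrorMiddleware filters it out
        # and logs "Ignoring response <404 ...>: HTTP status code is not handled or not allowed"
        self.logger.info(f'Got {response.status} from {response.url}')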
1. Enable HttpErrorMiddleware
The HttpErrorMiddleware is a spider middleware and is enabled by default in Scrapy. To configure it explicitly in your settings.py:
SPIDER_MIDDLEWARES = {
    'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50,
}
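The middleware also reads two project-wide settings that control which unsuccessful responses are let through to your callbacks. A minimal sketch for settings.py, assuming you want 404 pages to reach your spider while everything else is still filtered:
# Let specific non-2xx responses through to spider callbacks
HTTPERROR_ALLOWED_CODES = [404]

# Or disable the filtering entirely (use with care: every response reaches your callbacks)
# HTTPERROR_ALLOW_ALL = True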
2. Basic Error Handling with Errback
Use the errback parameter in your requests to handle errors:
import scrapy
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TimeoutError, TCPTimedOutError

class RobustSpider(scrapy.Spider):
    name = 'robust_spider'
    start_urls = [
        'https://httpbin.org/status/404',
        'https://httpbin.org/status/500',
        'https://httpbin.org/status/200'
    ]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse,
                errback=self.handle_error,
                dont_filter=True
            )

    def parse(self, response):
        self.logger.info(f'Successfully scraped {response.url}')
        # Process successful response
        yield {'url': response.url, 'status': response.status}

    def handle_error(self, failure):
        self.logger.error(f'Request failed: {repr(failure)}')

        if failure.check(HttpError):
            response = failure.value.response
            self.logger.error(f'HTTP {response.status} error on {response.url}')

            # Handle specific HTTP errors
            if response.status == 404:
                self.logger.warning(f'Page not found: {response.url}')
            elif response.status == 500:
                self.logger.error(f'Server error: {response.url}')
            elif response.status == 403:
                self.logger.warning(f'Access forbidden: {response.url}')

        elif failure.check(DNSLookupError):
            self.logger.error(f'DNS lookup failed for {failure.request.url}')

        elif failure.check(TimeoutError, TCPTimedOutError):
            self.logger.error(f'Request timed out: {failure.request.url}')
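If you want to try this spider as a standalone script rather than inside a Scrapy project, a minimal sketch using CrawlerProcess (the log level is just an example):
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
process.crawl(RobustSpider)
process.start()  # blocks until the crawl finishes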
3. Handling Specific HTTP Status Codes
By default, non-2xx responses never reach your callback. The handle_httpstatus_list spider attribute tells Scrapy which status codes to pass through so you can deal with them yourself:
class CustomErrorSpider(scrapy.Spider):
    name = 'custom_error_spider'

    # Let 404 and 500 responses reach the callback instead of being filtered
    handle_httpstatus_list = [404, 500]

    def start_requests(self):
        urls = ['https://httpbin.org/status/404', 'https://httpbin.org/status/500']
        for url in urls:
            yield scrapy.Request(url, callback=self.parse_with_errors)

    def parse_with_errors(self, response):
        if response.status == 404:
            self.logger.warning(f'Page not found: {response.url}')
            # Handle 404 - maybe skip or use an alternative URL
            yield {'url': response.url, 'error': '404 Not Found'}
        elif response.status == 500:
            self.logger.error(f'Server error: {response.url}')
            # Handle 500 - maybe retry later
            yield {'url': response.url, 'error': '500 Server Error'}
        else:
            # Normal processing for successful responses
            self.logger.info(f'Processing {response.url}')
            yield {'url': response.url, 'status': 'success'}
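handle_httpstatus_list can also be set per request through Request.meta, which is handy when only some URLs are expected to fail. A minimal sketch (the URL is just a placeholder):
def start_requests(self):
    # Only this request is allowed to deliver a 404 to the callback;
    # all other requests keep the default filtering behaviour.
    yield scrapy.Request(
        'https://httpbin.org/status/404',
        callback=self.parse_with_errors,
        meta={'handle_httpstatus_list': [404]},
    )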
4. Advanced Error Handling with Retry Logic
Implement custom retry logic for specific error types:
class RetrySpider(scrapy.Spider):
    name = 'retry_spider'
    start_urls = ['https://httpbin.org/status/500']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse,
                errback=self.handle_error,
                meta={'retry_count': 0}
            )

    def handle_error(self, failure):
        request = failure.request
        retry_count = request.meta.get('retry_count', 0)
        max_retries = 3

        if failure.check(HttpError):
            response = failure.value.response
            # Retry on server errors (5xx)
            if 500 <= response.status < 600 and retry_count < max_retries:
                self.logger.warning(f'Retrying {request.url} (attempt {retry_count + 1})')
                # dont_filter=True so the duplicate filter does not drop the retry
                yield request.replace(
                    dont_filter=True,
                    meta={**request.meta, 'retry_count': retry_count + 1},
                )
            else:
                self.logger.error(f'Giving up on {request.url} after {retry_count} retries')

        elif failure.check(TimeoutError, TCPTimedOutError):
            if retry_count < max_retries:
                self.logger.warning(f'Timeout, retrying {request.url}')
                yield request.replace(
                    dont_filter=True,
                    meta={**request.meta, 'retry_count': retry_count + 1},
                )
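If you are on Scrapy 2.5 or later, the scrapy.downloadermiddlewares.retry.get_retry_request() helper does the same bookkeeping as the built-in RetryMiddleware (retry counters, max retry limits, stats), so you do not have to copy requests by hand. A sketch of using it inside an errback:
from scrapy.downloadermiddlewares.retry import get_retry_request

def handle_error(self, failure):
    # Builds a retry request that honours RETRY_TIMES and records retry stats;
    # returns None once the retry budget for this request is exhausted.
    retry_request = get_retry_request(
        failure.request,
        spider=self,
        reason='errback retry',
    )
    if retry_request:
        yield retry_request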
5. Custom Error Middleware
Create a custom middleware for centralized error handling:
# middlewares.py
import logging

class CustomHttpErrorMiddleware:
    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def process_response(self, request, response, spider):
        if response.status >= 400:
            self.logger.error(f'HTTP {response.status} error on {request.url}')
            # Log response body for debugging
            if response.status == 404:
                self.logger.debug(f'404 response body: {response.text[:200]}...')
        return response
# settings.py
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.CustomHttpErrorMiddleware': 543,
}
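If the middleware needs access to settings or the stats collector, the usual pattern is a from_crawler classmethod. A sketch under the assumption that you want to count error responses in the crawl stats (the stat key custom/http_error_count is made up for illustration):
class CustomHttpErrorMiddleware:
    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this to build the middleware, handing it the running crawler
        return cls(stats=crawler.stats)

    def process_response(self, request, response, spider):
        if response.status >= 400:
            # Hypothetical stat key; appears in the stats dump at the end of the crawl
            self.stats.inc_value('custom/http_error_count')
        return response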
6. Error Handling Best Practices
Log Errors Appropriately
def handle_error(self, failure):
    # Use appropriate log levels
    if failure.check(HttpError):
        response = failure.value.response
        if response.status == 404:
            self.logger.warning(f'Page not found: {response.url}')
        elif response.status >= 500:
            self.logger.error(f'Server error {response.status}: {response.url}')

    # Include request metadata in logs
    request = failure.request
    self.logger.error(f'Failed request: {request.url}, meta: {request.meta}')
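Alongside log messages, it is often useful to count failures in Scrapy's stats collector so they show up in the end-of-crawl summary. A minimal sketch inside a spider (the keys under errback/ are illustrative, not a Scrapy convention):
def handle_error(self, failure):
    if failure.check(HttpError):
        status = failure.value.response.status
        # e.g. errback/http_404, errback/http_500 in the final stats dump
        self.crawler.stats.inc_value(f'errback/http_{status}')
    else:
        self.crawler.stats.inc_value('errback/other')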
Graceful Degradation
def handle_error(self, failure):
    if failure.check(HttpError):
        response = failure.value.response
        # Try alternative approaches
        if response.status == 403:
            # Maybe the site requires different headers;
            # dont_filter=True so the duplicate filter does not drop the retry
            new_request = failure.request.replace(dont_filter=True)
            new_request.headers.update({
                'User-Agent': 'Mozilla/5.0 (compatible; bot)',
                'Accept': 'text/html,application/xhtml+xml'
            })
            yield new_request
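Another common degradation strategy is to fall back to an alternative source when the primary URL fails. A sketch, assuming each request carries a hypothetical fallback_url entry in its meta:
def handle_error(self, failure):
    fallback_url = failure.request.meta.get('fallback_url')
    if fallback_url:
        self.logger.info(f'Falling back to {fallback_url}')
        # Reuse the original callback and errback, but point at the fallback URL
        yield failure.request.replace(url=fallback_url, dont_filter=True)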
Configuration Options
Configure error handling behavior in settings.py:
# Retry failed requests
RETRY_ENABLED = True
RETRY_TIMES = 3
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429]

# Download delays to avoid overwhelming servers
DOWNLOAD_DELAY = 1
RANDOMIZE_DOWNLOAD_DELAY = True  # waits between 0.5x and 1.5x DOWNLOAD_DELAY

# Timeout settings
DOWNLOAD_TIMEOUT = 180
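These options can also be applied per spider via the custom_settings class attribute, which is convenient when only one spider needs more aggressive retrying. A minimal sketch (the values are examples to tune for your target site):
class TolerantSpider(scrapy.Spider):
    name = 'tolerant_spider'

    # Per-spider overrides of the project-wide settings above
    custom_settings = {
        'RETRY_TIMES': 5,
        'RETRY_HTTP_CODES': [500, 502, 503, 504, 408, 429],
        'DOWNLOAD_TIMEOUT': 30,
    }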
Conclusion
Effective HTTP error handling in Scrapy involves:
- Using errback functions to catch and handle errors
- Implementing retry logic for transient errors
- Logging errors appropriately for debugging
- Handling specific status codes based on your needs
- Configuring middlewares for centralized error handling
By implementing these patterns, you can build robust spiders that gracefully handle network issues, server errors, and other HTTP-related problems while maintaining good performance and reliability.