Error handling is essential for building robust Scrapy spiders that cope gracefully with network failures, server errors, and parsing problems. Scrapy provides several mechanisms for catching and handling the different kinds of errors that come up during a crawl.
Common Types of Scrapy Errors
- HTTP errors: Server responses with 4xx/5xx status codes
- Network errors: Connection timeouts, DNS failures
- Parsing errors: Invalid HTML structure, missing elements
- Item pipeline errors: Data validation or storage failures
1. Handling HTTP Status Codes
By default, Scrapy's HttpError spider middleware filters out unsuccessful responses (status codes outside the 200-299 range), so your callbacks never see them. To handle specific error codes yourself:
Handle Specific Status Codes
import scrapy

class MySpider(scrapy.Spider):
    name = 'error_handler'
    handle_httpstatus_list = [404, 403, 500, 503]

    def parse(self, response):
        if response.status == 404:
            self.logger.warning(f"Page not found: {response.url}")
            # Handle 404 - maybe retry or log for later
            return
        elif response.status == 403:
            self.logger.error(f"Access forbidden: {response.url}")
            # Handle rate limiting or authentication issues
            return
        elif response.status >= 500:
            self.logger.error(f"Server error {response.status}: {response.url}")
            # Server errors - might want to retry later
            return

        # Normal processing for 200 responses
        yield from self.parse_content(response)
Global HTTP Error Handling in Settings
# In settings.py
HTTPERROR_ALLOWED_CODES = [404, 403, 500, 502, 503]
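The same list can also be set per request through the handle_httpstatus_list key of Request.meta, so only selected requests see error responses in their callback. A minimal sketch (the URL is just a placeholder):

# Only this request will receive 404/500 responses in its callback
yield scrapy.Request(
    'http://example.com/maybe-missing',
    callback=self.parse,
    meta={'handle_httpstatus_list': [404, 500]},
)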
2. Exception Handling in Parse Methods
Wrap parsing logic in try-except blocks to handle extraction errors:
import scrapy
import traceback

class RobustSpider(scrapy.Spider):
    name = 'robust_spider'

    def parse(self, response):
        try:
            # Extract data with potential failure points
            title = response.css('h1::text').get()
            if not title:
                raise ValueError("Title not found")

            price = response.css('.price::text').re_first(r'\d+\.\d+')
            if not price:
                raise ValueError("Price not found")

            yield {
                'title': title.strip(),
                'price': float(price),
                'url': response.url
            }
        except ValueError as e:
            self.logger.warning(f"Data extraction error on {response.url}: {e}")
            # You might want to yield a partial item or skip entirely
        except Exception as e:
            self.logger.error(f"Unexpected error parsing {response.url}: {e}")
            # Log the full traceback for debugging
            self.logger.error(traceback.format_exc())
3. Handling Request Failures
Use errbacks to handle network-level failures:
import scrapy
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TimeoutError

class NetworkErrorSpider(scrapy.Spider):
    name = 'network_handler'

    def start_requests(self):
        urls = ['http://example.com', 'http://httpbin.org/status/500']
        for url in urls:
            yield scrapy.Request(
                url,
                callback=self.parse,
                errback=self.handle_error,
                dont_filter=True,
                meta={'retry_times': 0}
            )

    def parse(self, response):
        yield {'url': response.url, 'status': 'success'}

    def handle_error(self, failure):
        request = failure.request

        if failure.check(HttpError):
            response = failure.value.response
            self.logger.error(f"HTTP error {response.status}: {request.url}")
        elif failure.check(DNSLookupError):
            self.logger.error(f"DNS lookup failed: {request.url}")
        elif failure.check(TimeoutError):
            self.logger.error(f"Request timeout: {request.url}")
            # Retry the request a limited number of times
            retry_times = request.meta.get('retry_times', 0)
            if retry_times < 3:
                yield request.replace(
                    meta={'retry_times': retry_times + 1},
                    dont_filter=True
                )
        else:
            self.logger.error(f"Unknown error: {failure.value}")
4. Using Scrapy Signals for Global Error Handling
Connect to Scrapy signals to handle errors at the spider level:
import scrapy
from scrapy import signals

class SignalSpider(scrapy.Spider):
    name = 'signal_spider'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        # Connect error handling signals
        crawler.signals.connect(spider.spider_error, signal=signals.spider_error)
        crawler.signals.connect(spider.item_dropped, signal=signals.item_dropped)
        return spider

    def spider_error(self, failure, response, spider):
        self.logger.error(f"Spider error: {failure.getErrorMessage()}")
        # Send notification, update monitoring, etc.

    def item_dropped(self, item, response, exception, spider):
        self.logger.warning(f"Item dropped: {exception}")
        # Handle dropped items - maybe store for manual review
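Signals are also handy for monitoring error rates globally. As a sketch, a spider_closed handler can report the error-related counters Scrapy's stats collector already keeps (MonitoredSpider is a made-up name; the stat keys shown are the ones standard middlewares record):

import scrapy
from scrapy import signals

class MonitoredSpider(scrapy.Spider):
    name = 'monitored'
    start_urls = ['http://example.com']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.report_errors, signal=signals.spider_closed)
        return spider

    def parse(self, response):
        yield {'url': response.url}

    def report_errors(self, spider, reason):
        # The stats collector already counts logged errors and retried requests
        stats = self.crawler.stats
        self.logger.info(
            "Crawl finished (%s): %s errors logged, %s requests retried",
            reason,
            stats.get_value('log_count/ERROR', 0),
            stats.get_value('retry/count', 0),
        )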
5. Custom Exception Classes
Create custom exceptions for better error categorization:
import scrapy

class ScrapingError(Exception):
    """Base exception for scraping errors"""
    pass

class DataExtractionError(ScrapingError):
    """Raised when required data cannot be extracted"""
    pass

class RateLimitError(ScrapingError):
    """Raised when rate limiting is detected"""
    pass

class MySpider(scrapy.Spider):
    name = 'custom_exceptions'

    def parse(self, response):
        try:
            if "rate limit" in response.text.lower():
                raise RateLimitError("Rate limit detected")

            title = response.css('h1::text').get()
            if not title:
                raise DataExtractionError("Title missing")

            yield {'title': title.strip(), 'url': response.url}
        except RateLimitError:
            self.logger.warning("Rate limited - backing off")
            # Implement backoff strategy (see the sketch below)
        except DataExtractionError as e:
            self.logger.error(f"Missing data: {e}")
            # Handle missing data appropriately
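One concrete way to react to a detected rate limit, as a sketch: either close the spider and resume the job later, or put the request back in the queue with lower priority so DOWNLOAD_DELAY or AutoThrottle spaces out the retry (BackoffSpider and the priority value are illustrative choices):

import scrapy
from scrapy.exceptions import CloseSpider  # used in the commented-out alternative

class BackoffSpider(scrapy.Spider):
    name = 'backoff_example'

    def parse(self, response):
        if "rate limit" in response.text.lower():
            self.logger.warning(f"Rate limited on {response.url} - re-queuing")
            # Blunt option: stop the whole crawl and resume later:
            #     raise CloseSpider("rate_limited")
            # Gentler option: re-queue the same request with lower priority so it
            # is fetched again later, after DOWNLOAD_DELAY / AutoThrottle pauses
            yield response.request.replace(dont_filter=True, priority=-10)
            return
        yield {'url': response.url, 'title': response.css('h1::text').get()}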
6. Retry Middleware Configuration
Configure automatic retries for failed requests:
# In settings.py
RETRY_ENABLED = True
RETRY_TIMES = 3
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429]
# Note: the built-in RetryMiddleware re-schedules failed requests immediately
# (optionally adjusting priority via RETRY_PRIORITY_ADJUST); there is no
# exponential backoff setting.
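If you want delays that grow when a site struggles, the closest built-in tool is Scrapy's AutoThrottle extension. A minimal sketch (the specific numbers are illustrative assumptions):

# In settings.py
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1           # initial download delay, in seconds
AUTOTHROTTLE_MAX_DELAY = 60            # ceiling the delay can grow to
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0  # aim for ~1 request in flight per server

AutoThrottle raises the delay when responses get slower, and non-200 responses are never allowed to decrease it, which gives adaptive, backoff-like behaviour without custom middleware.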
Best Practices
- Log at appropriate levels: Use DEBUG, INFO, WARNING, and ERROR appropriately
- Include context: Always log the URL and relevant data when errors occur
- Graceful degradation: Continue processing other items when one fails
- Monitor error rates: Track error patterns to identify systematic issues
- Set reasonable timeouts: Configure DOWNLOAD_TIMEOUT to avoid hanging requests
- Use Item Loaders: They provide built-in error handling for data processing (see the sketch after the example below)
# Example of comprehensive error handling
import scrapy
from datetime import datetime

class ProductSpider(scrapy.Spider):
    name = 'products'
    custom_settings = {
        'DOWNLOAD_TIMEOUT': 30,
        'RETRY_TIMES': 2,
        'LOG_LEVEL': 'INFO'
    }

    def parse(self, response):
        try:
            products = response.css('.product')
            if not products:
                self.logger.warning(f"No products found on {response.url}")
                return

            for product in products:
                try:
                    yield self.extract_product(product, response)
                except Exception as e:
                    self.logger.error(f"Error extracting product: {e}")
                    continue  # Skip this product, continue with others
        except Exception as e:
            self.logger.error(f"Page parsing failed for {response.url}: {e}")

    def extract_product(self, selector, response):
        # Product extraction logic with validation
        name = selector.css('.name::text').get()
        if not name:
            raise ValueError("Product name missing")

        return {
            'name': name.strip(),
            'url': response.url,
            'extracted_at': datetime.now().isoformat()
        }
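As a sketch of the Item Loader point from the best practices (ProductItem, ProductLoader, and to_float are hypothetical names), input/output processors let missing or malformed fields fail softly instead of raising inside the callback:

import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import MapCompose, TakeFirst

class ProductItem(scrapy.Item):
    name = scrapy.Field()
    price = scrapy.Field()

def to_float(value):
    # Return None on bad input; MapCompose drops None values instead of raising
    try:
        return float(value)
    except (TypeError, ValueError):
        return None

class ProductLoader(ItemLoader):
    default_item_class = ProductItem
    default_output_processor = TakeFirst()
    name_in = MapCompose(str.strip)
    price_in = MapCompose(str.strip, to_float)

# Inside a spider callback:
def parse(self, response):
    loader = ProductLoader(response=response)
    loader.add_css('name', 'h1::text')       # missing selector -> field simply absent
    loader.add_css('price', '.price::text')  # bad price text -> dropped by to_float
    yield loader.load_item()

Fields that end up missing can still be validated in an item pipeline that raises DropItem, which in turn feeds the item_dropped signal shown earlier.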
Proper error handling makes your Scrapy spiders more reliable and easier to debug, especially when running long-term scraping operations.