How do I handle HTTP errors in Scrapy?

Handling HTTP errors is essential for building robust web scrapers. Scrapy provides several mechanisms for dealing with them gracefully, including the HttpError middleware, errback functions, and retry logic.

Understanding HTTP Error Handling in Scrapy

By default, Scrapy's HttpErrorMiddleware filters out responses with HTTP status codes outside the 200-299 range (successful responses), so they never reach your spider callbacks. Failed requests can instead be routed to an errback function, and you can explicitly opt in to receiving specific error responses.
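
If you want certain error responses to reach your callbacks instead of being filtered out, Scrapy provides settings for this. A minimal sketch of the relevant options (the allowed codes below are just examples):

# settings.py -- let selected error responses through to your callbacks
HTTPERROR_ALLOWED_CODES = [404]   # these status codes are not filtered
# HTTPERROR_ALLOW_ALL = True      # or pass every response through (use with care)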

1. Enable HttpErrorMiddleware

The HttpErrorMiddleware is enabled by default in Scrapy. Note that it is a spider middleware, not a downloader middleware, so to configure it explicitly in your settings.py:

SPIDER_MIDDLEWARES = {
    'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50,
}

2. Basic Error Handling with Errback

Use the errback parameter in your requests to handle errors:

import scrapy
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TimeoutError, TCPTimedOutError

class RobustSpider(scrapy.Spider):
    name = 'robust_spider'
    start_urls = [
        'https://httpbin.org/status/404',
        'https://httpbin.org/status/500',
        'https://httpbin.org/status/200'
    ]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse,
                errback=self.handle_error,
                dont_filter=True
            )

    def parse(self, response):
        self.logger.info(f'Successfully scraped {response.url}')
        # Process successful response
        yield {'url': response.url, 'status': response.status}

    def handle_error(self, failure):
        self.logger.error(f'Request failed: {repr(failure)}')

        if failure.check(HttpError):
            response = failure.value.response
            self.logger.error(f'HTTP {response.status} error on {response.url}')

            # Handle specific HTTP errors
            if response.status == 404:
                self.logger.warning(f'Page not found: {response.url}')
            elif response.status == 500:
                self.logger.error(f'Server error: {response.url}')
            elif response.status == 403:
                self.logger.warning(f'Access forbidden: {response.url}')

        elif failure.check(DNSLookupError):
            self.logger.error(f'DNS lookup failed for {failure.request.url}')

        elif failure.check(TimeoutError, TCPTimedOutError):
            self.logger.error(f'Request timed out: {failure.request.url}')
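
To try this spider quickly without a full Scrapy project, one option is to run it from a small script using CrawlerProcess; the settings below are only illustrative:

# run_spider.py -- minimal script runner (assumes RobustSpider is importable)
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
process.crawl(RobustSpider)
process.start()  # blocks until the crawl finishes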

3. Handling Specific HTTP Status Codes

You can tell Scrapy which non-2xx status codes should be passed to your callback instead of being filtered out, using handle_httpstatus_list:

class CustomErrorSpider(scrapy.Spider):
    name = 'custom_error_spider'

    # Let 404 and 500 responses reach the callback
    handle_httpstatus_list = [404, 500]

    def start_requests(self):
        urls = ['https://httpbin.org/status/404', 'https://httpbin.org/status/500']
        for url in urls:
            yield scrapy.Request(url, callback=self.parse_with_errors)

    def parse_with_errors(self, response):
        if response.status == 404:
            self.logger.warning(f'Page not found: {response.url}')
            # Handle 404 - maybe skip or use alternative URL
            yield {'url': response.url, 'error': '404 Not Found'}

        elif response.status == 500:
            self.logger.error(f'Server error: {response.url}')
            # Handle 500 - maybe retry later
            yield {'url': response.url, 'error': '500 Server Error'}

        else:
            # Normal processing for successful responses
            self.logger.info(f'Processing {response.url}')
            yield {'url': response.url, 'status': 'success'}
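
The same opt-in also works on a per-request basis through the handle_httpstatus_list meta key, which is useful when only some URLs are expected to fail:

# inside start_requests(): only this request will receive 404 responses in its callback
yield scrapy.Request(
    'https://httpbin.org/status/404',
    callback=self.parse_with_errors,
    meta={'handle_httpstatus_list': [404]}
)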

4. Advanced Error Handling with Retry Logic

Implement custom retry logic for specific error types:

class RetrySpider(scrapy.Spider):
    name = 'retry_spider'
    start_urls = ['https://httpbin.org/status/500', 'https://httpbin.org/status/200']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse,
                errback=self.handle_error,
                meta={'retry_count': 0}
            )

    def parse(self, response):
        yield {'url': response.url, 'status': response.status}

    def handle_error(self, failure):
        request = failure.request
        retry_count = request.meta.get('retry_count', 0)
        max_retries = 3

        if failure.check(HttpError):
            response = failure.value.response

            # Retry on server errors (5xx)
            if 500 <= response.status < 600 and retry_count < max_retries:
                self.logger.warning(f'Retrying {request.url} (attempt {retry_count + 1})')
                # dont_filter=True so the duplicate filter does not drop the retried URL
                new_request = request.replace(dont_filter=True)
                new_request.meta['retry_count'] = retry_count + 1
                yield new_request
            else:
                self.logger.error(f'Giving up on {request.url} after {retry_count} retries')

        elif failure.check(TimeoutError, TCPTimedOutError):
            if retry_count < max_retries:
                self.logger.warning(f'Timeout, retrying {request.url}')
                new_request = request.replace(dont_filter=True)
                new_request.meta['retry_count'] = retry_count + 1
                yield new_request
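
On recent Scrapy versions (2.5+), you don't have to track retry counts by hand: the get_retry_request() helper from the built-in retry middleware copies a request, honours the RETRY_TIMES setting, and returns None once the retry budget is exhausted. A minimal sketch:

from scrapy.downloadermiddlewares.retry import get_retry_request

# inside a spider
def handle_error(self, failure):
    new_request = get_retry_request(
        failure.request,
        spider=self,
        reason='server error or timeout',
    )
    if new_request is not None:  # None means no retries left
        yield new_request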

5. Custom Error Middleware

Create a custom middleware for centralized error handling:

# middlewares.py
import logging

class CustomHttpErrorMiddleware:
    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def process_response(self, request, response, spider):
        if response.status >= 400:
            self.logger.error(f'HTTP {response.status} error on {request.url}')

            # Log response body for debugging
            if response.status == 404:
                self.logger.debug(f'404 response body: {response.text[:200]}...')

        return response

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.CustomHttpErrorMiddleware': 543,
}
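
A centralized middleware is also a convenient place to record error counts in the crawl statistics. The sketch below uses the from_crawler hook to get at the stats collector; the custom/... stat keys are just illustrative names:

# middlewares.py -- count error responses in the crawl stats
class HttpErrorStatsMiddleware:
    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy builds the middleware via from_crawler when it is defined
        return cls(crawler.stats)

    def process_response(self, request, response, spider):
        if response.status >= 400:
            self.stats.inc_value(f'custom/http_error/{response.status}')
        return response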

6. Error Handling Best Practices

Log Errors Appropriately

def handle_error(self, failure):
    # Use appropriate log levels
    if failure.check(HttpError):
        response = failure.value.response
        if response.status == 404:
            self.logger.warning(f'Page not found: {response.url}')
        elif response.status >= 500:
            self.logger.error(f'Server error {response.status}: {response.url}')

    # Include request metadata in logs
    request = failure.request
    self.logger.error(f'Failed request: {request.url}, meta: {request.meta}')

Graceful Degradation

def handle_error(self, failure):
    if failure.check(HttpError):
        response = failure.value.response

        # Try alternative approaches
        if response.status == 403 and not failure.request.meta.get('retried_403'):
            # Maybe the site requires different headers; retry once with new ones.
            # dont_filter=True keeps the duplicate filter from dropping the retry.
            new_request = failure.request.replace(dont_filter=True)
            new_request.meta['retried_403'] = True
            new_request.headers.update({
                'User-Agent': 'Mozilla/5.0 (compatible; bot)',
                'Accept': 'text/html,application/xhtml+xml'
            })
            yield new_request

Configuration Options

Configure error handling behavior in settings.py:

# Retry failed requests
RETRY_ENABLED = True
RETRY_TIMES = 3
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429]

# Download delays to avoid overwhelming servers
DOWNLOAD_DELAY = 1
RANDOMIZE_DOWNLOAD_DELAY = True  # wait between 0.5x and 1.5x of DOWNLOAD_DELAY

# Timeout settings
DOWNLOAD_TIMEOUT = 180
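
These options can also be scoped to a single spider via its custom_settings class attribute, which takes precedence over the project-wide settings.py. The spider name and values below are hypothetical:

class FlakySiteSpider(scrapy.Spider):
    name = 'flaky_site'

    # Per-spider overrides for a site that returns frequent 5xx errors
    custom_settings = {
        'RETRY_TIMES': 5,
        'RETRY_HTTP_CODES': [500, 502, 503, 504, 429],
        'DOWNLOAD_DELAY': 2,
    }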

Conclusion

Effective HTTP error handling in Scrapy involves:

  1. Using errback functions to catch and handle errors
  2. Implementing retry logic for transient errors
  3. Logging errors appropriately for debugging
  4. Handling specific status codes based on your needs
  5. Configuring middlewares for centralized error handling

By implementing these patterns, you can build robust spiders that gracefully handle network issues, server errors, and other HTTP-related problems while maintaining good performance and reliability.
