How do I handle timeouts in Scrapy?

How to Handle Timeouts in Scrapy

Scrapy provides several mechanisms to handle timeouts effectively. Understanding these options helps you build robust spiders that can handle slow or unresponsive websites gracefully.

The DOWNLOAD_TIMEOUT Setting

The primary timeout control in Scrapy is the DOWNLOAD_TIMEOUT setting, which defines how long Scrapy waits for a response before considering the request failed. The default value is 180 seconds (3 minutes).

Global Timeout Configuration

Set timeout globally in your project's settings.py:

# settings.py
DOWNLOAD_TIMEOUT = 60  # 60 seconds timeout for all requests

Per-Spider Timeout Configuration

Override timeout for specific spiders:

import scrapy

class MySpider(scrapy.Spider):
    name = 'my_spider'
    download_timeout = 30  # 30 seconds for this spider only

    def start_requests(self):
        urls = ['https://example.com/page1', 'https://example.com/page2']
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # Parse response here
        pass

Per-Request Timeout Configuration

Set timeout for individual requests using the meta parameter:

import scrapy

class FlexibleSpider(scrapy.Spider):
    name = 'flexible_spider'

    def start_requests(self):
        # Fast endpoint - short timeout
        yield scrapy.Request(
            'https://fast-api.example.com/data',
            callback=self.parse,
            meta={'download_timeout': 10}
        )

        # Slow endpoint - longer timeout
        yield scrapy.Request(
            'https://slow-api.example.com/data',
            callback=self.parse,
            meta={'download_timeout': 120}
        )

    def parse(self, response):
        # Extract data from both fast and slow endpoints
        pass

Handling Timeout Errors

When a timeout occurs, Scrapy calls the request's errback with a Failure object wrapping the exception, typically twisted.internet.error.TimeoutError (or TCPTimedOutError when the TCP connection itself times out).

Basic Error Handling

import scrapy
from twisted.internet.error import TimeoutError

class RobustSpider(scrapy.Spider):
    name = 'robust_spider'

    def start_requests(self):
        urls = ['https://example.com/page1', 'https://example.com/page2']
        for url in urls:
            yield scrapy.Request(
                url,
                callback=self.parse,
                errback=self.handle_error,
                meta={'download_timeout': 30}
            )

    def parse(self, response):
        # Extract data from successful responses
        for item in response.css('.item'):
            yield {
                'title': item.css('.title::text').get(),
                'url': response.url
            }

    def handle_error(self, failure):
        request = failure.request

        if failure.check(TimeoutError):
            self.logger.warning(f'Timeout error for {request.url}')
            # Optionally retry with longer timeout
            yield scrapy.Request(
                request.url,
                callback=self.parse,
                errback=self.handle_error,
                meta={'download_timeout': 60},
                dont_filter=True
            )
        else:
            self.logger.error(f'Other error for {request.url}: {failure.value}')

Advanced Error Handling with Retry Logic

import scrapy
from twisted.internet.error import TimeoutError

class RetrySpider(scrapy.Spider):
    name = 'retry_spider'

    def start_requests(self):
        urls = ['https://example.com/api/data']
        for url in urls:
            yield scrapy.Request(
                url,
                callback=self.parse,
                errback=self.handle_error,
                meta={
                    'download_timeout': 30,
                    'retry_count': 0,
                    'max_retries': 3
                }
            )

    def parse(self, response):
        # Process successful response
        data = response.json()
        for item in data.get('items', []):
            yield item

    def handle_error(self, failure):
        request = failure.request
        retry_count = request.meta.get('retry_count', 0)
        max_retries = request.meta.get('max_retries', 3)

        if failure.check(TimeoutError) and retry_count < max_retries:
            self.logger.warning(
                f'Timeout for {request.url} (attempt {retry_count + 1})'
            )

            # Increase timeout for retry
            new_timeout = 30 * (retry_count + 2)  # 60, 90, 120 seconds

            yield scrapy.Request(
                request.url,
                callback=self.parse,
                errback=self.handle_error,
                meta={
                    'download_timeout': new_timeout,
                    'retry_count': retry_count + 1,
                    'max_retries': max_retries
                },
                dont_filter=True
            )
        else:
            self.logger.error(f'Final failure for {request.url}: {failure.value}')
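
Note that you often do not need to write retry logic by hand: Scrapy's built-in RetryMiddleware already retries requests that fail with timeout errors (among other transient failures), controlled by the RETRY_ENABLED and RETRY_TIMES settings. A manual errback like the one above is mainly useful when you want custom behavior, such as raising the timeout on each attempt. A minimal configuration, with illustrative values:

# settings.py
RETRY_ENABLED = True  # enabled by default
RETRY_TIMES = 3       # extra attempts per failed request (default is 2)
DOWNLOAD_TIMEOUT = 30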

Important Timeout Behavior

First Byte vs. Complete Response

DOWNLOAD_TIMEOUT is not a time-to-first-byte limit: it applies to the complete response download, including establishing the connection and receiving the full body. The timer starts when the downloader begins processing the request, so a server that connects quickly but streams a large body slowly can still hit the timeout.
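
If you want to see how close responses come to that limit, Scrapy records the elapsed download time in the download_latency meta key. The sketch below logs responses that use more than 80% of their timeout budget; the threshold and spider name are illustrative choices, not Scrapy defaults.

import scrapy

class LatencyAwareSpider(scrapy.Spider):
    name = 'latency_aware_spider'
    download_timeout = 30

    def start_requests(self):
        yield scrapy.Request('https://example.com/', callback=self.parse)

    def parse(self, response):
        # download_latency is set by the downloader (seconds elapsed)
        latency = response.meta.get('download_latency', 0)
        timeout = response.meta.get('download_timeout', self.download_timeout)
        if latency > 0.8 * timeout:  # illustrative 80% threshold
            self.logger.warning(
                f'{response.url} took {latency:.1f}s of a {timeout}s timeout budget'
            )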

Connection vs. Download Timeouts

Scrapy does not provide a separate connection timeout setting: DOWNLOAD_TIMEOUT covers both the connection phase and the body download. (DOWNLOAD_DELAY, covered below, only spaces out requests and does not affect timeouts.) If you need different budgets for different sites, the practical options are setting download_timeout per request or assigning it from a custom downloader middleware.
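
As a sketch of the middleware approach, the example below assigns per-domain timeouts by setting the download_timeout meta key before a request reaches the downloader. The middleware name, domain list and timeout values are illustrative; register it in DOWNLOADER_MIDDLEWARES with a priority below 350 so it runs before Scrapy's built-in DownloadTimeoutMiddleware fills in the global default.

# middlewares.py
from urllib.parse import urlparse

class PerDomainTimeoutMiddleware:
    # Hypothetical per-domain budgets; tune them for the sites you crawl
    DOMAIN_TIMEOUTS = {
        'slow-store.com': 120,
        'fast-api.example.com': 10,
    }
    DEFAULT_TIMEOUT = 30

    def process_request(self, request, spider):
        if 'download_timeout' in request.meta:
            return None  # respect a timeout set explicitly on the request
        netloc = urlparse(request.url).netloc
        request.meta['download_timeout'] = self.DOMAIN_TIMEOUTS.get(
            netloc, self.DEFAULT_TIMEOUT
        )
        return None

# settings.py
# DOWNLOADER_MIDDLEWARES = {
#     'myproject.middlewares.PerDomainTimeoutMiddleware': 300,
# }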

Related Timeout Settings

DOWNLOAD_DELAY

Controls the delay between requests to the same website:

# settings.py
DOWNLOAD_DELAY = 1  # 1 second delay between requests
RANDOMIZE_DOWNLOAD_DELAY = True  # Randomize each delay to 0.5x-1.5x of DOWNLOAD_DELAY

AutoThrottle Extension

Automatically adjusts delays based on response times:

# settings.py
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_MAX_DELAY = 60
AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0
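
If you want to see which delay AutoThrottle chooses for each response, the AUTOTHROTTLE_DEBUG setting logs its throttling decisions; the snippet below just adds that flag to the configuration above.

# settings.py
AUTOTHROTTLE_DEBUG = True  # log throttling stats for every received response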

Best Practices

  1. Set reasonable timeouts: Too short causes unnecessary failures; too long wastes resources
  2. Use different timeouts for different endpoints: Fast APIs vs. slow file downloads
  3. Implement proper error handling: Always use errbacks for timeout scenarios
  4. Consider retry logic: Some timeouts are temporary network issues
  5. Monitor timeout patterns: Log timeout frequencies to identify problematic sites (see the stats sketch after this list)
  6. Use AutoThrottle: Let Scrapy automatically adjust request timing

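For the monitoring point above, one lightweight option is to count timeouts per domain in Scrapy's stats collector so the totals appear in the end-of-crawl stats summary. The stat key names below are arbitrary choices, not built-in Scrapy stats.

import scrapy
from urllib.parse import urlparse
from twisted.internet.error import TimeoutError

class MonitoredSpider(scrapy.Spider):
    name = 'monitored_spider'

    def start_requests(self):
        yield scrapy.Request(
            'https://example.com/',
            callback=self.parse,
            errback=self.handle_error,
            meta={'download_timeout': 30},
        )

    def parse(self, response):
        pass

    def handle_error(self, failure):
        if failure.check(TimeoutError):
            domain = urlparse(failure.request.url).netloc
            # Custom stat keys; they show up in the final stats dump
            self.crawler.stats.inc_value('timeouts/total')
            self.crawler.stats.inc_value(f'timeouts/{domain}')
            self.logger.warning(f'Timeout: {failure.request.url}')
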
Example: Complete Spider with Timeout Handling

import scrapy
from twisted.internet.error import TimeoutError

class ProductSpider(scrapy.Spider):
    name = 'product_spider'
    download_timeout = 45  # Default timeout for this spider

    custom_settings = {
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_MAX_DELAY': 10,
    }

    def start_requests(self):
        urls = [
            'https://example-store.com/products',
            'https://slow-store.com/products',
        ]

        for url in urls:
            # Adjust timeout based on known site speed
            timeout = 120 if 'slow-store' in url else 30

            yield scrapy.Request(
                url,
                callback=self.parse,
                errback=self.handle_error,
                meta={'download_timeout': timeout}
            )

    def parse(self, response):
        # Extract product URLs
        product_urls = response.css('.product-link::attr(href)').getall()

        for url in product_urls:
            yield scrapy.Request(
                response.urljoin(url),
                callback=self.parse_product,
                errback=self.handle_error,
                meta={'download_timeout': 60}
            )

    def parse_product(self, response):
        yield {
            'name': response.css('h1::text').get(),
            'price': response.css('.price::text').get(),
            'url': response.url
        }

    def handle_error(self, failure):
        if failure.check(TimeoutError):
            self.logger.warning(f'Timeout: {failure.request.url}')
        else:
            self.logger.error(f'Error: {failure.request.url} - {failure.value}')

This comprehensive approach to handling timeouts ensures your Scrapy spiders are robust and can handle various network conditions effectively.
