How to Handle Timeouts in Scrapy
Scrapy provides several mechanisms to handle timeouts effectively. Understanding these options helps you build robust spiders that can handle slow or unresponsive websites gracefully.
The DOWNLOAD_TIMEOUT Setting
The primary timeout control in Scrapy is the DOWNLOAD_TIMEOUT setting, which defines how long Scrapy waits for a response before considering the request failed. The default value is 180 seconds (3 minutes).
Global Timeout Configuration
Set the timeout globally in your project's settings.py:
# settings.py
DOWNLOAD_TIMEOUT = 60  # 60-second timeout for all requests
Per-Spider Timeout Configuration
Override timeout for specific spiders:
import scrapy

class MySpider(scrapy.Spider):
    name = 'my_spider'
    download_timeout = 30  # 30 seconds for this spider only

    def start_requests(self):
        urls = ['https://example.com/page1', 'https://example.com/page2']
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # Parse the response here
        pass
Per-Request Timeout Configuration
Set the timeout for individual requests using the meta parameter:
import scrapy

class FlexibleSpider(scrapy.Spider):
    name = 'flexible_spider'

    def start_requests(self):
        # Fast endpoint - short timeout
        yield scrapy.Request(
            'https://fast-api.example.com/data',
            callback=self.parse,
            meta={'download_timeout': 10}
        )
        # Slow endpoint - longer timeout
        yield scrapy.Request(
            'https://slow-api.example.com/data',
            callback=self.parse,
            meta={'download_timeout': 120}
        )

    def parse(self, response):
        # Extract data from both fast and slow endpoints
        pass
Handling Timeout Errors
When a timeout occurs, Scrapy calls your request's errback with a Failure wrapping a twisted.internet.error.TimeoutError exception.
Basic Error Handling
import scrapy
from twisted.internet.error import TimeoutError

class RobustSpider(scrapy.Spider):
    name = 'robust_spider'

    def start_requests(self):
        urls = ['https://example.com/page1', 'https://example.com/page2']
        for url in urls:
            yield scrapy.Request(
                url,
                callback=self.parse,
                errback=self.handle_error,
                meta={'download_timeout': 30}
            )

    def parse(self, response):
        # Extract data from successful responses
        for item in response.css('.item'):
            yield {
                'title': item.css('.title::text').get(),
                'url': response.url
            }

    def handle_error(self, failure):
        request = failure.request
        if failure.check(TimeoutError):
            self.logger.warning(f'Timeout error for {request.url}')
            # Optionally retry once with a longer timeout; the meta flag
            # prevents retrying the same URL forever
            if not request.meta.get('timeout_retried'):
                yield scrapy.Request(
                    request.url,
                    callback=self.parse,
                    errback=self.handle_error,
                    meta={'download_timeout': 60, 'timeout_retried': True},
                    dont_filter=True
                )
        else:
            self.logger.error(f'Other error for {request.url}: {failure.value}')
Advanced Error Handling with Retry Logic
import scrapy
from twisted.internet.error import TimeoutError

class RetrySpider(scrapy.Spider):
    name = 'retry_spider'

    def start_requests(self):
        urls = ['https://example.com/api/data']
        for url in urls:
            yield scrapy.Request(
                url,
                callback=self.parse,
                errback=self.handle_error,
                meta={
                    'download_timeout': 30,
                    'retry_count': 0,
                    'max_retries': 3
                }
            )

    def parse(self, response):
        # Process the successful response
        data = response.json()
        for item in data.get('items', []):
            yield item

    def handle_error(self, failure):
        request = failure.request
        retry_count = request.meta.get('retry_count', 0)
        max_retries = request.meta.get('max_retries', 3)
        if failure.check(TimeoutError) and retry_count < max_retries:
            self.logger.warning(
                f'Timeout for {request.url} (attempt {retry_count + 1})'
            )
            # Increase the timeout for each retry
            new_timeout = 30 * (retry_count + 2)  # 60, 90, 120 seconds
            yield scrapy.Request(
                request.url,
                callback=self.parse,
                errback=self.handle_error,
                meta={
                    'download_timeout': new_timeout,
                    'retry_count': retry_count + 1,
                    'max_retries': max_retries
                },
                dont_filter=True
            )
        else:
            self.logger.error(f'Final failure for {request.url}: {failure.value}')
Important Timeout Behavior
First Byte vs. Complete Response
The DOWNLOAD_TIMEOUT setting applies to the complete request/response cycle: the timer starts when Scrapy begins the request and covers connecting, waiting for, and downloading the full response body, not just the time to the first byte.
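If you want to see how close real responses come to that limit, Scrapy records the elapsed fetch time in the download_latency meta key. A minimal sketch (the spider name and URL are placeholders):

import scrapy

class LatencySpider(scrapy.Spider):
    # Hypothetical spider that just logs how long each fetch took
    name = 'latency_spider'
    start_urls = ['https://example.com/']
    download_timeout = 30

    def parse(self, response):
        # 'download_latency' is the number of seconds from sending the request
        # until the complete response arrived
        latency = response.meta.get('download_latency')
        self.logger.info(
            f'{response.url} fetched in {latency:.2f}s (limit: {self.download_timeout}s)'
        )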
Connection vs. Download Timeouts
Scrapy does not expose a separate connection timeout setting; DOWNLOAD_TIMEOUT covers the whole exchange, including establishing the connection. Connection-level problems surface as different exceptions, such as twisted.internet.error.TCPTimedOutError or DNSLookupError, so you can tell them apart from download timeouts in your errback. If you genuinely need an independent connect timeout, that requires a custom download handler or downloader middleware.
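As a rough illustration of how the two kinds of failures can be told apart in an errback (the method is meant to live on your spider, and the exception list is not exhaustive):

from twisted.internet.error import (
    TimeoutError, TCPTimedOutError, ConnectionRefusedError, DNSLookupError
)

def handle_error(self, failure):
    # Illustrative errback: separate download timeouts from connection-level failures
    request = failure.request
    if failure.check(TimeoutError):
        # DOWNLOAD_TIMEOUT expired before the full response arrived
        self.logger.warning(f'Download timeout: {request.url}')
    elif failure.check(TCPTimedOutError, ConnectionRefusedError, DNSLookupError):
        # The connection itself could not be established (or the name lookup failed)
        self.logger.warning(f'Connection problem: {request.url} ({failure.type.__name__})')
    else:
        self.logger.error(f'Unhandled failure: {request.url} - {failure.value}')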
Related Timeout Settings
DOWNLOAD_DELAY
Controls the delay between consecutive requests to the same website; it paces your crawl rather than limiting how long a response may take:
# settings.py
DOWNLOAD_DELAY = 1  # 1 second delay between requests
RANDOMIZE_DOWNLOAD_DELAY = True  # Randomize the delay between 0.5x and 1.5x of DOWNLOAD_DELAY
AutoThrottle Extension
Automatically adjusts delays based on response times:
# settings.py
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_MAX_DELAY = 60
AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0
Best Practices
- Set reasonable timeouts: Too short causes unnecessary failures; too long wastes resources
- Use different timeouts for different endpoints: Fast APIs vs. slow file downloads
- Implement proper error handling: Always use errbacks for timeout scenarios
- Consider retry logic: Some timeouts are transient network issues; Scrapy's built-in retry settings (sketched after this list) often cover this
- Monitor timeout patterns: Log timeout frequencies to identify problematic sites
- Use AutoThrottle: Let Scrapy automatically adjust request timing
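Note that Scrapy's built-in RetryMiddleware already retries timed-out requests (TimeoutError is among the exceptions it retries by default), so for simple cases you may not need a custom errback at all. A minimal settings sketch; the values shown are close to recent Scrapy defaults and are listed only to make them explicit:

# settings.py
RETRY_ENABLED = True   # the retry middleware is on by default
RETRY_TIMES = 2        # each failed request is retried up to 2 extra times
RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]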
Example: Complete Spider with Timeout Handling
import scrapy
from twisted.internet.error import TimeoutError

class ProductSpider(scrapy.Spider):
    name = 'product_spider'
    download_timeout = 45  # Default timeout for this spider

    custom_settings = {
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_MAX_DELAY': 10,
    }

    def start_requests(self):
        urls = [
            'https://example-store.com/products',
            'https://slow-store.com/products',
        ]
        for url in urls:
            # Adjust the timeout based on known site speed
            timeout = 120 if 'slow-store' in url else 30
            yield scrapy.Request(
                url,
                callback=self.parse,
                errback=self.handle_error,
                meta={'download_timeout': timeout}
            )

    def parse(self, response):
        # Extract product URLs
        product_urls = response.css('.product-link::attr(href)').getall()
        for url in product_urls:
            yield scrapy.Request(
                response.urljoin(url),
                callback=self.parse_product,
                errback=self.handle_error,
                meta={'download_timeout': 60}
            )

    def parse_product(self, response):
        yield {
            'name': response.css('h1::text').get(),
            'price': response.css('.price::text').get(),
            'url': response.url
        }

    def handle_error(self, failure):
        if failure.check(TimeoutError):
            self.logger.warning(f'Timeout: {failure.request.url}')
        else:
            self.logger.error(f'Error: {failure.request.url} - {failure.value}')
This comprehensive approach to handling timeouts ensures your Scrapy spiders are robust and can handle various network conditions effectively.