How do I implement custom downloader middlewares in Scrapy?
Custom downloader middlewares in Scrapy are powerful components that sit between the engine and the downloader, allowing you to process requests before they're sent and responses before they reach your spiders. They're essential for implementing features like proxy rotation, user-agent randomization, request filtering, and response processing.
Understanding Downloader Middlewares
Downloader middlewares are hooks that allow you to:
- Modify requests before they're sent to websites
- Process responses before they reach spiders
- Handle request/response errors
- Implement retry logic
- Add authentication headers
- Rotate proxies and user agents
- Filter or block requests
Basic Middleware Structure
A downloader middleware is a Python class that implements specific methods. Here's the basic structure:
```python
class CustomDownloaderMiddleware:
    def __init__(self, settings):
        self.settings = settings

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your middleware
        return cls(settings=crawler.settings)

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader middleware.
        # Return None to continue processing,
        # a Response object to skip the download and hand that response back, or
        # a Request object to stop processing and reschedule the new request.
        return None

    def process_response(self, request, response, spider):
        # Called for each response that comes back through the middleware.
        # Must return a Response object, a Request object, or raise IgnoreRequest.
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or process_request() raises an exception.
        # Return None to continue exception processing,
        # a Response object to stop the exception chain and return that response, or
        # a Request object to reschedule the request.
        return None
```
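Returning a Response from process_request is how you short-circuit the downloader entirely. As a minimal sketch (the in-memory cache dictionary here is hypothetical and unbounded, purely for illustration), a middleware could serve repeated URLs without touching the network:

```python
from scrapy.http import HtmlResponse


class InMemoryCacheMiddleware:
    """Illustrative only: serve repeated URLs from a dict instead of re-downloading."""

    def __init__(self):
        self.cache = {}  # url -> body bytes (hypothetical, unbounded)

    def process_request(self, request, spider):
        if request.url in self.cache:
            # Returning a Response here skips the download entirely; the response
            # still flows through the process_response chain of other middlewares.
            return HtmlResponse(
                url=request.url,
                body=self.cache[request.url],
                encoding='utf-8',
                request=request,
            )
        return None

    def process_response(self, request, response, spider):
        self.cache[request.url] = response.body
        return response
```

Because cached responses never hit the network, a middleware like this also makes it easy to exercise spiders offline.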
Implementing User-Agent Rotation Middleware
Here's a practical example of a middleware that rotates user agents:
```python
import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class RotateUserAgentMiddleware(UserAgentMiddleware):
    def __init__(self, user_agent=''):
        super().__init__(user_agent)
        self.user_agent_list = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0',
        ]

    @classmethod
    def from_crawler(cls, crawler):
        return cls(user_agent=crawler.settings.get('USER_AGENT'))

    def process_request(self, request, spider):
        # Pick a random user agent for every outgoing request
        user_agent = random.choice(self.user_agent_list)
        request.headers['User-Agent'] = user_agent
        spider.logger.debug(f'User-Agent set to: {user_agent}')
        return None
```
Proxy Rotation Middleware
A middleware for rotating proxies to avoid IP blocking:
```python
import random

from scrapy.exceptions import NotConfigured


class ProxyRotationMiddleware:
    def __init__(self, proxy_list):
        self.proxy_list = list(proxy_list)

    @classmethod
    def from_crawler(cls, crawler):
        proxy_list = crawler.settings.getlist('PROXY_LIST')
        if not proxy_list:
            raise NotConfigured('PROXY_LIST setting is required')
        return cls(proxy_list)

    def process_request(self, request, spider):
        proxy = random.choice(self.proxy_list)
        request.meta['proxy'] = proxy
        spider.logger.debug(f'Using proxy: {proxy}')
        return None

    def process_exception(self, request, exception, spider):
        # Remove the failed proxy and retry the request with a different one
        failed_proxy = request.meta.get('proxy')
        if failed_proxy in self.proxy_list:
            self.proxy_list.remove(failed_proxy)
            spider.logger.warning(f'Removed failed proxy: {failed_proxy}')
        if self.proxy_list:
            new_request = request.copy()
            new_request.meta['proxy'] = random.choice(self.proxy_list)
            new_request.dont_filter = True  # keep the dupefilter from dropping the retry
            return new_request
        return None
```
Request Filtering Middleware
Filter requests based on custom criteria:
```python
from urllib.parse import urlparse

from scrapy.exceptions import IgnoreRequest


class RequestFilterMiddleware:
    def __init__(self, forbidden_extensions=None, allowed_domains=None):
        self.forbidden_extensions = forbidden_extensions or []
        self.allowed_domains = allowed_domains or []

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            forbidden_extensions=crawler.settings.getlist('FORBIDDEN_EXTENSIONS'),
            allowed_domains=crawler.settings.getlist('ALLOWED_DOMAINS'),
        )

    def process_request(self, request, spider):
        # Filter by file extension
        for ext in self.forbidden_extensions:
            if request.url.endswith(ext):
                spider.logger.debug(f'Ignoring request to {request.url} (forbidden extension)')
                raise IgnoreRequest(f'Forbidden extension: {ext}')

        # Filter by domain (match against the hostname, not the whole URL)
        if self.allowed_domains:
            hostname = urlparse(request.url).hostname or ''
            allowed = any(hostname == d or hostname.endswith('.' + d)
                          for d in self.allowed_domains)
            if not allowed:
                spider.logger.debug(f'Ignoring request to {request.url} (domain not allowed)')
                raise IgnoreRequest('Domain not in allowed list')

        return None
```
Custom Authentication Middleware
Add authentication headers to requests:
```python
import base64


class AuthenticationMiddleware:
    def __init__(self, username=None, password=None, api_key=None):
        self.username = username
        self.password = password
        self.api_key = api_key

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            username=crawler.settings.get('AUTH_USERNAME'),
            password=crawler.settings.get('AUTH_PASSWORD'),
            api_key=crawler.settings.get('API_KEY'),
        )

    def process_request(self, request, spider):
        # Basic authentication
        if self.username and self.password:
            credentials = f'{self.username}:{self.password}'
            encoded_credentials = base64.b64encode(credentials.encode()).decode()
            request.headers['Authorization'] = f'Basic {encoded_credentials}'
        # API key authentication
        elif self.api_key:
            request.headers['X-API-Key'] = self.api_key
        return None
```
Response Processing Middleware
Process responses before they reach spiders:
```python
import gzip
import json

from scrapy.http import HtmlResponse


class ResponseProcessingMiddleware:
    def process_response(self, request, response, spider):
        # Handle compressed responses (Scrapy's HttpCompressionMiddleware normally
        # does this already; this branch only matters if that middleware is disabled)
        if response.headers.get('Content-Encoding') == b'gzip':
            try:
                body = gzip.decompress(response.body)
                response = response.replace(body=body)
            except Exception as e:
                spider.logger.error(f'Failed to decompress gzip response: {e}')

        # Convert JSON responses to HtmlResponse for easier parsing in spiders
        content_type = response.headers.get('Content-Type', b'').decode()
        if 'application/json' in content_type:
            try:
                data = json.loads(response.text)
                # Wrap the pretty-printed JSON in HTML so selectors can be used on it
                html_body = f'<html><body><pre>{json.dumps(data, indent=2)}</pre></body></html>'
                response = HtmlResponse(
                    url=response.url,
                    body=html_body.encode(),
                    encoding='utf-8',
                    request=request,
                )
            except Exception as e:
                spider.logger.error(f'Failed to process JSON response: {e}')

        return response
```
Configuring Middlewares in settings.py
Add your custom middlewares to the settings.py file:
```python
# settings.py

DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RotateUserAgentMiddleware': 400,
    'myproject.middlewares.ProxyRotationMiddleware': 410,
    'myproject.middlewares.RequestFilterMiddleware': 420,
    'myproject.middlewares.AuthenticationMiddleware': 430,
    'myproject.middlewares.ResponseProcessingMiddleware': 440,
    # Disable default middlewares if needed
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}

# Custom settings used by the middlewares above
PROXY_LIST = [
    'http://proxy1.example.com:8080',
    'http://proxy2.example.com:8080',
    'http://proxy3.example.com:8080',
]

FORBIDDEN_EXTENSIONS = ['.pdf', '.zip', '.exe', '.dmg']
ALLOWED_DOMAINS = ['example.com', 'api.example.com']

AUTH_USERNAME = 'your_username'
AUTH_PASSWORD = 'your_password'
API_KEY = 'your_api_key'
```
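If some of these settings should only apply to particular spiders, Scrapy's custom_settings class attribute lets you override them per spider instead of project-wide. A brief sketch, where the spider name, URL, and key are placeholders:

```python
import scrapy


class ApiSpider(scrapy.Spider):
    name = 'api_spider'
    # These settings take precedence over settings.py for this spider only
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'myproject.middlewares.AuthenticationMiddleware': 430,
        },
        'API_KEY': 'spider_specific_key',
    }

    def start_requests(self):
        yield scrapy.Request('https://api.example.com/items', callback=self.parse)

    def parse(self, response):
        yield {'body_length': len(response.body)}
```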
Advanced Middleware Patterns
Conditional Middleware Activation
```python
class ConditionalMiddleware:
    def __init__(self, enabled_spiders=None):
        self.enabled_spiders = enabled_spiders or []

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            enabled_spiders=crawler.settings.getlist('CONDITIONAL_MIDDLEWARE_SPIDERS')
        )

    def process_request(self, request, spider):
        if spider.name not in self.enabled_spiders:
            return None
        # Apply middleware logic only for the specified spiders
        # ... middleware logic here
        return None
```
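The matching setting could then look like this in settings.py (the spider names are placeholders):

```python
# settings.py
CONDITIONAL_MIDDLEWARE_SPIDERS = ['products_spider', 'prices_spider']
```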
Middleware with Statistics
```python
class StatisticsMiddleware:
    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        # crawler.stats is the shared StatsCollector instance
        return cls(crawler.stats)

    def process_request(self, request, spider):
        self.stats.inc_value('custom_middleware/requests_processed')
        return None

    def process_response(self, request, response, spider):
        self.stats.inc_value(f'custom_middleware/responses_{response.status}')
        return response
```
Testing Custom Middlewares
Create unit tests for your middlewares:
```python
import unittest

from scrapy.http import Request
from scrapy.spiders import Spider
from scrapy.utils.test import get_crawler

from myproject.middlewares import RotateUserAgentMiddleware


class TestRotateUserAgentMiddleware(unittest.TestCase):
    def setUp(self):
        crawler = get_crawler()
        self.middleware = RotateUserAgentMiddleware.from_crawler(crawler)
        self.spider = Spider(name='test')

    def test_user_agent_rotation(self):
        request = Request('http://example.com')

        # Process the request through the middleware
        result = self.middleware.process_request(request, self.spider)

        # The middleware should return None and set a User-Agent from its list
        self.assertIsNone(result)
        self.assertIn('User-Agent', request.headers)
        self.assertIn(request.headers['User-Agent'].decode(),
                      self.middleware.user_agent_list)


if __name__ == '__main__':
    unittest.main()
```
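The same approach works for error-handling paths. Here is a sketch of a test for the proxy middleware shown earlier, assuming it lives in myproject.middlewares and reads PROXY_LIST from the crawler settings:

```python
import unittest

from scrapy.http import Request
from scrapy.spiders import Spider
from scrapy.utils.test import get_crawler

from myproject.middlewares import ProxyRotationMiddleware


class TestProxyRotationMiddleware(unittest.TestCase):
    def setUp(self):
        crawler = get_crawler(settings_dict={
            'PROXY_LIST': ['http://proxy1:8080', 'http://proxy2:8080'],
        })
        self.middleware = ProxyRotationMiddleware.from_crawler(crawler)
        self.spider = Spider(name='test')

    def test_failed_proxy_is_removed_and_request_retried(self):
        request = Request('http://example.com', meta={'proxy': 'http://proxy1:8080'})

        retried = self.middleware.process_exception(
            request, ConnectionError('connection refused'), self.spider)

        # The failed proxy is dropped and the retried request uses the remaining one
        self.assertNotIn('http://proxy1:8080', self.middleware.proxy_list)
        self.assertEqual(retried.meta['proxy'], 'http://proxy2:8080')
```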
Using Middlewares with JavaScript Rendering
When scraping JavaScript-heavy sites, you can combine custom middlewares with the scrapy-selenium package, which renders pages through Selenium WebDriver. The example below assumes scrapy-selenium's own middleware is installed and configured:
```python
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC


class JavaScriptMiddleware:
    def process_request(self, request, spider):
        # Only convert plain requests that ask for rendering via meta;
        # SeleniumRequest instances pass through untouched to avoid a loop
        selector = request.meta.get('wait_for_selector')
        if selector and not isinstance(request, SeleniumRequest):
            return SeleniumRequest(
                url=request.url,
                callback=request.callback,
                wait_time=10,
                wait_until=EC.presence_of_element_located((By.CSS_SELECTOR, selector)),
                screenshot=True,
                dont_filter=True,  # the original URL was already seen by the dupefilter
            )
        return None
```
Best Practices
- Error Handling: Always implement proper error handling in your middlewares (a combined example follows this list)
- Logging: Use appropriate log levels for debugging and monitoring
- Performance: Avoid heavy computations in middleware methods
- Configuration: Make middlewares configurable through settings
- Testing: Write comprehensive tests for your custom middlewares
- Documentation: Document your middleware's purpose and configuration options
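To tie several of these points together, here is a minimal sketch of a middleware that is configurable through settings, logs at appropriate levels, and guards its work with error handling. The TIMING_SLOW_THRESHOLD setting and the stat names are made up for illustration:

```python
import time

from scrapy.exceptions import NotConfigured


class RequestTimingMiddleware:
    """Sketch: records download time per request and warns about slow responses."""

    def __init__(self, slow_threshold, stats):
        self.slow_threshold = slow_threshold
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        # Configuration: read the threshold from settings and disable cleanly if unset
        threshold = crawler.settings.getfloat('TIMING_SLOW_THRESHOLD', 0)
        if not threshold:
            raise NotConfigured('TIMING_SLOW_THRESHOLD is not set')
        return cls(threshold, crawler.stats)

    def process_request(self, request, spider):
        request.meta['timing_start'] = time.time()
        return None

    def process_response(self, request, response, spider):
        try:
            elapsed = time.time() - request.meta['timing_start']
        except KeyError:
            # Error handling: the request may not have passed through process_request
            spider.logger.debug('No timing data for %s', request.url)
            return response
        self.stats.max_value('timing/slowest_response', elapsed)
        if elapsed > self.slow_threshold:
            # Logging: warnings for actionable events, debug for routine detail
            spider.logger.warning('Slow response (%.2fs) from %s', elapsed, request.url)
        return response
```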
Custom downloader middlewares provide powerful capabilities for request and response processing in Scrapy. While Scrapy offers robust built-in functionality, implementing custom retry logic or rate limiting through middlewares gives you fine-grained control over your scraping pipeline.
By following these patterns and best practices, you can create robust, maintainable middlewares that enhance your Scrapy projects' capabilities and reliability.