Scrapy provides comprehensive cookie support through Python's http.cookiejar
module, automatically managing cookies for session handling and authentication. This guide covers both automatic and manual cookie management approaches.
Default Cookie Handling
Scrapy's built-in CookiesMiddleware handles cookies automatically by default: it stores cookies set by responses and sends them with subsequent requests within the same spider.
# settings.py - Default configuration
COOKIES_ENABLED = True # Enabled by default
COOKIES_DEBUG = False # Set to True for cookie debugging
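Both settings can also be overridden for a single spider through the standard custom_settings class attribute, for example:

import scrapy

class DebugCookieSpider(scrapy.Spider):
    name = 'debug_cookie_spider'
    # Per-spider override of the project-wide defaults
    custom_settings = {
        'COOKIES_ENABLED': True,
        'COOKIES_DEBUG': True,  # debug cookies for this spider only
    }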
Basic Spider with Automatic Cookies
import scrapy

class LoginSpider(scrapy.Spider):
    name = 'login_spider'
    start_urls = ['https://example.com/login']

    def parse(self, response):
        # Cookies from this response are automatically stored
        return scrapy.FormRequest.from_response(
            response,
            formdata={'username': 'user', 'password': 'pass'},
            callback=self.parse_dashboard
        )

    def parse_dashboard(self, response):
        # Authentication cookies are automatically sent
        yield response.follow('/protected-page', callback=self.parse_protected)

    def parse_protected(self, response):
        yield {'url': response.url, 'title': response.css('title::text').get()}
Manual Cookie Management
For precise control over cookies, you can manage them manually by passing cookies directly to requests.
Sending Custom Cookies
import scrapy

class CustomCookieSpider(scrapy.Spider):
    name = 'custom_cookie_spider'

    def start_requests(self):
        cookies = {
            'sessionid': 'abc123def456',
            'csrftoken': 'xyz789',
            # Note: cookie values must not contain ';' (the attribute
            # separator), so encode multi-part preferences differently
            'user_preferences': 'theme=dark&lang=en'
        }
        yield scrapy.Request(
            url='https://example.com/api/data',
            cookies=cookies,
            callback=self.parse
        )

    def parse(self, response):
        # Process the JSON response returned for the custom cookies
        yield {'data': response.json()}
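Scrapy's Request also accepts cookies as a list of dicts, which lets you set the domain and path attributes of each cookie explicitly:

yield scrapy.Request(
    url='https://example.com/api/data',
    cookies=[
        {
            'name': 'sessionid',
            'value': 'abc123def456',
            'domain': 'example.com',
            'path': '/',
        },
    ],
    callback=self.parse,
)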
Disabling Automatic Cookie Handling
# settings.py - Disable automatic cookies
COOKIES_ENABLED = False

# Or disable per request
def start_requests(self):
    yield scrapy.Request(
        url='https://example.com',
        # dont_merge_cookies makes CookiesMiddleware skip this request
        # entirely, so a cookies= argument would be ignored here; send
        # manual cookies through the Cookie header instead
        meta={'dont_merge_cookies': True},
        headers={'Cookie': 'custom=value'},
        callback=self.parse
    )
Accessing and Processing Response Cookies
Extract cookies from server responses for inspection or manual handling.
Reading Set-Cookie Headers
def parse(self, response):
    # Get all Set-Cookie headers (header values are bytes)
    set_cookies = response.headers.getlist('Set-Cookie')
    for cookie_bytes in set_cookies:
        cookie_str = cookie_bytes.decode('utf-8')
        self.logger.info(f'Received cookie: {cookie_str}')
        # Naive parsing of a single cookie value
        if 'sessionid=' in cookie_str:
            session_id = cookie_str.split('sessionid=')[1].split(';')[0]
            self.logger.info(f'Session ID: {session_id}')
Using Cookie Parsing Libraries
from http.cookies import SimpleCookie

def parse(self, response):
    for cookie_header in response.headers.getlist('Set-Cookie'):
        cookie = SimpleCookie()
        cookie.load(cookie_header.decode('utf-8'))
        for key, morsel in cookie.items():
            yield {
                'name': key,
                'value': morsel.value,
                # Morsel stores attributes as dict entries; unset
                # attributes come back as empty strings
                'domain': morsel['domain'],
                'path': morsel['path'],
                'expires': morsel['expires'],
                'secure': morsel['secure'],
                'httponly': morsel['httponly']
            }
Advanced Cookie Scenarios
Login Flow with Cookie Persistence
import scrapy

class LoginFlowSpider(scrapy.Spider):
    name = 'login_flow'
    start_urls = ['https://example.com/login']

    def parse(self, response):
        # Extract the CSRF token from the login form
        csrf_token = response.css('input[name="csrfmiddlewaretoken"]::attr(value)').get()
        return scrapy.FormRequest(
            url='https://example.com/login',
            formdata={
                'username': 'your_username',
                'password': 'your_password',
                'csrfmiddlewaretoken': csrf_token
            },
            callback=self.after_login
        )

    def after_login(self, response):
        # Check whether the login succeeded
        if "Welcome" in response.text:
            # Cookies are automatically handled for subsequent requests
            yield response.follow('/dashboard', callback=self.parse_dashboard)
        else:
            self.logger.error("Login failed")

    def parse_dashboard(self, response):
        # Session cookies are automatically sent
        for link in response.css('a.data-link'):
            yield response.follow(link, callback=self.parse_data)

    def parse_data(self, response):
        yield {'url': response.url}
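The built-in middleware can also keep several independent sessions within one spider through the documented cookiejar request meta key. A minimal sketch (the URLs are placeholders):

import scrapy

class MultiSessionSpider(scrapy.Spider):
    name = 'multi_session'

    def start_requests(self):
        # Each distinct 'cookiejar' meta value gets its own cookie jar
        for session_id in range(3):
            yield scrapy.Request(
                'https://example.com/login',
                meta={'cookiejar': session_id},
                callback=self.parse,
                dont_filter=True,
            )

    def parse(self, response):
        # The cookiejar key is not propagated automatically; pass it
        # along explicitly on every follow-up request
        yield response.follow(
            '/dashboard',
            meta={'cookiejar': response.meta['cookiejar']},
            callback=self.parse_dashboard,
        )

    def parse_dashboard(self, response):
        yield {'session': response.meta['cookiejar'], 'url': response.url}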
Custom Cookie Middleware
# middlewares.py
class CustomCookieMiddleware:
    def process_request(self, request, spider):
        # Merge spider-defined cookies into the request before the
        # built-in CookiesMiddleware processes it
        if hasattr(spider, 'custom_cookies'):
            request.cookies.update(spider.custom_cookies)
        return None

    def process_response(self, request, response, spider):
        # Log when a response sets cookies
        if response.headers.get('Set-Cookie'):
            spider.logger.info(f'Cookies received from {request.url}')
        return response

# settings.py
DOWNLOADER_MIDDLEWARES = {
    # 650 places this before the built-in CookiesMiddleware (priority 700),
    # so the merged cookies are picked up when the Cookie header is built
    'myproject.middlewares.CustomCookieMiddleware': 650,
}
Cookie Debugging and Troubleshooting
Enable Cookie Debug Logging
# settings.py
COOKIES_DEBUG = True
LOG_LEVEL = 'DEBUG'

# COOKIES_DEBUG logs:
# - the Cookie header sent with each request
# - every Set-Cookie header received in responses
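With these settings, the middleware emits DEBUG entries roughly of the following form (the exact wording varies between Scrapy versions):

DEBUG: Sending cookies to: <GET https://example.com/dashboard>
Cookie: sessionid=abc123def456

DEBUG: Received cookies from: <200 https://example.com/login>
Set-Cookie: sessionid=abc123def456; Path=/; HttpOnly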
Inspecting Cookie Jar
def parse(self, response):
    # Locate the built-in CookiesMiddleware to inspect its jars; this
    # reaches into undocumented internals and may break between versions
    cookie_mw = None
    for mw in self.crawler.engine.downloader.middleware.middlewares:
        if hasattr(mw, 'jars'):  # CookiesMiddleware keeps a dict of jars
            cookie_mw = mw
            break
    if cookie_mw:
        # Jars are keyed by the 'cookiejar' meta value (None by default)
        jar = cookie_mw.jars.get(response.meta.get('cookiejar'))
        if jar:
            for cookie in jar:
                self.logger.info(f'Cookie in jar: {cookie.name}={cookie.value}')
Best Practices
- Use automatic cookie handling for most scenarios - it's reliable and handles edge cases
- Enable cookie debugging during development to understand cookie flow
- Respect cookie policies - check robots.txt and terms of service
- Handle cookie expiration gracefully in long-running spiders (see the sketch after this list)
- Use session cookies for authentication flows
- Store sensitive cookies securely - avoid logging authentication tokens
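For the expiration point above, a minimal sketch, assuming you have obtained a jar as shown in Inspecting Cookie Jar (expired_cookies is a hypothetical helper, not a Scrapy API):

import time

def expired_cookies(jar):
    # Entries in an http.cookiejar-based jar are Cookie objects with
    # an is_expired() method; collect the stale ones so the spider
    # can decide whether to re-run its login flow
    now = time.time()
    return [cookie for cookie in jar if cookie.is_expired(now)]

A long-running spider could call this periodically and trigger its login flow again once the session cookie turns up expired.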
Common Pitfalls
- Don't mix automatic and manual cookies without understanding how CookiesMiddleware merges them
- Remember cookie scope - cookies are isolated per spider, and all of a spider's requests share one jar unless you use the cookiejar meta key
- Handle cookie encoding properly when parsing Set-Cookie headers - header values arrive as bytes
- Be aware of cookie limits - browsers and servers typically cap the number and size of cookies per domain
Cookie handling in Scrapy is powerful and flexible. Start with the automatic approach and move to manual handling only when you need precise control over cookie behavior.