Scrapy automatically handles cookies and sessions, but you can customize this behavior for complex authentication flows, multiple sessions, or precise cookie control. This guide covers all the techniques you need.
Default Behavior
By default, Scrapy:
- Uses a single cookie jar for all requests in a spider
- Automatically stores and sends cookies between requests
- Maintains session state throughout the crawling process
Cookie Jar Management
Multiple Cookie Jars
Use different cookie jars to simulate multiple browser sessions:
import scrapy

class MultiSessionSpider(scrapy.Spider):
    name = 'multi_session'

    def start_requests(self):
        # User 1 session
        yield scrapy.Request(
            'https://example.com/login',
            meta={'cookiejar': 'user1'},
            callback=self.login_user1
        )
        # User 2 session
        yield scrapy.Request(
            'https://example.com/login',
            meta={'cookiejar': 'user2'},
            callback=self.login_user2
        )

    def login_user1(self, response):
        return scrapy.FormRequest.from_response(
            response,
            formdata={'username': 'user1', 'password': 'pass1'},
            meta={'cookiejar': 'user1'},
            callback=self.scrape_user1_data
        )

    def scrape_user1_data(self, response):
        # This request uses user1's session cookies
        yield scrapy.Request(
            'https://example.com/dashboard',
            meta={'cookiejar': 'user1'},
            callback=self.parse_dashboard
        )

    # login_user2, scrape_user2_data and parse_dashboard follow the same
    # pattern, passing meta={'cookiejar': 'user2'} for the second session
Cookie Jar Isolation
Each cookie jar maintains separate cookie storage:
# These requests use completely separate cookie sessions
yield scrapy.Request(url, meta={'cookiejar': 1}) # Session 1
yield scrapy.Request(url, meta={'cookiejar': 2}) # Session 2
yield scrapy.Request(url, meta={'cookiejar': 'admin'}) # Named session
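Building on this, you can open one jar per account by looping over a list of credentials. Below is a minimal sketch under assumed details: hypothetical user1/user2 credentials and a login form at /login.

import scrapy

class MultiAccountSpider(scrapy.Spider):
    name = 'multi_account'
    # Hypothetical credentials, for illustration only
    accounts = [
        {'username': 'user1', 'password': 'pass1'},
        {'username': 'user2', 'password': 'pass2'},
    ]

    def start_requests(self):
        # One cookie jar per account keeps every session isolated
        for account in self.accounts:
            yield scrapy.Request(
                'https://example.com/login',
                meta={'cookiejar': account['username'], 'account': account},
                callback=self.login,
                dont_filter=True,  # the login URL is requested once per account
            )

    def login(self, response):
        account = response.meta['account']
        return scrapy.FormRequest.from_response(
            response,
            formdata={'username': account['username'],
                      'password': account['password']},
            meta={'cookiejar': response.meta['cookiejar']},
            callback=self.parse_dashboard,
        )

    def parse_dashboard(self, response):
        # This callback only ever sees its own account's session cookies
        self.logger.info('Logged in with cookie jar %s', response.meta['cookiejar'])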
Manual Cookie Control
Setting Custom Cookies
Pass cookies directly to requests:
def start_requests(self):
    cookies = {
        'sessionid': 'abc123',
        'csrftoken': 'xyz789',
        'user_pref': 'dark_mode'
    }
    yield scrapy.Request(
        'https://example.com/protected',
        cookies=cookies,
        callback=self.parse
    )
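Scrapy also accepts cookies as a list of dicts, which lets you pin the domain and path explicitly instead of relying on the request URL. A short sketch:

def start_requests(self):
    # The list-of-dicts form sets each cookie's domain and path explicitly
    yield scrapy.Request(
        'https://example.com/protected',
        cookies=[
            {'name': 'sessionid', 'value': 'abc123',
             'domain': 'example.com', 'path': '/'},
        ],
        callback=self.parse,
    )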
Disabling Cookie Merging
Set meta['dont_merge_cookies'] to stop Scrapy from merging the session's stored cookies into a request, and from storing the response's cookies back into the jar:
def make_request(self):
    return scrapy.Request(
        'https://example.com/api',
        cookies={'api_key': 'secret'},
        meta={'dont_merge_cookies': True},
        callback=self.parse_api_response
    )
Disabling Cookies Entirely
Turn off cookies for specific requests:
yield scrapy.Request(
    'https://example.com/static-content',
    meta={'dont_merge_cookies': True},
    # No cookies parameter = no cookies sent
    callback=self.parse
)
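If an entire spider never needs cookies, you can also switch off the cookies middleware for that spider alone via custom_settings instead of annotating each request. A minimal sketch:

import scrapy

class NoCookieSpider(scrapy.Spider):
    name = 'no_cookies'
    # Disables the cookies middleware for this spider only
    custom_settings = {'COOKIES_ENABLED': False}

    def start_requests(self):
        yield scrapy.Request('https://example.com/static-content', callback=self.parse)

    def parse(self, response):
        # No Cookie headers are sent and Set-Cookie responses are not stored
        pass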
Login and Authentication
Complete Login Flow Example
import scrapy

class LoginSpider(scrapy.Spider):
    name = 'login_spider'

    def start_requests(self):
        return [scrapy.Request(
            'https://example.com/login',
            callback=self.login
        )]

    def login(self, response):
        # Extract CSRF token or other hidden fields
        csrf_token = response.css('input[name="csrf_token"]::attr(value)').get()
        return scrapy.FormRequest.from_response(
            response,
            formdata={
                'username': 'your_username',
                'password': 'your_password',
                'csrf_token': csrf_token
            },
            callback=self.after_login
        )

    def after_login(self, response):
        if "dashboard" in response.url:
            # Login successful, continue scraping
            yield scrapy.Request(
                'https://example.com/protected-data',
                callback=self.parse_data
            )
        else:
            self.logger.error("Login failed")

    def parse_data(self, response):
        # Session cookies are automatically maintained
        for item in response.css('.data-item'):
            yield {
                'title': item.css('.title::text').get(),
                'value': item.css('.value::text').get()
            }
Maintaining Session Across Requests
class SessionSpider(scrapy.Spider):
    name = 'session_spider'

    def start_requests(self):
        # All requests will share the same cookie jar by default
        yield scrapy.Request('https://example.com/step1', callback=self.step1)

    def step1(self, response):
        # Cookies from the step1 response are automatically stored
        yield scrapy.Request('https://example.com/step2', callback=self.step2)

    def step2(self, response):
        # Cookies from step1 are automatically sent with this request
        yield scrapy.Request('https://example.com/step3', callback=self.step3)
Advanced Techniques
Custom Cookie Middleware
Create custom middleware for complex cookie handling:
# middlewares.py
class CustomCookieMiddleware:
    def process_request(self, request, spider):
        # Add custom logic for cookie handling
        if 'special' in request.url:
            request.cookies['special_token'] = 'value'
        return None

# settings.py
# Priority 543 runs before the built-in CookiesMiddleware (700),
# so the added cookie is picked up and sent
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.CustomCookieMiddleware': 543,
}
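One concrete use for a middleware like this is to give every domain its own cookie jar so sessions never leak across sites. A sketch, assuming a project module named myproject; the priority must stay below 700 so it runs before the built-in CookiesMiddleware:

# middlewares.py
from urllib.parse import urlparse

class PerDomainCookieJarMiddleware:
    def process_request(self, request, spider):
        # Give each domain its own jar unless the spider chose one explicitly
        request.meta.setdefault('cookiejar', urlparse(request.url).netloc)
        return None

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.PerDomainCookieJarMiddleware': 650,
}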
Cookie Persistence
Save cookies to disk for reuse across spider runs:
import os
import pickle

import scrapy

class PersistentCookieSpider(scrapy.Spider):
    name = 'persistent_cookies'

    def start_requests(self):
        # Load saved cookies
        cookies = self.load_cookies()
        yield scrapy.Request(
            'https://example.com',
            cookies=cookies,
            callback=self.parse
        )

    def load_cookies(self):
        cookie_file = 'cookies.pkl'
        if os.path.exists(cookie_file):
            with open(cookie_file, 'rb') as f:
                return pickle.load(f)
        return {}

    def save_cookies(self, cookies):
        with open('cookies.pkl', 'wb') as f:
            pickle.dump(cookies, f)
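Note that save_cookies above is never called. One simple way to capture cookies for the next run is to read the Set-Cookie headers in the parse callback that start_requests references. A rough sketch of that method, which deliberately ignores cookie attributes such as domain, path, and expiry:

def parse(self, response):
    # Capture the cookies this response set so the next run can reuse them
    cookies = self.load_cookies()
    for header in response.headers.getlist('Set-Cookie'):
        name, _, rest = header.decode('utf-8').partition('=')
        cookies[name.strip()] = rest.split(';', 1)[0]
    self.save_cookies(cookies)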
Settings Configuration
Configure cookie behavior in settings.py:
# Disable cookies globally
COOKIES_ENABLED = False

# Enable cookie debugging
COOKIES_DEBUG = True

# Explicitly declare the built-in cookies middleware (700 is its default priority)
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
}
Troubleshooting
Debug Cookie Issues
Enable cookie debugging to see what's happening:
# In settings.py
COOKIES_DEBUG = True
LOG_LEVEL = 'DEBUG'
Common Pitfalls
- Forgetting cookiejar consistency: Always use the same cookiejar identifier for related requests
- CSRF token handling: Extract and include CSRF tokens in form submissions
- Cookie expiration: Some sites expire cookies quickly; detect the expired session and re-authenticate (a sketch follows this list)
- Domain restrictions: Cookies are domain-specific; ensure URLs match
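For the expiration case, one common pattern is to detect the bounce back to the login page and re-run the login flow before retrying the original URL. A hedged sketch of such callbacks, assuming the site redirects expired sessions to a URL containing /login and reusing the placeholder credentials from the login example:

def request_protected(self, url):
    # Remember which page we actually wanted in case we get bounced to the login form
    return scrapy.Request(url, callback=self.parse_protected,
                          meta={'original_url': url}, dont_filter=True)

def parse_protected(self, response):
    if '/login' in response.url:
        # Session expired: log in again, then retry the original page
        yield scrapy.FormRequest.from_response(
            response,
            formdata={'username': 'your_username', 'password': 'your_password'},
            callback=self.after_relogin,
            meta={'original_url': response.meta['original_url']},
        )
        return
    # ... parse the protected page as usual ...

def after_relogin(self, response):
    yield self.request_protected(response.meta['original_url'])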
This comprehensive approach to session and cookie management will handle most authentication scenarios in Scrapy, from simple logins to complex multi-user sessions.