How do I handle cookies in Scrapy?

Scrapy provides comprehensive cookie support through Python's http.cookiejar module, automatically managing cookies for session handling and authentication. This guide covers both automatic and manual cookie management approaches.

Default Cookie Handling

Scrapy's built-in cookie middleware automatically handles cookies by default. The CookiesMiddleware manages cookie storage and transmission across requests within the same spider.

# settings.py - Default configuration
COOKIES_ENABLED = True  # Enabled by default
COOKIES_DEBUG = False   # Set to True for cookie debugging

Basic Spider with Automatic Cookies

import scrapy

class LoginSpider(scrapy.Spider):
    name = 'login_spider'
    start_urls = ['https://example.com/login']

    def parse(self, response):
        # Cookies from this response are automatically stored
        return scrapy.FormRequest.from_response(
            response,
            formdata={'username': 'user', 'password': 'pass'},
            callback=self.parse_dashboard
        )

    def parse_dashboard(self, response):
        # Authentication cookies are automatically sent
        yield response.follow('/protected-page', callback=self.parse_protected)

    def parse_protected(self, response):
        # Still authenticated - the session cookie travels with every request
        yield {'title': response.css('title::text').get()}

Manual Cookie Management

For precise control over cookies, you can manage them manually by passing cookies directly to requests.

Sending Custom Cookies

import scrapy

class CustomCookieSpider(scrapy.Spider):
    name = 'custom_cookie_spider'

    def start_requests(self):
        cookies = {
            'sessionid': 'abc123def456',
            'csrftoken': 'xyz789',
            'user_preferences': 'theme-dark'  # avoid ';' in values - it is the cookie delimiter
        }

        yield scrapy.Request(
            url='https://example.com/api/data',
            cookies=cookies,
            callback=self.parse
        )

    def parse(self, response):
        # Process response with custom cookies
        yield {'data': response.json()}

Disabling Automatic Cookie Handling

# settings.py - Disable automatic cookies
COOKIES_ENABLED = False

# Or disable per request: with dont_merge_cookies the middleware skips
# this request entirely - stored cookies are not sent, Set-Cookie headers
# are not saved, and even the cookies= argument is ignored
def start_requests(self):
    yield scrapy.Request(
        url='https://example.com',
        meta={'dont_merge_cookies': True},
        callback=self.parse
    )
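
If you bypass the middleware this way but still need to send cookies, set the Cookie header yourself. A minimal sketch (the cookie names and values here are placeholders):

def start_requests(self):
    # The cookie middleware is skipped, so build the Cookie header by hand
    yield scrapy.Request(
        url='https://example.com',
        meta={'dont_merge_cookies': True},
        headers={'Cookie': 'sessionid=abc123def456; csrftoken=xyz789'},  # placeholder values
        callback=self.parse
    )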

Accessing and Processing Response Cookies

Extract cookies from server responses for inspection or manual handling.

Reading Set-Cookie Headers

def parse(self, response):
    # Get all Set-Cookie headers
    set_cookies = response.headers.getlist('Set-Cookie')

    for cookie_bytes in set_cookies:
        cookie_str = cookie_bytes.decode('utf-8')
        self.logger.info(f'Received cookie: {cookie_str}')

        # Parse cookie attributes
        if 'sessionid=' in cookie_str:
            session_id = cookie_str.split('sessionid=')[1].split(';')[0]
            self.logger.info(f'Session ID: {session_id}')

Using Cookie Parsing Libraries

from http.cookies import SimpleCookie

def parse(self, response):
    for cookie_header in response.headers.getlist('Set-Cookie'):
        cookie = SimpleCookie()
        cookie.load(cookie_header.decode('utf-8'))

        for key, morsel in cookie.items():
            yield {
                'name': key,
                'value': morsel.value,
                'domain': morsel.get('domain'),
                'path': morsel.get('path'),
                'expires': morsel.get('expires'),
                'secure': morsel.get('secure'),
                'httponly': morsel.get('httponly')
            }

Advanced Cookie Scenarios

Login Flow with Cookie Persistence

import scrapy

class LoginFlowSpider(scrapy.Spider):
    name = 'login_flow'
    start_urls = ['https://example.com/login']

    def parse(self, response):
        # Extract the CSRF token from the login form (the field name is site-specific)
        csrf_token = response.css('input[name="csrfmiddlewaretoken"]::attr(value)').get(default='')

        return scrapy.FormRequest(
            url='https://example.com/login',
            formdata={
                'username': 'your_username',
                'password': 'your_password',
                'csrfmiddlewaretoken': csrf_token
            },
            callback=self.after_login
        )

    def after_login(self, response):
        # Check if login was successful
        if "Welcome" in response.text:
            # Cookies are automatically handled for subsequent requests
            yield response.follow('/dashboard', callback=self.parse_dashboard)
        else:
            self.logger.error("Login failed")

    def parse_dashboard(self, response):
        # Session cookies are automatically sent
        for link in response.css('a.data-link'):
            yield response.follow(link, callback=self.parse_data)

Custom Cookie Middleware

# middlewares.py
class CustomCookieMiddleware:
    def process_request(self, request, spider):
        # Add custom headers or modify cookies
        if hasattr(spider, 'custom_cookies'):
            request.cookies.update(spider.custom_cookies)
        return None

    def process_response(self, request, response, spider):
        # Log or process cookies from response
        if response.headers.get('Set-Cookie'):
            spider.logger.info(f'Cookies received from {request.url}')
        return response

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.CustomCookieMiddleware': 650,
}

Cookie Debugging and Troubleshooting

Enable Cookie Debug Logging

# settings.py
COOKIES_DEBUG = True
LOG_LEVEL = 'DEBUG'

# With COOKIES_DEBUG enabled, Scrapy logs:
# - the Cookie header sent with each request
# - every Set-Cookie header received in responses

Inspecting Cookie Jar

def parse(self, response):
    # Caution: this pokes at undocumented Scrapy internals and may break
    # between versions - useful for debugging, not for production logic
    downloader = self.crawler.engine.downloader
    cookie_mw = None
    for mw in getattr(downloader.middleware, 'middlewares', []):
        if hasattr(mw, 'jars'):  # CookiesMiddleware keeps its cookie jars here
            cookie_mw = mw
            break

    if cookie_mw:
        # Jars are keyed by the request's 'cookiejar' meta value (None by default)
        jar = cookie_mw.jars.get(response.meta.get('cookiejar'))
        if jar:
            for cookie in jar:
                self.logger.info(f'Cookie in jar: {cookie.name}={cookie.value}')

Best Practices

  1. Use automatic cookie handling for most scenarios - it's reliable and handles edge cases
  2. Enable cookie debugging during development to understand cookie flow
  3. Respect cookie policies - check robots.txt and terms of service
  4. Handle cookie expiration gracefully in long-running spiders (see the sketch after this list)
  5. Use session cookies for authentication flows
  6. Store sensitive cookies securely - avoid logging authentication tokens
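
For point 4, a common pattern is to detect when an expired session bounces you back to the login page and re-authenticate before retrying. A minimal sketch of two methods on a spider like the login-flow example above, assuming the site redirects expired sessions to /login (the URLs, form fields, and credentials are placeholders):

def parse_data(self, response):
    # An expired session cookie often shows up as a redirect to the login page
    if response.url.endswith('/login'):
        # redirect_urls (set by the default RedirectMiddleware) holds the original URL
        original_url = response.meta.get('redirect_urls', [response.url])[0]
        yield scrapy.FormRequest(
            url='https://example.com/login',
            formdata={'username': 'your_username', 'password': 'your_password'},
            callback=self.retry_after_login,
            cb_kwargs={'original_url': original_url},
        )
    else:
        yield {'data': response.css('title::text').get()}

def retry_after_login(self, response, original_url):
    # Fresh session cookies are in the jar again; revisit the page that failed
    yield scrapy.Request(original_url, callback=self.parse_data, dont_filter=True)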

Common Pitfalls

  • Don't mix automatic and manual cookies without understanding the interaction
  • Remember cookie scope - cookies are isolated per spider; use the cookiejar meta key when you need separate sessions (see the example after this list)
  • Handle cookie encoding properly when parsing Set-Cookie headers
  • Be aware of cookie limits - browsers typically limit cookies per domain
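
On cookie scope: Scrapy supports multiple independent cookie sessions within a single spider through the cookiejar request meta key. Each key value gets its own jar, and the key is not sticky, so it must be passed along on every follow-up request:

import scrapy

class MultiSessionSpider(scrapy.Spider):
    name = 'multi_session'

    def start_requests(self):
        # Each distinct cookiejar value gets its own isolated cookie jar
        for session_id in range(3):
            yield scrapy.Request(
                'https://example.com/login',
                meta={'cookiejar': session_id},
                callback=self.parse,
                dont_filter=True,
            )

    def parse(self, response):
        # The cookiejar key is not inherited - pass it on explicitly
        yield response.follow(
            '/dashboard',
            meta={'cookiejar': response.meta['cookiejar']},
            callback=self.parse_dashboard,
        )

    def parse_dashboard(self, response):
        yield {
            'session': response.meta['cookiejar'],
            'title': response.css('title::text').get(),
        }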

Cookie handling in Scrapy is powerful and flexible. Start with the automatic approach and move to manual handling only when you need precise control over cookie behavior.
