Handling cookies properly is crucial when web scraping with HTTParty, especially for websites that require session management or authentication. This guide covers multiple approaches to cookie handling in HTTParty.
Class-Level Default Cookies
HTTParty does not maintain a persistent cookie jar across requests, but the cookies class method lets you declare default cookies that are sent with every request a class makes:
require 'httparty'

class WebScraper
  include HTTParty
  base_uri 'https://example.com'

  # Default cookies included in every request from this class
  # (the cookie names here are just examples)
  cookies(locale: 'en', consent: 'accepted')
end

# Both requests send the default cookies declared above. Set-Cookie
# headers from the responses are NOT carried over automatically; the
# sections below show how to capture and re-send them yourself.
response = WebScraper.get('/login')
profile_response = WebScraper.get('/profile')
Manual Cookie Management
Basic Cookie Extraction and Usage
require 'httparty'
# Make initial request. httpbin sets the cookie on a 302 redirect,
# so don't follow it or the final response won't carry the header.
response = HTTParty.get('https://httpbin.org/cookies/set/session/abc123',
                        follow_redirects: false)

# Extract cookies from response headers. Use get_fields: Net::HTTP
# would otherwise join repeated Set-Cookie headers into one string,
# which has no each method.
cookies = {}
(response.headers.get_fields('set-cookie') || []).each do |cookie_string|
  key, value = cookie_string.split(';').first.split('=', 2)
  cookies[key] = value if key && value
end

# Use cookies in subsequent requests
next_response = HTTParty.get('https://httpbin.org/cookies',
                             cookies: cookies)
puts next_response.body
Using HTTParty's Cookie Hash
require 'httparty'
# Create a cookie hash
cookies = HTTParty::CookieHash.new
# Method 1: Set cookies manually
cookies['session_id'] = 'abc123'
cookies['user_pref'] = 'dark_mode'
# Method 2: Add cookies from a response. get_fields returns each
# Set-Cookie header as a separate string, and skipping the redirect
# keeps httpbin's Set-Cookie header on this response.
response = HTTParty.get('https://httpbin.org/cookies/set/test/value',
                        follow_redirects: false)
(response.headers.get_fields('set-cookie') || []).each do |cookie|
  cookies.add_cookies(cookie)
end
# Use the cookies
authenticated_response = HTTParty.get('https://httpbin.org/cookies',
                                      cookies: cookies)
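If you need the raw header value instead of passing the hash through the cookies: option, CookieHash can serialize itself. A short sketch; to_cookie_string also filters out attribute keys such as path and expires that get picked up when parsing full Set-Cookie strings:
# Build a Cookie header string from the hash; attribute keys like
# path, expires, and secure are dropped automatically
header_value = cookies.to_cookie_string

HTTParty.get('https://httpbin.org/cookies',
             headers: { 'Cookie' => header_value })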
Session-Based Cookie Handling
For complex scraping scenarios requiring persistent sessions:
require 'httparty'
class SessionScraper
  include HTTParty

  def initialize(base_url)
    @base_url = base_url
    @cookies = HTTParty::CookieHash.new
  end

  def login(username, password)
    response = self.class.post("#{@base_url}/login", {
      body: { username: username, password: password },
      cookies: @cookies
    })

    # Update cookies from login response
    update_cookies(response)
    response
  end

  def get_protected_page(path)
    response = self.class.get("#{@base_url}#{path}", cookies: @cookies)
    update_cookies(response)
    response
  end

  private

  # get_fields returns each Set-Cookie header as a separate string;
  # headers['set-cookie'] would return them joined into one
  def update_cookies(response)
    (response.headers.get_fields('set-cookie') || []).each do |cookie|
      @cookies.add_cookies(cookie)
    end
  end
end
# Usage
scraper = SessionScraper.new('https://example.com')
scraper.login('user', 'pass')
profile_data = scraper.get_protected_page('/profile')
Advanced Cookie Handling with http-cookie Gem
For more sophisticated cookie management (expiry, domain and path scoping), the http-cookie gem provides a standards-compliant cookie jar:
require 'httparty'
require 'http-cookie'

# Create a cookie jar
jar = HTTP::CookieJar.new

# Make initial request (don't follow the redirect, so the Set-Cookie
# header on the 302 is still present on this response)
response = HTTParty.get('https://httpbin.org/cookies/set/advanced/example',
                        follow_redirects: false)

# Parse and store cookies, scoped to the request URI
(response.headers.get_fields('set-cookie') || []).each do |cookie_string|
  jar.parse(cookie_string, response.request.last_uri)
end

# Build a Cookie header for the target URI; the jar filters by
# domain, path, and expiry, and cookie_value joins name=value pairs
uri = URI('https://httpbin.org/cookies')
cookie_header = HTTP::Cookie.cookie_value(jar.cookies(uri))

# Use in subsequent requests
next_response = HTTParty.get('https://httpbin.org/cookies', {
  headers: { 'Cookie' => cookie_header }
})
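The jar can also be persisted between runs. A minimal sketch, assuming a writable cookies.yml path (the filename is illustrative, not a fixed convention):
require 'http-cookie'

jar = HTTP::CookieJar.new

# Reload cookies persisted by a previous run, if any
jar.load('cookies.yml') if File.exist?('cookies.yml')

# ... make requests and add cookies to the jar as shown above ...

# Persist the jar for the next run (YAML is the default format)
jar.save('cookies.yml')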
Handling Secure and HttpOnly Cookies
require 'httparty'

class SecureScraper
  include HTTParty

  # Enable SSL verification so cookies flagged Secure travel over HTTPS
  default_options.update(verify: true)

  def self.handle_secure_cookies(url)
    response = get(url, {
      # Follow redirects to handle secure cookie redirections
      follow_redirects: true,
      # Identify the client explicitly
      headers: {
        'User-Agent' => 'Mozilla/5.0 (compatible; Ruby HTTParty)'
      }
    })

    # Collect cookies flagged Secure or HttpOnly. HttpOnly only hides
    # cookies from browser JavaScript; an HTTP client like HTTParty
    # still sees them in the Set-Cookie header.
    secure_cookies = {}
    (response.headers.get_fields('set-cookie') || []).each do |cookie|
      if cookie.include?('Secure') || cookie.include?('HttpOnly')
        name, value = cookie.split('=', 2)
        secure_cookies[name] = value.split(';').first
      end
    end

    { response: response, cookies: secure_cookies }
  end
end
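Usage follows the same pattern as the other class-based examples (the URL is a placeholder):
result = SecureScraper.handle_secure_cookies('https://example.com/login')
puts result[:cookies]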
Common Cookie Scenarios
E-commerce Site Login
require 'httparty'

class EcommerceBot
  include HTTParty
  base_uri 'https://shop.example.com'

  def initialize
    @cookies = HTTParty::CookieHash.new
  end

  def login(email, password)
    # Get login form (may set CSRF/session cookies)
    login_page = self.class.get('/login', cookies: @cookies)
    update_cookies(login_page)

    # Submit login form along with the cookies it set
    login_response = self.class.post('/login', {
      body: {
        email: email,
        password: password,
        authenticity_token: extract_csrf_token(login_page.body)
      },
      cookies: @cookies
    })
    update_cookies(login_response)
    login_response.success?
  end

  def add_to_cart(product_id)
    self.class.post('/cart/add', {
      body: { product_id: product_id },
      cookies: @cookies
    })
  end

  private

  def update_cookies(response)
    (response.headers.get_fields('set-cookie') || []).each do |cookie|
      @cookies.add_cookies(cookie)
    end
  end

  def extract_csrf_token(html)
    html.match(/name="authenticity_token" value="([^"]+)"/)[1]
  rescue StandardError
    nil
  end
end
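A usage sketch; the credentials and product id are placeholders:
bot = EcommerceBot.new
bot.add_to_cart(42) if bot.login('user@example.com', 'secret')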
API with JWT Cookies
require 'httparty'

class ApiClient
  include HTTParty
  base_uri 'https://api.example.com'

  def initialize
    @cookies = HTTParty::CookieHash.new
  end

  def authenticate(api_key)
    response = self.class.post('/auth', {
      body: { api_key: api_key }.to_json,
      headers: { 'Content-Type' => 'application/json' },
      cookies: @cookies
    })

    # The JWT may come back as an HttpOnly cookie; capture it here
    update_cookies_from_response(response)
    response.success?
  end

  def make_authenticated_request(endpoint)
    self.class.get(endpoint, cookies: @cookies)
  end

  private

  def update_cookies_from_response(response)
    (response.headers.get_fields('set-cookie') || []).each do |cookie|
      @cookies.add_cookies(cookie)
    end
  end
end
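Usage, with a placeholder key and endpoint:
client = ApiClient.new
if client.authenticate('my-api-key')
  orders = client.make_authenticated_request('/orders')
  puts orders.code
end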
Best Practices
- Always respect robots.txt and rate limits when scraping
- Use appropriate delays between requests to avoid being blocked
- Handle cookie expiration by checking response codes and re-authenticating (see the sketch after this list)
- Clear sensitive cookies after scraping sessions
- Use secure connections (HTTPS) when handling authentication cookies
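As a sketch of the re-authentication point above, one approach is to retry a request once after a 401, reusing the SessionScraper class from earlier. The helper name is hypothetical, and whether your target signals an expired session with a 401 is an assumption to verify:
# Retry once on 401 by logging in again and repeating the request
def fetch_with_reauth(scraper, path, username, password)
  response = scraper.get_protected_page(path)
  if response.code == 401
    scraper.login(username, password)
    response = scraper.get_protected_page(path)
  end
  response
end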
Troubleshooting Cookie Issues
require 'httparty'
require 'json'

# Debug cookie handling: don't follow httpbin's redirect, or the
# Set-Cookie header ends up on a response you never see
response = HTTParty.get('https://httpbin.org/cookies/set/debug/true',
                        follow_redirects: false)
puts "Set-Cookie headers received:"
puts response.headers.get_fields('set-cookie')

# Rebuild a cookie hash from the response
cookies = HTTParty::CookieHash.new
(response.headers.get_fields('set-cookie') || []).each do |cookie|
  cookies.add_cookies(cookie)
end

# Check which cookies the server actually received
test_response = HTTParty.get('https://httpbin.org/cookies',
                             cookies: cookies)
puts "Cookies sent to server:"
puts JSON.parse(test_response.body)['cookies']
Remember to always comply with website terms of service and respect rate limits when web scraping. Consider using official APIs when available.