Scraping login-protected websites with Python requires handling authentication, maintaining sessions, and often dealing with security measures like CSRF tokens. This guide covers the complete process using requests and BeautifulSoup.
Important: Always respect websites' terms of service and robots.txt files. Verify that scraping is permitted before proceeding.
Prerequisites
Install the required Python libraries:
pip install requests beautifulsoup4 python-dotenv
Basic Login Scraping Workflow
1. Analyze the Login Process
Before coding, inspect the website's login mechanism:
- Open Developer Tools (F12) in your browser
- Navigate to the login page
- Open the Network tab
- Submit the login form
- Examine the POST request to identify (an illustrative payload follows this list):
  - Login URL endpoint
  - Required form fields (username, password, etc.)
  - Additional parameters (CSRF tokens, hidden fields)
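For example, the form data of a captured login POST might look like this in the Network tab. The field names and token value here are purely illustrative; every site differs:

username: alice
password: hunter2
csrf_token: 9f86d081884c7d65...
remember_me: on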
2. Simple Login Example
Here's a basic login scraper:
import requests
from bs4 import BeautifulSoup
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

class LoginScraper:
    def __init__(self):
        self.session = requests.Session()
        # Add common headers to mimic a real browser
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def login(self, login_url, username, password):
        """Perform login and return success status"""
        try:
            # Get login page to extract any hidden fields
            login_page = self.session.get(login_url)
            login_page.raise_for_status()

            # Prepare login data
            login_data = {
                'username': username,  # Adjust field names as needed
                'password': password,
                # Add other fields if required
            }

            # Submit login form
            response = self.session.post(login_url, data=login_data)
            response.raise_for_status()

            # Check if login was successful
            # Adjust this condition based on the website's behavior
            if 'dashboard' in response.url or 'welcome' in response.text.lower():
                print("Login successful!")
                return True
            else:
                print("Login failed!")
                return False
        except requests.RequestException as e:
            print(f"Login error: {e}")
            return False

    def scrape_protected_page(self, url):
        """Scrape content from a protected page"""
        try:
            response = self.session.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            return soup
        except requests.RequestException as e:
            print(f"Scraping error: {e}")
            return None

# Usage example
if __name__ == "__main__":
    scraper = LoginScraper()

    # Use environment variables for credentials
    username = os.getenv('SCRAPER_USERNAME')
    password = os.getenv('SCRAPER_PASSWORD')

    if scraper.login('https://example.com/login', username, password):
        # Scrape protected content
        soup = scraper.scrape_protected_page('https://example.com/protected-page')
        if soup:
            # Extract data as needed
            title = soup.find('h1')
            print(f"Page title: {title.text if title else 'Not found'}")
3. Handling CSRF Tokens
Many websites use CSRF tokens for security. Here's how to handle them:
def login_with_csrf(self, login_url, username, password):
    """Login handling CSRF tokens"""
    try:
        # Get the login page
        login_page = self.session.get(login_url)
        login_page.raise_for_status()

        # Parse the login form to extract CSRF token
        soup = BeautifulSoup(login_page.text, 'html.parser')

        # Find CSRF token (common patterns)
        csrf_token = None

        # Method 1: Look for hidden input with CSRF token
        csrf_input = soup.find('input', {'name': 'csrf_token'}) or \
                     soup.find('input', {'name': '_token'}) or \
                     soup.find('input', {'name': 'authenticity_token'})
        if csrf_input:
            csrf_token = csrf_input.get('value')

        # Method 2: Look for CSRF token in meta tags
        if not csrf_token:
            csrf_meta = soup.find('meta', {'name': 'csrf-token'})
            if csrf_meta:
                csrf_token = csrf_meta.get('content')

        # Prepare login data with CSRF token
        login_data = {
            'username': username,
            'password': password,
        }

        # Add CSRF token if found
        if csrf_token:
            login_data['csrf_token'] = csrf_token  # Adjust field name as needed
            print(f"Using CSRF token: {csrf_token[:10]}...")

        # Submit login form
        response = self.session.post(login_url, data=login_data)
        response.raise_for_status()

        # Verify login success
        return self.verify_login_success(response)
    except requests.RequestException as e:
        print(f"CSRF login error: {e}")
        return False

def verify_login_success(self, response):
    """Verify if login was successful"""
    # Check multiple indicators
    success_indicators = [
        'dashboard' in response.url.lower(),
        'profile' in response.url.lower(),
        'logout' in response.text.lower(),
        'welcome' in response.text.lower()
    ]
    failure_indicators = [
        'error' in response.text.lower(),
        'invalid' in response.text.lower(),
        'login' in response.url.lower()
    ]
    if any(success_indicators) and not any(failure_indicators):
        return True
    return False
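If these two methods are added to the LoginScraper class above, usage mirrors the basic example (the URLs here are placeholders):

scraper = LoginScraper()
if scraper.login_with_csrf('https://example.com/login', username, password):
    soup = scraper.scrape_protected_page('https://example.com/account')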
4. Advanced Authentication Patterns
For more complex authentication scenarios:
class AdvancedLoginScraper(LoginScraper):
    def login_with_form_parsing(self, login_url, username, password):
        """Login by parsing the actual form structure"""
        try:
            # Get login page
            login_page = self.session.get(login_url)
            soup = BeautifulSoup(login_page.text, 'html.parser')

            # Find the login form
            form = soup.find('form', id='login-form') or \
                   soup.find('form', class_='login') or \
                   soup.find('form')
            if not form:
                print("Login form not found!")
                return False

            # Extract form action and method
            action = form.get('action', login_url)
            method = form.get('method', 'POST').upper()

            # Build complete URL if action is relative
            if action.startswith('/'):
                from urllib.parse import urljoin
                action = urljoin(login_url, action)

            # Extract all form fields
            form_data = {}
            for input_field in form.find_all('input'):
                name = input_field.get('name')
                value = input_field.get('value', '')
                input_type = input_field.get('type', 'text')
                if name:
                    if input_type == 'password':
                        form_data[name] = password
                    elif name.lower() in ['username', 'email', 'user']:
                        form_data[name] = username
                    else:
                        form_data[name] = value

            # Submit form
            if method == 'POST':
                response = self.session.post(action, data=form_data)
            else:
                response = self.session.get(action, params=form_data)

            return self.verify_login_success(response)
        except Exception as e:
            print(f"Advanced login error: {e}")
            return False

    def handle_two_factor_auth(self, login_url, username, password, totp_code):
        """Handle two-factor authentication"""
        # This is a simplified example - adapt it to the site's specific implementation
        if self.login(login_url, username, password):
            # Look for 2FA prompt
            response = self.session.get('https://example.com/2fa')
            if '2fa' in response.url or 'two-factor' in response.text.lower():
                # Submit 2FA code
                tfa_data = {'code': totp_code}
                response = self.session.post('https://example.com/2fa/verify', data=tfa_data)
                return self.verify_login_success(response)
        return False
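A brief usage sketch for the advanced class (the URL and TOTP code are placeholders; real 2FA flows vary widely, so adapt the endpoints above first):

scraper = AdvancedLoginScraper()
if scraper.login_with_form_parsing('https://example.com/login', username, password):
    print("Logged in via form parsing")
# Or, for a site with TOTP-based 2FA:
# scraper.handle_two_factor_auth('https://example.com/login', username, password, '123456')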
Complete Example: Scraping a Protected Dashboard
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin
from dotenv import load_dotenv

# Load credentials from a .env file (see Security Best Practices below)
load_dotenv()

class WebScrapingBot:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def login_and_scrape(self, base_url, username, password):
        """Complete login and scraping workflow"""
        login_url = urljoin(base_url, '/login')
        try:
            # Step 1: Get login page and extract form data
            print("Getting login page...")
            login_page = self.session.get(login_url)
            soup = BeautifulSoup(login_page.text, 'html.parser')

            # Step 2: Extract CSRF token if present
            csrf_token = self.extract_csrf_token(soup)

            # Step 3: Prepare and submit login data
            login_data = {
                'username': username,
                'password': password
            }
            if csrf_token:
                login_data['_token'] = csrf_token

            print("Submitting login...")
            response = self.session.post(login_url, data=login_data)

            # Step 4: Verify login success
            if not self.verify_login_success(response):
                return None

            # Step 5: Scrape protected content
            print("Scraping protected content...")
            protected_data = self.scrape_dashboard(base_url)
            return protected_data
        except Exception as e:
            print(f"Error during login and scraping: {e}")
            return None

    def extract_csrf_token(self, soup):
        """Extract CSRF token from various possible locations"""
        # Try different common patterns
        selectors = [
            'input[name="csrf_token"]',
            'input[name="_token"]',
            'meta[name="csrf-token"]',
            'input[name="authenticity_token"]'
        ]
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                return element.get('value') or element.get('content')
        return None

    def scrape_dashboard(self, base_url):
        """Scrape data from protected dashboard"""
        dashboard_url = urljoin(base_url, '/dashboard')
        try:
            response = self.session.get(dashboard_url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract relevant data
            data = {
                'title': soup.find('h1').text if soup.find('h1') else 'No title',
                'stats': [],
                'notifications': []
            }

            # Extract statistics
            for stat in soup.find_all('div', class_='stat-card'):
                title = stat.find('h3')
                value = stat.find('span', class_='value')
                if title and value:
                    data['stats'].append({
                        'title': title.text.strip(),
                        'value': value.text.strip()
                    })

            # Extract notifications
            for notification in soup.find_all('div', class_='notification'):
                data['notifications'].append(notification.text.strip())

            return data
        except Exception as e:
            print(f"Dashboard scraping error: {e}")
            return None

    def verify_login_success(self, response):
        """Check if login was successful"""
        success_indicators = [
            'dashboard' in response.url,
            'profile' in response.url,
            'logout' in response.text
        ]
        return any(success_indicators)

# Usage
if __name__ == "__main__":
    bot = WebScrapingBot()

    # Use environment variables for security
    username = os.getenv('SCRAPER_USERNAME')
    password = os.getenv('SCRAPER_PASSWORD')

    if username and password:
        data = bot.login_and_scrape('https://example.com', username, password)
        if data:
            print("Scraped data:", data)
        else:
            print("Failed to scrape data")
    else:
        print("Please set SCRAPER_USERNAME and SCRAPER_PASSWORD environment variables")
Security Best Practices
Environment Variables
Create a .env file for credentials:
# .env file
SCRAPER_USERNAME=your_username
SCRAPER_PASSWORD=your_secure_password
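For os.getenv to see these values, load the file with python-dotenv at the top of your script. A minimal sketch:

import os
from dotenv import load_dotenv

load_dotenv()  # reads key=value pairs from .env into the environment

username = os.getenv('SCRAPER_USERNAME')
password = os.getenv('SCRAPER_PASSWORD')

Also add .env to your .gitignore so credentials never reach version control.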
Rate Limiting and Politeness
import time
import random
import requests

class PoliteScraper:
    def __init__(self, delay_range=(1, 3)):
        self.delay_range = delay_range
        self.session = requests.Session()

    def polite_request(self, url, **kwargs):
        """Make requests with random delays"""
        # Random delay between requests
        delay = random.uniform(*self.delay_range)
        time.sleep(delay)
        try:
            response = self.session.get(url, **kwargs)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            print(f"Request failed: {e}")
            return None
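Usage is a drop-in replacement for session.get (the URLs are placeholders):

scraper = PoliteScraper(delay_range=(2, 5))
for url in ['https://example.com/page/1', 'https://example.com/page/2']:
    response = scraper.polite_request(url)
    if response:
        print(url, response.status_code)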
Alternative Approaches
Using Selenium for JavaScript-Heavy Sites
For sites with complex JavaScript authentication:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def selenium_login(driver, login_url, username, password):
    """Login using Selenium for JavaScript-heavy sites"""
    driver.get(login_url)

    # Wait for login form to load
    wait = WebDriverWait(driver, 10)

    # Find and fill login fields
    username_field = wait.until(EC.presence_of_element_located((By.NAME, "username")))
    password_field = driver.find_element(By.NAME, "password")
    username_field.send_keys(username)
    password_field.send_keys(password)

    # Submit form
    submit_button = driver.find_element(By.CSS_SELECTOR, "button[type='submit']")
    submit_button.click()

    # Wait for redirect or success indicator
    wait.until(EC.url_contains("dashboard"))
    return True
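A minimal way to drive the function above, assuming a local Chrome installation (recent Selenium versions manage the driver binary automatically):

from selenium import webdriver

driver = webdriver.Chrome()
try:
    if selenium_login(driver, 'https://example.com/login', username, password):
        html = driver.page_source  # hand off to BeautifulSoup if desired
finally:
    driver.quit()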
Troubleshooting Common Issues
1. Login Fails Despite Correct Credentials
- Check for additional required fields such as CSRF tokens and hidden inputs (see the sketch after this list)
- Verify the correct form action URL
- Ensure proper headers are set
- Check if the site requires specific cookies from the initial page load
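A quick diagnostic is to dump every input in the login form and compare against what your script sends. A sketch (the URL is a placeholder):

import requests
from bs4 import BeautifulSoup

resp = requests.get('https://example.com/login')
soup = BeautifulSoup(resp.text, 'html.parser')
form = soup.find('form')
for inp in form.find_all('input'):
    print(inp.get('type'), inp.get('name'), inp.get('value'))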
2. Session Expires Quickly
- Implement session refresh logic (see the sketch after this list)
- Check for session timeout warnings
- Store and reuse authentication tokens
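One simple refresh pattern is to detect a redirect back to the login page and re-authenticate. A sketch of a method that could be added to the LoginScraper class above:

def get_with_relogin(self, url, login_url, username, password):
    """Re-login and retry once if the session has expired."""
    response = self.session.get(url)
    if 'login' in response.url.lower():  # redirected to login => session expired
        if self.login(login_url, username, password):
            response = self.session.get(url)
    return response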
3. Getting Blocked or Rate Limited
- Add realistic delays between requests
- Rotate User-Agent strings
- Use proxy rotation if necessary (for legitimate use cases)
- Respect the site's rate limits
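For intermittent 429 responses, a retry with exponential backoff is a reasonable fallback. A minimal sketch using only requests and the standard library:

import time
import requests

def get_with_backoff(session, url, max_retries=5):
    """Retry on 429/5xx responses, doubling the wait each time."""
    for attempt in range(max_retries):
        response = session.get(url)
        if response.status_code not in (429, 500, 502, 503):
            return response
        wait = 2 ** attempt  # 1s, 2s, 4s, ...
        print(f"Got {response.status_code}, retrying in {wait}s...")
        time.sleep(wait)
    return response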
Legal and Ethical Considerations
- Always check the website's robots.txt and terms of service
- Respect rate limits and don't overload servers
- Store credentials securely using environment variables
- Be transparent about your scraping activities when possible
- Consider reaching out to website owners for permission or API access
Remember that web scraping should be done responsibly and ethically, always respecting the website's resources and terms of use.