Scraping authenticated websites in Rust requires handling HTTP sessions, cookies, and authentication mechanisms. This guide covers the complete process from login to data extraction.
Prerequisites
Before scraping authenticated sites, you need to:
- Analyze the login mechanism - Inspect the login form to identify field names, action URLs, and authentication type (a quick inspection sketch follows this list)
- Check for CSRF protection - Look for hidden tokens or anti-CSRF measures
- Understand session management - Determine how the site maintains user sessions
- Review terms of service - Ensure your scraping complies with the site's policies
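To illustrate the first two steps, here is a small inspection sketch (built with the reqwest and scraper crates from the dependencies listed in the next section, pointed at a hypothetical login URL) that fetches a login page and prints each form's action plus its input field names, which usually reveals the username, password, and hidden CSRF token fields:
use anyhow::Result;
use scraper::{Html, Selector};

#[tokio::main]
async fn main() -> Result<()> {
    // Hypothetical login page; replace with the site you are analyzing.
    let login_url = "https://example.com/login";
    let html = reqwest::get(login_url).await?.text().await?;
    let document = Html::parse_document(&html);

    let form_selector = Selector::parse("form").unwrap();
    let input_selector = Selector::parse("input").unwrap();

    for form in document.select(&form_selector) {
        // The form's action attribute is where the credentials get POSTed.
        println!("form action: {:?}", form.value().attr("action"));
        // Input names reveal the field names the site expects,
        // including hidden CSRF token fields.
        for input in form.select(&input_selector) {
            println!(
                "  input name={:?} type={:?}",
                input.value().attr("name"),
                input.value().attr("type")
            );
        }
    }
    Ok(())
}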
Dependencies Setup
Add these dependencies to your Cargo.toml:
[dependencies]
reqwest = { version = "0.11", features = ["json", "cookies"] }
scraper = "0.17"
tokio = { version = "1", features = ["full"] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
url = "2.4"
anyhow = "1.0"
Basic Authentication Example
Here's a complete example demonstrating authenticated web scraping:
use anyhow::{Context, Result};
use reqwest::{Client, Response};
use scraper::{Html, Selector};
use std::collections::HashMap;

pub struct AuthenticatedScraper {
    client: Client,
    login_url: String,
    base_url: String,
}

impl AuthenticatedScraper {
    pub fn new(base_url: String, login_url: String) -> Result<Self> {
        let client = Client::builder()
            .cookie_store(true)
            .user_agent("Mozilla/5.0 (compatible; RustScraper/1.0)")
            .build()
            .context("Failed to create HTTP client")?;

        Ok(Self {
            client,
            login_url,
            base_url,
        })
    }

    pub async fn login(&self, username: &str, password: &str) -> Result<()> {
        // First, get the login page to extract any CSRF tokens
        let login_page = self.client
            .get(&self.login_url)
            .send()
            .await
            .context("Failed to fetch login page")?;

        let login_html = login_page.text().await?;
        let document = Html::parse_document(&login_html);

        // Extract the CSRF token (if present) before building the form data,
        // so the owned String outlives the borrows stored in the map.
        let csrf_token = self.extract_csrf_token(&document);

        let mut form_data = HashMap::new();
        form_data.insert("username", username);
        form_data.insert("password", password);

        // Look for common CSRF token field names. Submitting the token under
        // several names at once is a simplification; use the field name the
        // login form actually expects.
        if let Some(token) = csrf_token.as_deref() {
            form_data.insert("_token", token);
            form_data.insert("csrf_token", token);
            form_data.insert("authenticity_token", token);
        }

        // Submit login form
        let response = self.client
            .post(&self.login_url)
            .form(&form_data)
            .send()
            .await
            .context("Failed to submit login form")?;

        self.verify_login_success(response).await
    }

    fn extract_csrf_token(&self, document: &Html) -> Option<String> {
        let selectors = [
            "input[name='_token']",
            "input[name='csrf_token']",
            "input[name='authenticity_token']",
            "meta[name='csrf-token']",
        ];

        for selector_str in &selectors {
            if let Ok(selector) = Selector::parse(selector_str) {
                if let Some(element) = document.select(&selector).next() {
                    if let Some(token) = element.value().attr("value")
                        .or_else(|| element.value().attr("content")) {
                        return Some(token.to_string());
                    }
                }
            }
        }
        None
    }

    async fn verify_login_success(&self, response: Response) -> Result<()> {
        // `Response::text` consumes the response, so it is taken by value here.
        let status = response.status();
        let body = response.text().await?;

        // Check for successful login indicators. These strings are
        // site-specific heuristics; adjust them to the site you target.
        if status.is_success()
            && (body.contains("dashboard") || body.contains("logout"))
        {
            println!("Login successful!");
            return Ok(());
        }

        // Check for error indicators
        if body.contains("invalid") || body.contains("error") || body.contains("incorrect") {
            anyhow::bail!("Login failed: Invalid credentials");
        }

        // Note: reqwest follows redirects by default, so a 3xx status only
        // shows up here if redirects were disabled on the client.
        if status.is_redirection() {
            println!("Login successful (redirected)!");
            return Ok(());
        }

        anyhow::bail!("Login status unclear. Status: {}", status);
    }

    pub async fn scrape_protected_page(&self, url: &str) -> Result<Vec<String>> {
        let response = self.client
            .get(url)
            .send()
            .await
            .context("Failed to fetch protected page")?;

        let body = response.text().await?;
        let document = Html::parse_document(&body);

        // Extract data using CSS selectors
        let content_selector = Selector::parse(".content, .post, article")
            .map_err(|e| anyhow::anyhow!("Invalid CSS selector: {}", e))?;

        let mut extracted_data = Vec::new();
        for element in document.select(&content_selector) {
            let text = element.text().collect::<Vec<_>>().join(" ");
            if !text.trim().is_empty() {
                extracted_data.push(text.trim().to_string());
            }
        }

        Ok(extracted_data)
    }
}

#[tokio::main]
async fn main() -> Result<()> {
    let scraper = AuthenticatedScraper::new(
        "https://example.com".to_string(),
        "https://example.com/login".to_string(),
    )?;

    // Login
    scraper.login("your_username", "your_password").await?;

    // Scrape protected content
    let data = scraper.scrape_protected_page("https://example.com/protected").await?;
    for item in data {
        println!("Extracted: {}", item);
    }

    Ok(())
}
Advanced Techniques
Handling Different Authentication Types
JWT Token Authentication
use serde::Deserialize;

#[derive(Deserialize)]
struct LoginResponse {
    token: String,
    expires_in: u64,
}

impl AuthenticatedScraper {
    pub async fn login_jwt(&self, username: &str, password: &str) -> Result<String> {
        let login_data = serde_json::json!({
            "username": username,
            "password": password
        });

        let response: LoginResponse = self.client
            .post(&self.login_url)
            .json(&login_data)
            .send()
            .await?
            .json()
            .await?;

        Ok(response.token)
    }

    pub async fn scrape_with_jwt(&self, url: &str, token: &str) -> Result<String> {
        let response = self.client
            .get(url)
            .bearer_auth(token)
            .send()
            .await?;

        Ok(response.text().await?)
    }
}
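Usage mirrors the form-based flow; the endpoint and credentials below are placeholders for whatever API you are targeting:
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Hypothetical JSON login endpoint; adjust to the API you target.
    let scraper = AuthenticatedScraper::new(
        "https://api.example.com".to_string(),
        "https://api.example.com/auth/login".to_string(),
    )?;

    let token = scraper.login_jwt("your_username", "your_password").await?;
    let body = scraper
        .scrape_with_jwt("https://api.example.com/protected", &token)
        .await?;
    println!("{body}");

    Ok(())
}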
Session-Based Authentication with Custom Headers
impl AuthenticatedScraper {
    pub async fn scrape_with_headers(&self, url: &str) -> Result<String> {
        let response = self.client
            .get(url)
            .header("X-Requested-With", "XMLHttpRequest")
            .header("Referer", &self.base_url)
            .send()
            .await?;

        Ok(response.text().await?)
    }
}
Error Handling and Retry Logic
use std::time::Duration;
use tokio::time::sleep;

impl AuthenticatedScraper {
    pub async fn robust_scrape(&self, url: &str, max_retries: u32) -> Result<String> {
        let mut attempts = 0;
        loop {
            match self.client.get(url).send().await {
                Ok(response) if response.status().is_success() => {
                    return Ok(response.text().await?);
                }
                Ok(response) if response.status().as_u16() == 401 => {
                    anyhow::bail!("Authentication expired");
                }
                // Retry transient failures (network errors or other non-success
                // statuses) until the retry budget is exhausted.
                Ok(_) | Err(_) if attempts < max_retries => {
                    attempts += 1;
                    println!(
                        "Request failed, retrying in 2 seconds... (attempt {}/{})",
                        attempts, max_retries
                    );
                    sleep(Duration::from_secs(2)).await;
                }
                Ok(response) => {
                    anyhow::bail!(
                        "Request failed with status {} after {} retries",
                        response.status(),
                        max_retries
                    );
                }
                Err(e) => return Err(e.into()),
            }
        }
    }
}
Rate Limiting
use std::sync::Arc;
use tokio::sync::Semaphore;

pub struct RateLimitedScraper {
    scraper: AuthenticatedScraper,
    semaphore: Arc<Semaphore>,
}

impl RateLimitedScraper {
    pub fn new(scraper: AuthenticatedScraper, concurrent_requests: usize) -> Self {
        Self {
            scraper,
            semaphore: Arc::new(Semaphore::new(concurrent_requests)),
        }
    }

    pub async fn scrape_with_rate_limit(&self, url: &str) -> Result<String> {
        // Limit how many requests can be in flight at once.
        let _permit = self.semaphore.acquire().await?;

        // Add delay between requests
        sleep(Duration::from_millis(500)).await;

        self.scraper.scrape_protected_page(url).await
            .map(|data| data.join("\n"))
    }
}
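A brief usage sketch: RateLimitedScraper holds only a reqwest Client, two Strings, and an Arc<Semaphore>, so it can be shared across tasks behind an Arc while the semaphore caps how many requests run concurrently (the URLs and credentials below are placeholders):
use std::sync::Arc;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let scraper = AuthenticatedScraper::new(
        "https://example.com".to_string(),
        "https://example.com/login".to_string(),
    )?;
    scraper.login("your_username", "your_password").await?;

    // Allow at most two requests in flight at any time.
    let limited = Arc::new(RateLimitedScraper::new(scraper, 2));

    let urls = vec![
        "https://example.com/protected/1".to_string(),
        "https://example.com/protected/2".to_string(),
        "https://example.com/protected/3".to_string(),
    ];

    let mut handles = Vec::new();
    for url in urls {
        let limited = Arc::clone(&limited);
        handles.push(tokio::spawn(async move {
            limited.scrape_with_rate_limit(&url).await
        }));
    }

    for handle in handles {
        match handle.await? {
            Ok(text) => println!("Fetched {} bytes", text.len()),
            Err(e) => eprintln!("Request failed: {e}"),
        }
    }

    Ok(())
}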
Best Practices
- Store credentials securely - Use environment variables or secure configuration files
- Handle session expiration - Implement automatic re-authentication (a combined sketch of both practices follows this list)
- Respect rate limits - Add delays between requests to avoid being blocked
- Use proper error handling - Implement retry logic and graceful failure handling
- Monitor your requests - Log important events for debugging
- Test thoroughly - Verify your scraper works with different scenarios
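A minimal sketch of the first two practices, assuming credentials live in hypothetical SCRAPER_USERNAME and SCRAPER_PASSWORD environment variables: read them at startup and re-run login() when a request comes back with HTTP 401:
use anyhow::{Context, Result};

impl AuthenticatedScraper {
    /// Fetch a protected page, re-authenticating once if the session has expired.
    pub async fn scrape_with_reauth(&self, url: &str) -> Result<Vec<String>> {
        // Credentials come from the environment rather than source code.
        // SCRAPER_USERNAME and SCRAPER_PASSWORD are hypothetical variable names.
        let username = std::env::var("SCRAPER_USERNAME")
            .context("SCRAPER_USERNAME is not set")?;
        let password = std::env::var("SCRAPER_PASSWORD")
            .context("SCRAPER_PASSWORD is not set")?;

        // Probe the page first; a 401 means the session has expired.
        let status = self.client.get(url).send().await?.status();
        if status.as_u16() == 401 {
            self.login(&username, &password).await?;
        }

        self.scrape_protected_page(url).await
    }
}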
Common Issues and Solutions
- Session expires: Implement session renewal logic
- CSRF tokens change: Extract tokens dynamically for each request
- Rate limiting: Add exponential backoff and respect HTTP 429 responses (see the sketch after this list)
- JavaScript-rendered content: Consider using headless browsers like chromiumoxide
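For the rate-limiting case, here is a sketch of exponential backoff that honors the Retry-After header on HTTP 429 responses; it is added to the same AuthenticatedScraper purely for illustration:
use anyhow::Result;
use std::time::Duration;
use tokio::time::sleep;

impl AuthenticatedScraper {
    pub async fn scrape_with_backoff(&self, url: &str, max_retries: u32) -> Result<String> {
        let mut delay = Duration::from_secs(1);

        for attempt in 0..=max_retries {
            let response = self.client.get(url).send().await?;

            if response.status().as_u16() != 429 {
                // Not rate limited: fail on other error statuses, otherwise return the body.
                return Ok(response.error_for_status()?.text().await?);
            }

            // Prefer the server-provided Retry-After value (in seconds) when present,
            // otherwise back off exponentially.
            let wait = response
                .headers()
                .get(reqwest::header::RETRY_AFTER)
                .and_then(|value| value.to_str().ok())
                .and_then(|value| value.parse::<u64>().ok())
                .map(Duration::from_secs)
                .unwrap_or(delay);

            println!("Rate limited (429), waiting {:?} (attempt {}/{})", wait, attempt + 1, max_retries);
            sleep(wait).await;
            delay *= 2;
        }

        anyhow::bail!("Still rate limited after {} retries", max_retries)
    }
}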
This approach provides a robust foundation for scraping authenticated websites while handling common challenges like CSRF protection, session management, and error recovery.