Scraping authenticated websites in Rust requires handling HTTP sessions, cookies, and authentication mechanisms. This guide covers the complete process from login to data extraction.
Prerequisites
Before scraping authenticated sites, you need to:
- Analyze the login mechanism - Inspect the login form to identify field names, action URLs, and authentication type (a quick inspection sketch follows this list)
- Check for CSRF protection - Look for hidden tokens or anti-CSRF measures
- Understand session management - Determine how the site maintains user sessions
- Review terms of service - Ensure your scraping complies with the site's policies
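To illustrate the first two steps, here is a small inspection sketch (built with the reqwest and scraper crates from the dependencies listed in the next section, pointed at a hypothetical login URL) that fetches a login page and prints each form's action plus its input field names, which usually reveals the username, password, and hidden CSRF token fields:
use anyhow::Result;
use scraper::{Html, Selector};

#[tokio::main]
async fn main() -> Result<()> {
    // Hypothetical login page; replace with the site you are analyzing.
    let login_url = "https://example.com/login";
    let html = reqwest::get(login_url).await?.text().await?;
    let document = Html::parse_document(&html);

    let form_selector = Selector::parse("form").unwrap();
    let input_selector = Selector::parse("input").unwrap();

    for form in document.select(&form_selector) {
        // The form's action attribute is where the credentials get POSTed.
        println!("form action: {:?}", form.value().attr("action"));
        // Input names reveal the field names the site expects,
        // including hidden CSRF token fields.
        for input in form.select(&input_selector) {
            println!(
                "  input name={:?} type={:?}",
                input.value().attr("name"),
                input.value().attr("type")
            );
        }
    }
    Ok(())
}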
Dependencies Setup
Add these dependencies to your Cargo.toml:
[dependencies]
reqwest = { version = "0.11", features = ["json", "cookies"] }
scraper = "0.17"
tokio = { version = "1", features = ["full"] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
url = "2.4"
anyhow = "1.0"
Basic Authentication Example
Here's a complete example demonstrating authenticated web scraping:
use anyhow::{Context, Result};
use reqwest::{Client, Response};
use scraper::{Html, Selector};
use std::collections::HashMap;

pub struct AuthenticatedScraper {
    client: Client,
    login_url: String,
    base_url: String,
}

impl AuthenticatedScraper {
    pub fn new(base_url: String, login_url: String) -> Result<Self> {
        let client = Client::builder()
            .cookie_store(true)
            .user_agent("Mozilla/5.0 (compatible; RustScraper/1.0)")
            .build()
            .context("Failed to create HTTP client")?;

        Ok(Self {
            client,
            login_url,
            base_url,
        })
    }

    pub async fn login(&self, username: &str, password: &str) -> Result<()> {
        // First, get the login page to extract any CSRF tokens
        let login_page = self.client
            .get(&self.login_url)
            .send()
            .await
            .context("Failed to fetch login page")?;

        let login_html = login_page.text().await?;
        let document = Html::parse_document(&login_html);

        // Extract the CSRF token (if present) before building the form data,
        // so the owned String outlives the borrows stored in the map.
        let csrf_token = self.extract_csrf_token(&document);

        let mut form_data = HashMap::new();
        form_data.insert("username", username);
        form_data.insert("password", password);

        // Look for common CSRF token field names. Submitting the token under
        // several names at once is a simplification; use the field name the
        // login form actually expects.
        if let Some(token) = csrf_token.as_deref() {
            form_data.insert("_token", token);
            form_data.insert("csrf_token", token);
            form_data.insert("authenticity_token", token);
        }

        // Submit login form
        let response = self.client
            .post(&self.login_url)
            .form(&form_data)
            .send()
            .await
            .context("Failed to submit login form")?;

        self.verify_login_success(response).await
    }

    fn extract_csrf_token(&self, document: &Html) -> Option<String> {
        let selectors = [
            "input[name='_token']",
            "input[name='csrf_token']",
            "input[name='authenticity_token']",
            "meta[name='csrf-token']",
        ];

        for selector_str in &selectors {
            if let Ok(selector) = Selector::parse(selector_str) {
                if let Some(element) = document.select(&selector).next() {
                    if let Some(token) = element.value().attr("value")
                        .or_else(|| element.value().attr("content")) {
                        return Some(token.to_string());
                    }
                }
            }
        }
        None
    }

    async fn verify_login_success(&self, response: Response) -> Result<()> {
        // `Response::text` consumes the response, so it is taken by value here.
        let status = response.status();
        let body = response.text().await?;

        // Check for successful login indicators. These strings are
        // site-specific heuristics; adjust them to the site you target.
        if status.is_success()
            && (body.contains("dashboard") || body.contains("logout"))
        {
            println!("Login successful!");
            return Ok(());
        }

        // Check for error indicators
        if body.contains("invalid") || body.contains("error") || body.contains("incorrect") {
            anyhow::bail!("Login failed: Invalid credentials");
        }

        // Note: reqwest follows redirects by default, so a 3xx status only
        // shows up here if redirects were disabled on the client.
        if status.is_redirection() {
            println!("Login successful (redirected)!");
            return Ok(());
        }

        anyhow::bail!("Login status unclear. Status: {}", status);
    }

    pub async fn scrape_protected_page(&self, url: &str) -> Result<Vec<String>> {
        let response = self.client
            .get(url)
            .send()
            .await
            .context("Failed to fetch protected page")?;

        let body = response.text().await?;
        let document = Html::parse_document(&body);

        // Extract data using CSS selectors
        let content_selector = Selector::parse(".content, .post, article")
            .map_err(|e| anyhow::anyhow!("Invalid CSS selector: {}", e))?;

        let mut extracted_data = Vec::new();
        for element in document.select(&content_selector) {
            let text = element.text().collect::<Vec<_>>().join(" ");
            if !text.trim().is_empty() {
                extracted_data.push(text.trim().to_string());
            }
        }

        Ok(extracted_data)
    }
}

#[tokio::main]
async fn main() -> Result<()> {
    let scraper = AuthenticatedScraper::new(
        "https://example.com".to_string(),
        "https://example.com/login".to_string(),
    )?;

    // Login
    scraper.login("your_username", "your_password").await?;

    // Scrape protected content
    let data = scraper.scrape_protected_page("https://example.com/protected").await?;
    for item in data {
        println!("Extracted: {}", item);
    }

    Ok(())
}
Advanced Techniques
Handling Different Authentication Types
JWT Token Authentication
use serde::Deserialize;

#[derive(Deserialize)]
struct LoginResponse {
    token: String,
    expires_in: u64,
}

impl AuthenticatedScraper {
    pub async fn login_jwt(&self, username: &str, password: &str) -> Result<String> {
        let login_data = serde_json::json!({
            "username": username,
            "password": password
        });

        let response: LoginResponse = self.client
            .post(&self.login_url)
            .json(&login_data)
            .send()
            .await?
            .json()
            .await?;

        Ok(response.token)
    }

    pub async fn scrape_with_jwt(&self, url: &str, token: &str) -> Result<String> {
        let response = self.client
            .get(url)
            .bearer_auth(token)
            .send()
            .await?;

        Ok(response.text().await?)
    }
}
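Usage mirrors the form-based flow; the endpoint and credentials below are placeholders for whatever API you are targeting:
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Hypothetical JSON login endpoint; adjust to the API you target.
    let scraper = AuthenticatedScraper::new(
        "https://api.example.com".to_string(),
        "https://api.example.com/auth/login".to_string(),
    )?;

    let token = scraper.login_jwt("your_username", "your_password").await?;
    let body = scraper
        .scrape_with_jwt("https://api.example.com/protected", &token)
        .await?;
    println!("{body}");

    Ok(())
}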
Session-Based Authentication with Custom Headers
impl AuthenticatedScraper {
    pub async fn scrape_with_headers(&self, url: &str) -> Result<String> {
        let response = self.client
            .get(url)
            .header("X-Requested-With", "XMLHttpRequest")
            .header("Referer", &self.base_url)
            .send()
            .await?;

        Ok(response.text().await?)
    }
}
Error Handling and Retry Logic
use std::time::Duration;
use tokio::time::sleep;

impl AuthenticatedScraper {
    pub async fn robust_scrape(&self, url: &str, max_retries: u32) -> Result<String> {
        let mut attempts = 0;
        loop {
            match self.client.get(url).send().await {
                Ok(response) if response.status().is_success() => {
                    return Ok(response.text().await?);
                }
                Ok(response) if response.status().as_u16() == 401 => {
                    anyhow::bail!("Authentication expired");
                }
                // Retry transient failures (network errors or other non-success
                // statuses) until the retry budget is exhausted.
                Ok(_) | Err(_) if attempts < max_retries => {
                    attempts += 1;
                    println!(
                        "Request failed, retrying in 2 seconds... (attempt {}/{})",
                        attempts, max_retries
                    );
                    sleep(Duration::from_secs(2)).await;
                }
                Ok(response) => {
                    anyhow::bail!(
                        "Request failed with status {} after {} retries",
                        response.status(),
                        max_retries
                    );
                }
                Err(e) => return Err(e.into()),
            }
        }
    }
}
Rate Limiting
use std::sync::Arc;
use tokio::sync::Semaphore;

pub struct RateLimitedScraper {
    scraper: AuthenticatedScraper,
    semaphore: Arc<Semaphore>,
}

impl RateLimitedScraper {
    pub fn new(scraper: AuthenticatedScraper, concurrent_requests: usize) -> Self {
        Self {
            scraper,
            semaphore: Arc::new(Semaphore::new(concurrent_requests)),
        }
    }

    pub async fn scrape_with_rate_limit(&self, url: &str) -> Result<String> {
        // Limit how many requests can be in flight at once.
        let _permit = self.semaphore.acquire().await?;

        // Add delay between requests
        sleep(Duration::from_millis(500)).await;

        self.scraper.scrape_protected_page(url).await
            .map(|data| data.join("\n"))
    }
}
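A brief usage sketch: RateLimitedScraper holds only a reqwest Client, two Strings, and an Arc<Semaphore>, so it can be shared across tasks behind an Arc while the semaphore caps how many requests run concurrently (the URLs and credentials below are placeholders):
use std::sync::Arc;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let scraper = AuthenticatedScraper::new(
        "https://example.com".to_string(),
        "https://example.com/login".to_string(),
    )?;
    scraper.login("your_username", "your_password").await?;

    // Allow at most two requests in flight at any time.
    let limited = Arc::new(RateLimitedScraper::new(scraper, 2));

    let urls = vec![
        "https://example.com/protected/1".to_string(),
        "https://example.com/protected/2".to_string(),
        "https://example.com/protected/3".to_string(),
    ];

    let mut handles = Vec::new();
    for url in urls {
        let limited = Arc::clone(&limited);
        handles.push(tokio::spawn(async move {
            limited.scrape_with_rate_limit(&url).await
        }));
    }

    for handle in handles {
        match handle.await? {
            Ok(text) => println!("Fetched {} bytes", text.len()),
            Err(e) => eprintln!("Request failed: {e}"),
        }
    }

    Ok(())
}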
Best Practices
- Store credentials securely - Use environment variables or secure configuration files
- Handle session expiration - Implement automatic re-authentication (a combined sketch of both practices follows this list)
- Respect rate limits - Add delays between requests to avoid being blocked
- Use proper error handling - Implement retry logic and graceful failure handling
- Monitor your requests - Log important events for debugging
- Test thoroughly - Verify your scraper works with different scenarios
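A minimal sketch of the first two practices, assuming credentials live in hypothetical SCRAPER_USERNAME and SCRAPER_PASSWORD environment variables: read them at startup and re-run login() when a request comes back with HTTP 401:
use anyhow::{Context, Result};

impl AuthenticatedScraper {
    /// Fetch a protected page, re-authenticating once if the session has expired.
    pub async fn scrape_with_reauth(&self, url: &str) -> Result<Vec<String>> {
        // Credentials come from the environment rather than source code.
        // SCRAPER_USERNAME and SCRAPER_PASSWORD are hypothetical variable names.
        let username = std::env::var("SCRAPER_USERNAME")
            .context("SCRAPER_USERNAME is not set")?;
        let password = std::env::var("SCRAPER_PASSWORD")
            .context("SCRAPER_PASSWORD is not set")?;

        // Probe the page first; a 401 means the session has expired.
        let status = self.client.get(url).send().await?.status();
        if status.as_u16() == 401 {
            self.login(&username, &password).await?;
        }

        self.scrape_protected_page(url).await
    }
}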
Common Issues and Solutions
- Session expires: Implement session renewal logic
- CSRF tokens change: Extract tokens dynamically for each request
- Rate limiting: Add exponential backoff and respect HTTP 429 responses (see the sketch after this list)
- JavaScript-rendered content: Consider using headless browsers like chromiumoxide
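For the rate-limiting case, here is a sketch of exponential backoff that honors the Retry-After header on HTTP 429 responses; it is added to the same AuthenticatedScraper purely for illustration:
use anyhow::Result;
use std::time::Duration;
use tokio::time::sleep;

impl AuthenticatedScraper {
    pub async fn scrape_with_backoff(&self, url: &str, max_retries: u32) -> Result<String> {
        let mut delay = Duration::from_secs(1);

        for attempt in 0..=max_retries {
            let response = self.client.get(url).send().await?;

            if response.status().as_u16() != 429 {
                // Not rate limited: fail on other error statuses, otherwise return the body.
                return Ok(response.error_for_status()?.text().await?);
            }

            // Prefer the server-provided Retry-After value (in seconds) when present,
            // otherwise back off exponentially.
            let wait = response
                .headers()
                .get(reqwest::header::RETRY_AFTER)
                .and_then(|value| value.to_str().ok())
                .and_then(|value| value.parse::<u64>().ok())
                .map(Duration::from_secs)
                .unwrap_or(delay);

            println!("Rate limited (429), waiting {:?} (attempt {}/{})", wait, attempt + 1, max_retries);
            sleep(wait).await;
            delay *= 2;
        }

        anyhow::bail!("Still rate limited after {} retries", max_retries)
    }
}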
This approach provides a robust foundation for scraping authenticated websites while handling common challenges like CSRF protection, session management, and error recovery.