apiVersion: v1
kind: Service
metadata:
  name: rust-scraper-service
spec:
  selector:
    app: rust-scraper
  ports:
    - port: 80
      targetPort: 8080
  type: LoadBalancer
```
## Configuration Management

### Environment-based Configuration

`config.rs`:
```rust
use serde::{Deserialize, Serialize};
use std::env;

#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct Config {
    pub server: ServerConfig,
    pub database: DatabaseConfig,
    pub scraping: ScrapingConfig,
    pub monitoring: MonitoringConfig,
}

#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct ServerConfig {
    pub host: String,
    pub port: u16,
    pub workers: usize,
}

#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct DatabaseConfig {
    pub url: String,
    pub max_connections: u32,
}

#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct ScrapingConfig {
    pub max_concurrent_requests: usize,
    pub request_timeout: u64,
    pub retry_attempts: u32,
    pub rate_limit_per_second: u32,
}

#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct MonitoringConfig {
    pub metrics_enabled: bool,
    pub jaeger_endpoint: Option<String>,
}

impl Config {
    pub fn from_env() -> Result<Self, Box<dyn std::error::Error>> {
        Ok(Config {
            server: ServerConfig {
                host: env::var("HOST").unwrap_or_else(|_| "0.0.0.0".to_string()),
                port: env::var("PORT")
                    .unwrap_or_else(|_| "8080".to_string())
                    .parse()?,
                workers: env::var("WORKERS")
                    .unwrap_or_else(|_| "4".to_string())
                    .parse()?,
            },
            database: DatabaseConfig {
                // DATABASE_URL has no sensible default; fail fast if it is missing.
                url: env::var("DATABASE_URL")?,
                max_connections: env::var("DB_MAX_CONNECTIONS")
                    .unwrap_or_else(|_| "10".to_string())
                    .parse()?,
            },
            scraping: ScrapingConfig {
                max_concurrent_requests: env::var("MAX_CONCURRENT_REQUESTS")
                    .unwrap_or_else(|_| "100".to_string())
                    .parse()?,
                request_timeout: env::var("REQUEST_TIMEOUT")
                    .unwrap_or_else(|_| "30".to_string())
                    .parse()?,
                retry_attempts: env::var("RETRY_ATTEMPTS")
                    .unwrap_or_else(|_| "3".to_string())
                    .parse()?,
                rate_limit_per_second: env::var("RATE_LIMIT_PER_SECOND")
                    .unwrap_or_else(|_| "10".to_string())
                    .parse()?,
            },
            monitoring: MonitoringConfig {
                metrics_enabled: env::var("METRICS_ENABLED")
                    .unwrap_or_else(|_| "true".to_string())
                    .parse()?,
                jaeger_endpoint: env::var("JAEGER_ENDPOINT").ok(),
            },
        })
    }
}
```
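A short usage sketch (assuming the struct above lives in a `config` module and Tokio is the async runtime; the startup wiring shown is illustrative):

```rust
// main.rs (sketch): load configuration once at startup and fail fast on bad values.
mod config;

use config::Config;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let config = Config::from_env()?;
    println!(
        "listening on {}:{} with {} workers",
        config.server.host, config.server.port, config.server.workers
    );
    // The loaded values would then drive the HTTP client, database pool, and server setup.
    Ok(())
}
```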
## Monitoring and Observability

### Metrics with Prometheus

`metrics.rs`:
```rust
use prometheus::{Counter, Encoder, Histogram, HistogramOpts, IntGauge, Registry, TextEncoder};
use std::sync::Arc;

#[derive(Clone)]
pub struct Metrics {
    pub requests_total: Counter,
    pub request_duration: Histogram,
    pub active_scrapers: IntGauge,
    pub successful_scrapes: Counter,
    pub failed_scrapes: Counter,
    registry: Arc<Registry>,
}

impl Metrics {
    pub fn new() -> Result<Self, Box<dyn std::error::Error>> {
        let registry = Arc::new(Registry::new());

        let requests_total = Counter::new(
            "http_requests_total",
            "Total number of HTTP requests",
        )?;
        // Histogram has no `new(name, help)` constructor; build it from HistogramOpts.
        let request_duration = Histogram::with_opts(HistogramOpts::new(
            "http_request_duration_seconds",
            "HTTP request duration in seconds",
        ))?;
        let active_scrapers = IntGauge::new(
            "active_scrapers",
            "Number of active scraping tasks",
        )?;
        let successful_scrapes = Counter::new(
            "successful_scrapes_total",
            "Total number of successful scrapes",
        )?;
        let failed_scrapes = Counter::new(
            "failed_scrapes_total",
            "Total number of failed scrapes",
        )?;

        registry.register(Box::new(requests_total.clone()))?;
        registry.register(Box::new(request_duration.clone()))?;
        registry.register(Box::new(active_scrapers.clone()))?;
        registry.register(Box::new(successful_scrapes.clone()))?;
        registry.register(Box::new(failed_scrapes.clone()))?;

        Ok(Metrics {
            requests_total,
            request_duration,
            active_scrapers,
            successful_scrapes,
            failed_scrapes,
            registry,
        })
    }

    /// Render all registered metrics in the Prometheus text exposition format.
    pub fn export(&self) -> Result<String, Box<dyn std::error::Error>> {
        let encoder = TextEncoder::new();
        let metric_families = self.registry.gather();
        let mut buffer = Vec::new();
        encoder.encode(&metric_families, &mut buffer)?;
        Ok(String::from_utf8(buffer)?)
    }
}
```
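To make these metrics scrapeable, they need to be exposed over HTTP. A minimal sketch using axum (the `/metrics` path and handler name are illustrative, not part of the code above):

```rust
use axum::{extract::State, routing::get, Router};

// Serve the Prometheus text exposition format; Metrics derives Clone, so it can be axum state.
async fn metrics_handler(State(metrics): State<Metrics>) -> String {
    metrics.export().unwrap_or_default()
}

pub fn metrics_router(metrics: Metrics) -> Router {
    Router::new()
        .route("/metrics", get(metrics_handler))
        .with_state(metrics)
}
```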
### Structured Logging

`logging.rs`:
```rust
use tracing::{error, info};
use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};

/// Initialize JSON-formatted logging, honoring RUST_LOG when it is set.
pub fn init_logging() -> Result<(), Box<dyn std::error::Error>> {
    tracing_subscriber::registry()
        .with(
            tracing_subscriber::EnvFilter::try_from_default_env()
                .unwrap_or_else(|_| "info".into()),
        )
        .with(tracing_subscriber::fmt::layer().json())
        .init();
    Ok(())
}

pub fn log_scraping_result(url: &str, success: bool, duration_ms: u64, size_bytes: usize) {
    if success {
        info!(
            url = url,
            duration_ms = duration_ms,
            size_bytes = size_bytes,
            "Scraping completed successfully"
        );
    } else {
        error!(
            url = url,
            duration_ms = duration_ms,
            "Scraping failed"
        );
    }
}
```
## Production Optimizations

### Resource Management

`connection_pool.rs`:
```rust
use deadpool_postgres::{Config, Pool, Runtime};
use tokio_postgres::NoTls;

pub async fn create_db_pool(
    database_url: &str,
    max_size: usize,
) -> Result<Pool, Box<dyn std::error::Error>> {
    let mut cfg = Config::new();
    cfg.url = Some(database_url.to_string());
    // PoolConfig::new sets the pool size and keeps the default timeouts.
    cfg.pool = Some(deadpool_postgres::PoolConfig::new(max_size));
    let pool = cfg.create_pool(Some(Runtime::Tokio1), NoTls)?;
    Ok(pool)
}

// Rate limiting with a token bucket built on a Tokio semaphore
use std::sync::Arc;
use tokio::sync::Semaphore;
use tokio::time::{interval, Duration};

pub struct RateLimiter {
    semaphore: Arc<Semaphore>,
    refill_task: tokio::task::JoinHandle<()>,
}

impl RateLimiter {
    pub fn new(permits_per_second: usize) -> Self {
        let semaphore = Arc::new(Semaphore::new(permits_per_second));
        let semaphore_clone = semaphore.clone();
        let refill_task = tokio::spawn(async move {
            let mut interval = interval(Duration::from_secs(1));
            loop {
                interval.tick().await;
                // Top the bucket back up to capacity, but don't let unused permits
                // accumulate beyond one second's worth of requests.
                let available = semaphore_clone.available_permits();
                if available < permits_per_second {
                    semaphore_clone.add_permits(permits_per_second - available);
                }
            }
        });
        Self {
            semaphore,
            refill_task,
        }
    }

    /// Wait for a token; the permit is consumed permanently and restored by the refill task.
    pub async fn acquire(&self) -> Result<(), Box<dyn std::error::Error>> {
        self.semaphore.acquire().await?.forget();
        Ok(())
    }
}

impl Drop for RateLimiter {
    fn drop(&mut self) {
        // Stop the background refill task when the limiter goes away.
        self.refill_task.abort();
    }
}
```
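A brief usage sketch that throttles outbound scrape requests with the limiter above; `reqwest` is assumed here as the HTTP client and is not part of the original snippet:

```rust
// Fetch a single page, waiting for a rate-limit token first.
pub async fn fetch_throttled(
    client: &reqwest::Client,
    limiter: &RateLimiter,
    url: &str,
) -> Result<String, Box<dyn std::error::Error>> {
    limiter.acquire().await?;
    let body = client.get(url).send().await?.text().await?;
    Ok(body)
}
```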
## Deployment Automation

### GitHub Actions CI/CD

`.github/workflows/deploy.yml`:
```yaml
name: Deploy to Production

on:
  push:
    branches: [main]

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions-rs/toolchain@v1
        with:
          toolchain: stable
      - run: cargo test --verbose

  build-and-push:
    needs: test
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write
    steps:
      - uses: actions/checkout@v3
      - name: Log in to Container Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Extract metadata
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
          tags: |
            type=ref,event=branch
            type=sha
      - name: Build and push Docker image
        uses: docker/build-push-action@v4
        with:
          context: .
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

  deploy:
    needs: build-and-push
    runs-on: ubuntu-latest
    steps:
      - name: Deploy to ECS
        run: |
          aws ecs update-service \
            --cluster production \
            --service rust-scraper \
            --force-new-deployment
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_DEFAULT_REGION: us-west-2
```
## Health Checks and Graceful Shutdown

`health.rs`:
```rust
use axum::response::Json as ResponseJson;
use serde_json::{json, Value};
use std::sync::Arc;
use tokio::signal;

/// Liveness probe: the process is up and able to respond.
pub async fn health_check() -> ResponseJson<Value> {
    ResponseJson(json!({
        "status": "healthy",
        "timestamp": chrono::Utc::now().to_rfc3339(),
    }))
}

/// Readiness probe: only report ready when a database connection can be checked out.
pub async fn readiness_check(db_pool: Arc<deadpool_postgres::Pool>) -> ResponseJson<Value> {
    match db_pool.get().await {
        Ok(_) => ResponseJson(json!({"status": "ready"})),
        Err(_) => ResponseJson(json!({"status": "not ready", "reason": "database unavailable"})),
    }
}

/// Resolves when the process receives Ctrl+C or SIGTERM, allowing a graceful shutdown.
pub async fn shutdown_signal() {
    let ctrl_c = async {
        signal::ctrl_c()
            .await
            .expect("failed to install Ctrl+C handler");
    };

    #[cfg(unix)]
    let terminate = async {
        signal::unix::signal(signal::unix::SignalKind::terminate())
            .expect("failed to install signal handler")
            .recv()
            .await;
    };

    #[cfg(not(unix))]
    let terminate = std::future::pending::<()>();

    tokio::select! {
        _ = ctrl_c => {},
        _ = terminate => {},
    }

    println!("Shutdown signal received, starting graceful shutdown");
}
```
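A sketch of wiring these probes and the shutdown handler into a server, assuming axum 0.6-style APIs (matching the middleware signatures in the next section); the route paths and bind address are illustrative:

```rust
use axum::{routing::get, Router};
use std::net::SocketAddr;
use std::sync::Arc;

pub async fn run_server(db_pool: Arc<deadpool_postgres::Pool>) -> Result<(), Box<dyn std::error::Error>> {
    let app = Router::new()
        .route("/healthz", get(health_check))
        // Adapt readiness_check, which takes the pool directly, into a handler closure.
        .route("/readyz", get(move || readiness_check(db_pool.clone())));

    let addr: SocketAddr = "0.0.0.0:8080".parse()?;
    axum::Server::bind(&addr)
        .serve(app.into_make_service())
        // Stop accepting new connections and drain in-flight requests on Ctrl+C/SIGTERM.
        .with_graceful_shutdown(shutdown_signal())
        .await?;

    Ok(())
}
```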
## Security Considerations

### Production Security Setup
```rust
// Security headers middleware (axum 0.6-style, generic over the request body)
use axum::{
    http::{HeaderName, HeaderValue, Request, StatusCode},
    middleware::Next,
    response::{IntoResponse, Response},
};
use std::time::Duration;

pub async fn security_headers<B>(request: Request<B>, next: Next<B>) -> Response {
    let mut response = next.run(request).await;
    let headers = response.headers_mut();
    headers.insert(
        HeaderName::from_static("x-content-type-options"),
        HeaderValue::from_static("nosniff"),
    );
    headers.insert(
        HeaderName::from_static("x-frame-options"),
        HeaderValue::from_static("DENY"),
    );
    headers.insert(
        HeaderName::from_static("x-xss-protection"),
        HeaderValue::from_static("1; mode=block"),
    );
    response
}

// Request timeout middleware
pub async fn timeout_middleware<B>(request: Request<B>, next: Next<B>) -> Response {
    let timeout = Duration::from_secs(30);
    match tokio::time::timeout(timeout, next.run(request)).await {
        Ok(response) => response,
        // The inner handler did not finish in time; answer with 408.
        Err(_) => (StatusCode::REQUEST_TIMEOUT, "Request timeout").into_response(),
    }
}
```
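A short sketch of attaching these layers to a router with `axum::middleware::from_fn`; the example route is illustrative:

```rust
use axum::{middleware, routing::get, Router};

pub fn secured_router() -> Router {
    Router::new()
        .route("/", get(|| async { "ok" }))
        // from_fn turns the async functions above into tower layers.
        .layer(middleware::from_fn(security_headers))
        .layer(middleware::from_fn(timeout_middleware))
}
```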
## Best Practices Summary

- Use Multi-stage Docker Builds: Minimize image size and attack surface
- Implement Graceful Shutdown: Handle SIGTERM signals properly
- Monitor Resource Usage: Track memory, CPU, and network metrics
- Use Connection Pooling: Optimize database and HTTP client connections
- Implement Circuit Breakers: Prevent cascade failures (a minimal sketch follows this list)
- Security Headers: Add appropriate HTTP security headers
- Structured Logging: Use JSON logging for better observability
- Health Checks: Implement proper liveness and readiness probes
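
The circuit-breaker item above deserves a concrete shape. Below is a minimal, illustrative sketch (not tied to any particular crate): after `max_failures` consecutive errors the breaker opens and rejects calls until `cooldown` has elapsed, then lets a single probe request through.

```rust
use std::sync::Mutex;
use std::time::{Duration, Instant};

pub struct CircuitBreaker {
    state: Mutex<BreakerState>,
    max_failures: u32,
    cooldown: Duration,
}

struct BreakerState {
    consecutive_failures: u32,
    opened_at: Option<Instant>,
}

impl CircuitBreaker {
    pub fn new(max_failures: u32, cooldown: Duration) -> Self {
        Self {
            state: Mutex::new(BreakerState { consecutive_failures: 0, opened_at: None }),
            max_failures,
            cooldown,
        }
    }

    /// Returns false while the breaker is open (cooling down after repeated failures).
    pub fn allow_request(&self) -> bool {
        let mut state = self.state.lock().unwrap();
        if let Some(opened) = state.opened_at {
            if opened.elapsed() < self.cooldown {
                return false;
            }
            // Cooldown elapsed: move to half-open and let one attempt through.
            state.opened_at = None;
        }
        true
    }

    pub fn record_success(&self) {
        let mut state = self.state.lock().unwrap();
        state.consecutive_failures = 0;
        state.opened_at = None;
    }

    pub fn record_failure(&self) {
        let mut state = self.state.lock().unwrap();
        state.consecutive_failures += 1;
        if state.consecutive_failures >= self.max_failures {
            state.opened_at = Some(Instant::now());
        }
    }
}
```

In practice the scraper would call `allow_request` before each outbound fetch and `record_success`/`record_failure` afterwards, skipping hosts whose breaker is currently open.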
Similar to how you might use Puppeteer with Docker for Node.js applications, Rust applications benefit from containerization for consistent deployments. When dealing with complex scraping scenarios that require handling timeouts, Rust's robust error handling and async capabilities provide excellent production reliability.
By following these deployment strategies and best practices, your Rust web scraping applications will be well-positioned for production success with proper monitoring, security, and scalability considerations in place.