What are the best practices for using MechanicalSoup in production?
When deploying MechanicalSoup applications to production, following established best practices is crucial for reliability, performance, and maintainability. This guide covers the essential practices for running MechanicalSoup scrapers in production systems.
1. Robust Error Handling and Recovery
Production environments require comprehensive error handling to ensure your scrapers can handle unexpected situations gracefully.
Implement Comprehensive Exception Handling
import mechanicalsoup
import requests
import time
import logging
from urllib.parse import urljoin
def robust_scraper():
browser = mechanicalsoup.StatefulBrowser()
try:
# Configure browser with production settings
browser.session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
})
        # Pass timeouts per request: requests ignores a bare `timeout` attribute on the Session,
        # and MechanicalSoup forwards extra keyword arguments to the underlying requests call
        response = browser.open("https://example.com", timeout=(10, 30))  # (connect, read)
response.raise_for_status()
# Process the page
page = browser.get_current_page()
return extract_data(page)
except requests.exceptions.Timeout:
logging.error("Request timed out")
return None
except requests.exceptions.ConnectionError:
logging.error("Connection error occurred")
return None
except requests.exceptions.HTTPError as e:
logging.error(f"HTTP error: {e.response.status_code}")
return None
except Exception as e:
logging.error(f"Unexpected error: {str(e)}")
return None
finally:
browser.close()
Implement Retry Logic with Exponential Backoff
import random
from functools import wraps
def retry_with_backoff(max_retries=3, base_delay=1, max_delay=60):
def decorator(func):
@wraps(func)
def wrapper(*args, **kwargs):
for attempt in range(max_retries):
try:
return func(*args, **kwargs)
                except Exception as e:  # deliberately broad; narrow to requests.exceptions.RequestException if preferred
                    if attempt == max_retries - 1:
                        raise
delay = min(base_delay * (2 ** attempt) + random.uniform(0, 1), max_delay)
logging.warning(f"Attempt {attempt + 1} failed. Retrying in {delay:.2f} seconds")
time.sleep(delay)
return None
return wrapper
return decorator
@retry_with_backoff(max_retries=3)
def scrape_with_retry(url):
    browser = mechanicalsoup.StatefulBrowser()
    try:
        browser.open(url)
        return browser.get_current_page()
    finally:
        browser.close()
2. Session Management and Resource Optimization
Proper session management is critical for production performance and memory efficiency.
Use Session Pooling and Reuse
class ProductionScraper:
def __init__(self):
self.browser = mechanicalsoup.StatefulBrowser()
self.configure_session()
def configure_session(self):
# Configure connection pooling
adapter = requests.adapters.HTTPAdapter(
pool_connections=10,
pool_maxsize=20,
max_retries=3
)
self.browser.session.mount('http://', adapter)
self.browser.session.mount('https://', adapter)
# Set session-wide headers
self.browser.session.headers.update({
'User-Agent': 'ProductionBot/1.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive',
})
def scrape_multiple_pages(self, urls):
results = []
for url in urls:
try:
response = self.browser.open(url)
if response.status_code == 200:
results.append(self.extract_data(self.browser.get_current_page()))
time.sleep(1) # Rate limiting
except Exception as e:
logging.error(f"Error scraping {url}: {str(e)}")
continue
return results
def close(self):
self.browser.close()
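A minimal usage sketch for this class (the URLs are placeholders, and extract_data is assumed to be implemented on the class):

scraper = ProductionScraper()
try:
    results = scraper.scrape_multiple_pages([
        'https://example.com/page-1',
        'https://example.com/page-2',
    ])
finally:
    # Always release the underlying session
    scraper.close()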
Memory Management
import gc
import psutil
import os
class MemoryOptimizedScraper:
def __init__(self, memory_threshold_mb=500):
self.memory_threshold = memory_threshold_mb
self.browser = mechanicalsoup.StatefulBrowser()
def check_memory_usage(self):
process = psutil.Process(os.getpid())
memory_mb = process.memory_info().rss / 1024 / 1024
if memory_mb > self.memory_threshold:
logging.warning(f"Memory usage high: {memory_mb:.2f}MB")
self.cleanup_resources()
def cleanup_resources(self):
        # Clear session cookies to drop stale state between batches
        self.browser.session.cookies.clear()
# Force garbage collection
gc.collect()
# Recreate browser if needed
self.browser.close()
self.browser = mechanicalsoup.StatefulBrowser()
def scrape_with_monitoring(self, url):
self.check_memory_usage()
response = self.browser.open(url)
return self.browser.get_current_page()
3. Rate Limiting and Respectful Scraping
Production scrapers must implement proper rate limiting to avoid overwhelming target servers and prevent IP blocking.
Advanced Rate Limiting
import threading
from collections import defaultdict
from datetime import datetime, timedelta
class RateLimiter:
def __init__(self):
self.requests = defaultdict(list)
self.lock = threading.Lock()
def can_make_request(self, domain, max_requests=10, window_seconds=60):
with self.lock:
now = datetime.now()
cutoff = now - timedelta(seconds=window_seconds)
# Remove old requests
self.requests[domain] = [
req_time for req_time in self.requests[domain]
if req_time > cutoff
]
if len(self.requests[domain]) < max_requests:
self.requests[domain].append(now)
return True
return False
def wait_for_rate_limit(self, domain, max_requests=10, window_seconds=60):
while not self.can_make_request(domain, max_requests, window_seconds):
time.sleep(1)
class RateLimitedScraper:
def __init__(self):
self.browser = mechanicalsoup.StatefulBrowser()
self.rate_limiter = RateLimiter()
def scrape_url(self, url):
from urllib.parse import urlparse
domain = urlparse(url).netloc
# Wait for rate limit
self.rate_limiter.wait_for_rate_limit(domain)
try:
response = self.browser.open(url)
return self.browser.get_current_page()
except Exception as e:
logging.error(f"Error scraping {url}: {str(e)}")
return None
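Respectful scraping also means honoring robots.txt. A minimal check built on the standard library's urllib.robotparser, which could be called before RateLimitedScraper.scrape_url (the user-agent string and the permissive fallback are assumptions to adapt to your own policy):

from urllib import robotparser
from urllib.parse import urlparse

def is_allowed(url, user_agent='ProductionBot/1.0'):
    """Check the target site's robots.txt before fetching a URL."""
    parsed = urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    parser = robotparser.RobotFileParser()
    parser.set_url(robots_url)
    try:
        parser.read()
    except Exception:
        # robots.txt could not be fetched; pick a permissive or conservative default
        return True
    return parser.can_fetch(user_agent, url)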
4. Monitoring and Logging
Comprehensive monitoring is essential for production deployments to track performance and identify issues.
Structured Logging
import logging
import json
from datetime import datetime
class StructuredLogger:
def __init__(self, name):
self.logger = logging.getLogger(name)
handler = logging.StreamHandler()
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)
self.logger.addHandler(handler)
self.logger.setLevel(logging.INFO)
def log_scraping_event(self, event_type, url, status_code=None,
response_time=None, error=None):
log_data = {
'timestamp': datetime.utcnow().isoformat(),
'event_type': event_type,
'url': url,
'status_code': status_code,
'response_time_ms': response_time,
'error': error
}
self.logger.info(json.dumps(log_data))
class MonitoredScraper:
def __init__(self):
self.browser = mechanicalsoup.StatefulBrowser()
self.logger = StructuredLogger('scraper')
self.metrics = {
'requests_made': 0,
'successful_requests': 0,
'failed_requests': 0,
'total_response_time': 0
}
def scrape_with_monitoring(self, url):
start_time = time.time()
try:
self.metrics['requests_made'] += 1
response = self.browser.open(url)
response_time = (time.time() - start_time) * 1000
self.logger.log_scraping_event(
'request_success',
url,
response.status_code,
response_time
)
self.metrics['successful_requests'] += 1
self.metrics['total_response_time'] += response_time
return self.browser.get_current_page()
except Exception as e:
response_time = (time.time() - start_time) * 1000
self.metrics['failed_requests'] += 1
self.logger.log_scraping_event(
'request_error',
url,
error=str(e),
response_time=response_time
)
raise e
def get_metrics(self):
if self.metrics['requests_made'] > 0:
            successful = self.metrics['successful_requests']
            # Average over successful requests only, since response time is recorded on success
            avg_response_time = self.metrics['total_response_time'] / successful if successful else 0
success_rate = self.metrics['successful_requests'] / self.metrics['requests_made']
return {
'total_requests': self.metrics['requests_made'],
'success_rate': success_rate,
'average_response_time_ms': avg_response_time,
'failed_requests': self.metrics['failed_requests']
}
return self.metrics
5. Configuration Management
Use environment-based configuration for production deployments.
Environment Configuration
import os
from dataclasses import dataclass
@dataclass
class ScraperConfig:
user_agent: str = os.getenv('SCRAPER_USER_AGENT', 'MechanicalSoup/1.0')
request_timeout: int = int(os.getenv('REQUEST_TIMEOUT', '30'))
max_retries: int = int(os.getenv('MAX_RETRIES', '3'))
rate_limit_requests: int = int(os.getenv('RATE_LIMIT_REQUESTS', '10'))
rate_limit_window: int = int(os.getenv('RATE_LIMIT_WINDOW', '60'))
proxy_url: str = os.getenv('PROXY_URL', '')
log_level: str = os.getenv('LOG_LEVEL', 'INFO')
class ConfigurableScraper:
def __init__(self, config: ScraperConfig = None):
self.config = config or ScraperConfig()
self.browser = self.create_browser()
def create_browser(self):
browser = mechanicalsoup.StatefulBrowser()
# Apply configuration
browser.session.headers.update({
'User-Agent': self.config.user_agent
})
        # Note: requests ignores a `timeout` attribute on the Session; pass
        # timeout=self.config.request_timeout to browser.open() for each request instead
if self.config.proxy_url:
browser.session.proxies = {
'http': self.config.proxy_url,
'https': self.config.proxy_url
}
return browser
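Keep in mind that the dataclass defaults above read the environment when the class is defined, so environment variables must be exported before the module is imported. Values can also be passed explicitly, as in this sketch (the values and URL are illustrative):

config = ScraperConfig(
    user_agent='MyCompanyBot/2.0 (+https://example.com/bot-info)',
    request_timeout=15,
    max_retries=5,
)
scraper = ConfigurableScraper(config)
# Pass the timeout per request, since requests ignores Session-level timeout attributes
response = scraper.browser.open('https://example.com', timeout=config.request_timeout)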
6. Scalability and Performance
For high-volume production environments, consider implementing concurrent processing and caching strategies.
Concurrent Processing
from concurrent.futures import ThreadPoolExecutor
import mechanicalsoup
class ScalableScraper:
def __init__(self, max_workers=5):
self.max_workers = max_workers
self.executor = ThreadPoolExecutor(max_workers=max_workers)
def scrape_single_url(self, url):
browser = mechanicalsoup.StatefulBrowser()
try:
response = browser.open(url)
return {
'url': url,
'status': 'success',
'data': self.extract_data(browser.get_current_page())
}
except Exception as e:
return {
'url': url,
'status': 'error',
'error': str(e)
}
finally:
browser.close()
def scrape_urls_parallel(self, urls):
future_to_url = {
self.executor.submit(self.scrape_single_url, url): url
for url in urls
}
results = []
for future in future_to_url:
try:
result = future.result(timeout=60)
results.append(result)
except Exception as e:
url = future_to_url[future]
results.append({
'url': url,
'status': 'timeout',
'error': str(e)
})
return results
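The caching strategy mentioned at the start of this section is not built into MechanicalSoup; one lightweight option is an in-memory cache keyed by URL with a time-to-live, as in this sketch (the default TTL is an assumption):

import time
import mechanicalsoup

class CachingScraper:
    """Serve recently fetched pages from memory instead of re-requesting them."""
    def __init__(self, ttl_seconds=300):
        self.ttl = ttl_seconds
        self.cache = {}  # url -> (fetched_at, parsed_page)
        self.browser = mechanicalsoup.StatefulBrowser()

    def get_page(self, url):
        now = time.time()
        cached = self.cache.get(url)
        if cached and now - cached[0] < self.ttl:
            return cached[1]  # cache hit: skip the network round trip
        self.browser.open(url)
        page = self.browser.get_current_page()
        self.cache[url] = (now, page)
        return page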
7. Security Considerations
Production environments require additional security measures to protect both your application and target websites.
Security Best Practices
import certifi
class SecureScraper:
def __init__(self):
self.browser = mechanicalsoup.StatefulBrowser()
self.configure_security()
    def configure_security(self):
        # Verify TLS certificates against the certifi (Mozilla) CA bundle;
        # requests verifies certificates by default, this just pins the bundle explicitly
        self.browser.session.verify = certifi.where()
# Set secure headers
self.browser.session.headers.update({
'DNT': '1', # Do Not Track
'Upgrade-Insecure-Requests': '1'
})
def scrape_securely(self, url):
# Validate URL before making request
if not self.is_safe_url(url):
raise ValueError(f"Unsafe URL: {url}")
return self.browser.open(url)
def is_safe_url(self, url):
# Add URL validation logic
from urllib.parse import urlparse
parsed = urlparse(url)
        return parsed.scheme in ('http', 'https') and bool(parsed.netloc)
Deployment Considerations
Docker Configuration
FROM python:3.9-slim
WORKDIR /app
# Install system dependencies
RUN apt-get update && apt-get install -y \
gcc \
&& rm -rf /var/lib/apt/lists/*
# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application code
COPY . .
# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV LOG_LEVEL=INFO
# Create non-root user
RUN useradd -r -s /bin/false scraper
USER scraper
CMD ["python", "scraper.py"]
Health Checks and Monitoring
from flask import Flask, jsonify
from datetime import datetime
import threading

app = Flask(__name__)
@app.route('/health')
def health_check():
# Implement health check logic
return jsonify({
'status': 'healthy',
'timestamp': datetime.utcnow().isoformat()
})
@app.route('/metrics')
def metrics():
    # scraper_instance is assumed to be a module-level MonitoredScraper (see section 4)
    return jsonify(scraper_instance.get_metrics())
def run_health_server():
app.run(host='0.0.0.0', port=8080)
# Start health check server in separate thread
health_thread = threading.Thread(target=run_health_server)
health_thread.daemon = True
health_thread.start()
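If the health server runs inside the Docker image from the previous section, the container itself can poll it. A possible HEALTHCHECK instruction for that Dockerfile (the port matches the Flask server above; interval, timeout, and retries are assumptions):

HEALTHCHECK --interval=30s --timeout=5s --retries=3 \
  CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8080/health')" || exit 1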
Conclusion
Production deployment of MechanicalSoup applications requires careful attention to error handling, resource management, monitoring, and security. By implementing these best practices, you can ensure your web scraping applications run reliably and efficiently in production environments while respecting target websites and maintaining system stability.
Remember to always comply with websites' robots.txt files, terms of service, and applicable laws when deploying scrapers to production. Regular monitoring and maintenance are essential for long-term success in production environments.