How do I handle timeouts with MechanicalSoup?
Timeout handling is crucial for building robust web scrapers with MechanicalSoup. When scraping websites, you may encounter slow-responding servers, flaky networks, or unresponsive pages that can cause your scraper to hang indefinitely. Proper timeout configuration keeps your scraper responsive and lets it recover from these failures gracefully.
Understanding Different Types of Timeouts
MechanicalSoup is built on top of the requests library, so its timeout behavior is the timeout behavior of requests. Two limits can be set per request (both forms are shown in the snippet below):
- Connection timeout: Time limit for establishing a connection to the server
- Read timeout: Time limit for receiving a response after the connection is established
Note that requests does not enforce an overall time limit for the entire request; if you need a hard wall-clock cap, you have to enforce it yourself, for example with retries or an outer watchdog.
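As a quick illustration, here is a minimal sketch of the two ways the timeout argument can be passed when opening a page; example.com is just a placeholder:
import mechanicalsoup

browser = mechanicalsoup.StatefulBrowser()

# A single number applies to both the connect and the read phase
browser.open("https://example.com", timeout=10)

# A (connect, read) tuple sets the two limits separately
browser.open("https://example.com", timeout=(3, 27))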
Basic Timeout Configuration
Setting Default Timeouts
The most straightforward way to handle timeouts in MechanicalSoup is to choose a default when creating the browser instance. One caveat: a plain requests.Session does not use a timeout attribute, so assigning browser.session.timeout only records the value; it is not applied automatically unless you pass it on each request or use a session that injects it (a sketch of such a session follows below):
import mechanicalsoup
# Create browser with timeout configuration
browser = mechanicalsoup.StatefulBrowser()
# Record a default timeout on the session (applied by the helper session
# sketched below, or passed explicitly as timeout=... on each request)
browser.session.timeout = 30  # 30 seconds for all requests
# Alternative: record connection and read timeouts separately
browser.session.timeout = (10, 30)  # (connection_timeout, read_timeout)
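If you want that stored default to be applied automatically, one option is to hand StatefulBrowser a small requests.Session subclass that injects it into every request. This is a minimal sketch rather than a MechanicalSoup feature; the TimeoutSession name and the (10, 30) default are assumptions you can adapt:
import mechanicalsoup
import requests

class TimeoutSession(requests.Session):
    """requests.Session that applies a default timeout to every request."""
    def __init__(self, timeout=(10, 30)):
        super().__init__()
        self.timeout = timeout  # (connection_timeout, read_timeout)

    def request(self, *args, **kwargs):
        # Only inject the default when the caller did not pass timeout=...
        kwargs.setdefault("timeout", self.timeout)
        return super().request(*args, **kwargs)

# MechanicalSoup accepts a pre-configured session
browser = mechanicalsoup.StatefulBrowser(session=TimeoutSession(timeout=(10, 30)))
With such a session in place, the browser.session.timeout assignments used in the rest of this article actually take effect, and an explicit timeout= on an individual call still wins.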
Per-Request Timeout Configuration
You can also set timeouts for individual requests:
import mechanicalsoup
from requests.exceptions import Timeout

browser = mechanicalsoup.StatefulBrowser()
# Open a page with a timeout that applies to this request only
try:
    browser.open("https://example.com", timeout=15)
except Timeout as e:
    print(f"Request timed out: {e}")
Advanced Timeout Handling Strategies
Using Session Configuration
For more control over timeout behavior, configure the underlying requests session:
import mechanicalsoup
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
# Create browser instance
browser = mechanicalsoup.StatefulBrowser()
# Configure a retry strategy for transient failures
retry_strategy = Retry(
    total=3,
    status_forcelist=[429, 500, 502, 503, 504],
    backoff_factor=1,
    respect_retry_after_header=True
)
# Mount the adapter so both HTTP and HTTPS requests use the retry strategy
adapter = HTTPAdapter(max_retries=retry_strategy)
browser.session.mount("http://", adapter)
browser.session.mount("https://", adapter)
# Record the default timeout: 5s connection, 30s read (applied by a
# default-timeout session as sketched above, or passed as timeout=... per request)
browser.session.timeout = (5, 30)
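As a usage sketch for the browser configured above: RetryError is what requests raises when the urllib3 retry budget is exhausted on the status codes listed in status_forcelist, and the URL is a placeholder:
from requests.exceptions import RetryError, Timeout

try:
    # An explicit timeout on the call always takes precedence
    response = browser.open("https://example.com", timeout=(5, 30))
    print(response.status_code)
except Timeout:
    print("The request exceeded the configured timeout")
except RetryError as e:
    print(f"Gave up after repeated retryable responses: {e}")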
Implementing Custom Timeout Handling
Create a wrapper function that handles different timeout scenarios:
import mechanicalsoup
import time
from requests.exceptions import ConnectTimeout, ReadTimeout, Timeout
def safe_open_page(browser, url, max_retries=3, base_timeout=10):
    """
    Safely open a page with progressive timeout handling
    """
    for attempt in range(max_retries):
        try:
            # Progressive timeout: increase timeout with each retry
            timeout = base_timeout * (attempt + 1)
            print(f"Attempt {attempt + 1}: Opening {url} with {timeout}s timeout")
            response = browser.open(url, timeout=timeout)
            if response.status_code == 200:
                return response
            else:
                print(f"HTTP {response.status_code} received")
        except ConnectTimeout:
            print(f"Connection timeout on attempt {attempt + 1}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff
                continue
            else:
                raise
        except ReadTimeout:
            print(f"Read timeout on attempt {attempt + 1}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)
                continue
            else:
                raise
        except Timeout as e:
            print(f"General timeout on attempt {attempt + 1}: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)
                continue
            else:
                raise
    raise Exception(f"Failed to open {url} after {max_retries} attempts")
# Usage example
browser = mechanicalsoup.StatefulBrowser()
try:
    response = safe_open_page(browser, "https://slow-website.com", max_retries=3)
    print("Page loaded successfully")
except Exception as e:
    print(f"Failed to load page: {e}")
Handling Form Submission Timeouts
When submitting forms, timeouts become especially important as server processing time can vary:
import mechanicalsoup
from requests.exceptions import Timeout
browser = mechanicalsoup.StatefulBrowser()
# Record a longer default timeout for form submissions (applied by a
# default-timeout session as sketched earlier, or passed per call)
browser.session.timeout = (10, 60)  # allow up to 60 seconds of server processing
try:
    # Open the login page
    browser.open("https://example.com/login")
    # Fill and submit the form
    browser.select_form('form[action="/login"]')
    browser["username"] = "your_username"
    browser["password"] = "your_password"
    # Submit with an extended timeout for slow server-side processing
    response = browser.submit_selected(timeout=120)
    if "dashboard" in response.url:
        print("Login successful")
    else:
        print("Login may have failed")
except Timeout:
    print("Form submission timed out")
except Exception as e:
    print(f"Error during form submission: {e}")
Monitoring and Debugging Timeouts
Adding Timeout Logging
Implement comprehensive logging to track timeout patterns:
import mechanicalsoup
import logging
import time
from requests.exceptions import Timeout, ConnectTimeout, ReadTimeout
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class TimeoutMechanicalSoup:
    def __init__(self, connection_timeout=10, read_timeout=30):
        self.browser = mechanicalsoup.StatefulBrowser()
        # Stored on the session for reference; passed explicitly below, because
        # a plain requests.Session does not apply a timeout attribute by itself
        self.browser.session.timeout = (connection_timeout, read_timeout)
        self.timeout_stats = {'total': 0, 'connection': 0, 'read': 0}

    def open_with_monitoring(self, url):
        start_time = time.time()
        try:
            response = self.browser.open(url, timeout=self.browser.session.timeout)
            elapsed = time.time() - start_time
            logger.info(f"Successfully loaded {url} in {elapsed:.2f}s")
            return response
        except ConnectTimeout:
            self.timeout_stats['connection'] += 1
            self.timeout_stats['total'] += 1
            elapsed = time.time() - start_time
            logger.error(f"Connection timeout for {url} after {elapsed:.2f}s")
            raise
        except ReadTimeout:
            self.timeout_stats['read'] += 1
            self.timeout_stats['total'] += 1
            elapsed = time.time() - start_time
            logger.error(f"Read timeout for {url} after {elapsed:.2f}s")
            raise

    def get_timeout_stats(self):
        return self.timeout_stats
# Usage
scraper = TimeoutMechanicalSoup(connection_timeout=5, read_timeout=30)
urls = ["https://site1.com", "https://site2.com", "https://slow-site.com"]
for url in urls:
    try:
        scraper.open_with_monitoring(url)
    except Timeout:
        print(f"Skipping {url} due to timeout")
print("Timeout statistics:", scraper.get_timeout_stats())
Best Practices for Timeout Configuration
1. Environment-Specific Timeouts
Configure different timeouts based on your environment:
import os
import mechanicalsoup
def create_browser_with_environment_timeouts():
    # Get environment or use defaults
    env = os.getenv('ENVIRONMENT', 'development')
    if env == 'production':
        # More conservative timeouts for production
        connection_timeout = 5
        read_timeout = 20
    elif env == 'testing':
        # Shorter timeouts for testing
        connection_timeout = 2
        read_timeout = 10
    else:
        # Development timeouts
        connection_timeout = 10
        read_timeout = 30
    browser = mechanicalsoup.StatefulBrowser()
    # Recorded here; apply it via a default-timeout session (see the sketch
    # earlier) or pass it as timeout=... on each request
    browser.session.timeout = (connection_timeout, read_timeout)
    return browser
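A brief usage sketch; the ENVIRONMENT variable name comes from the function above, and setting it in-process here is only for illustration:
import os

os.environ['ENVIRONMENT'] = 'production'  # normally set outside the program
browser = create_browser_with_environment_timeouts()

# Pass the stored pair explicitly so it is definitely enforced
browser.open("https://example.com", timeout=browser.session.timeout)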
2. Adaptive Timeout Strategy
Implement timeouts that adapt based on website performance:
import mechanicalsoup
import time
from collections import defaultdict
class AdaptiveTimeoutBrowser:
    def __init__(self):
        self.browser = mechanicalsoup.StatefulBrowser()
        self.site_performance = defaultdict(list)
        self.base_timeout = 10

    def get_adaptive_timeout(self, url):
        domain = url.split('/')[2]
        recent_times = self.site_performance[domain][-5:]  # Last 5 requests
        if recent_times:
            avg_time = sum(recent_times) / len(recent_times)
            # Set timeout to 3x average response time, minimum 5 seconds
            return max(5, int(avg_time * 3))
        else:
            return self.base_timeout

    def open_adaptive(self, url):
        timeout = self.get_adaptive_timeout(url)
        domain = url.split('/')[2]
        start_time = time.time()
        try:
            response = self.browser.open(url, timeout=timeout)
            elapsed = time.time() - start_time
            self.site_performance[domain].append(elapsed)
            return response
        except Exception as e:
            print(f"Failed to load {url} with {timeout}s timeout: {e}")
            raise
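A short usage sketch for the adaptive browser; the URLs are placeholders, and repeated requests to the same domain are what feed its moving average:
adaptive = AdaptiveTimeoutBrowser()

for url in ["https://example.com/page1", "https://example.com/page2"]:
    try:
        response = adaptive.open_adaptive(url)
        print(url, response.status_code)
    except Exception as e:
        print(f"Giving up on {url}: {e}")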
Integration with Error Handling
Combine timeout handling with comprehensive error management, similar to how timeouts are handled in Puppeteer:
import mechanicalsoup
from requests.exceptions import Timeout, ConnectionError, HTTPError
import time
def robust_scraping_session(urls):
    """
    Scrape multiple URLs with robust timeout and error handling
    """
    browser = mechanicalsoup.StatefulBrowser()
    browser.session.timeout = (5, 30)
    results = []
    for url in urls:
        retries = 3
        for attempt in range(retries):
            try:
                # Pass the timeout explicitly so it is actually enforced
                response = browser.open(url, timeout=browser.session.timeout)
                # Process the page
                soup = browser.get_current_page()
                title = soup.find('title')
                results.append({
                    'url': url,
                    'status': 'success',
                    'title': title.text if title else 'No title',
                    'attempt': attempt + 1
                })
                break
            except Timeout:
                if attempt < retries - 1:
                    wait_time = 2 ** attempt
                    print(f"Timeout for {url}, retrying in {wait_time}s...")
                    time.sleep(wait_time)
                else:
                    results.append({
                        'url': url,
                        'status': 'timeout',
                        'error': 'Request timed out after all retries'
                    })
            except (ConnectionError, HTTPError) as e:
                results.append({
                    'url': url,
                    'status': 'error',
                    'error': str(e)
                })
                break
    return results
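One way to call it and inspect the outcome; the URLs are placeholders:
urls = [
    "https://example.com",
    "https://example.org/very-slow-page",
]

for result in robust_scraping_session(urls):
    print(result['url'], '->', result['status'])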
Conclusion
Proper timeout handling in MechanicalSoup is essential for building reliable web scrapers. By configuring appropriate connection and read timeouts, implementing retry strategies, and monitoring timeout patterns, you can create robust scrapers that handle network issues gracefully.
Key takeaways:
- Always set reasonable timeouts to prevent hanging requests
- Use different timeout values for connection and read operations
- Implement retry logic with exponential backoff for temporary failures
- Monitor timeout patterns to optimize your scraper's performance
- Adapt timeout values based on target website characteristics
For more complex scenarios involving JavaScript-heavy sites, consider complementing MechanicalSoup with tools like Puppeteer for handling AJAX requests, where timeout handling strategies can be even more sophisticated.