What is the Best Way to Handle Errors and Exceptions in MechanicalSoup?
Error handling is crucial for building robust web scraping applications with MechanicalSoup. Websites can be unpredictable, connections can fail, and pages can change structure unexpectedly. Implementing proper error handling ensures your scrapers are resilient and can gracefully recover from various failure scenarios.
Understanding Common MechanicalSoup Exceptions
MechanicalSoup defines only a few exceptions of its own (such as LinkNotFoundError); most errors you'll encounter are raised by its underlying libraries, requests and BeautifulSoup. Here are the most common ones:
Network-Related Exceptions
```python
import mechanicalsoup
from requests.exceptions import (
    ConnectionError,
    Timeout,
    RequestException,
)

# Connection failures surface as requests exceptions
try:
    browser = mechanicalsoup.StatefulBrowser()
    page = browser.open("https://nonexistent-website.com")
except ConnectionError as e:
    print(f"Failed to connect: {e}")
except Timeout as e:
    print(f"Request timed out: {e}")
except RequestException as e:
    print(f"Request failed: {e}")
```
HTTP Status Code Errors
```python
import mechanicalsoup
from requests.exceptions import HTTPError

def handle_http_errors(url):
    browser = mechanicalsoup.StatefulBrowser()
    browser.set_user_agent("Mozilla/5.0 (compatible; Web Scraper)")

    try:
        response = browser.open(url)

        # Check for HTTP errors
        if response.status_code >= 400:
            if response.status_code == 404:
                raise HTTPError(f"Page not found: {url}")
            elif response.status_code == 403:
                raise HTTPError(f"Access forbidden: {url}")
            elif response.status_code >= 500:
                raise HTTPError(f"Server error: {response.status_code}")
            else:
                raise HTTPError(f"HTTP error: {response.status_code}")

        return response

    except HTTPError as e:
        print(f"HTTP Error: {e}")
        return None
    except Exception as e:
        print(f"Unexpected error: {e}")
        return None
```
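If you don't need a custom message per status code, requests can do this check for you: `Response.raise_for_status()` raises an `HTTPError` for any 4xx or 5xx response.

```python
# Equivalent shortcut using requests' built-in check
response = browser.open(url)
response.raise_for_status()  # raises HTTPError on 4xx/5xx
```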
Parsing and Element Selection Errors
```python
import mechanicalsoup

def safe_element_extraction(browser, selector):
    try:
        # get_current_page() returns None if nothing has been loaded yet,
        # in which case .select() raises the AttributeError caught below
        page = browser.get_current_page()

        # Safe element selection
        elements = page.select(selector)
        if not elements:
            raise ValueError(f"No elements found with selector: {selector}")
        return elements

    except AttributeError as e:
        print(f"Element attribute error: {e}")
        return []
    except ValueError as e:
        print(f"Element selection error: {e}")
        return []
    except Exception as e:
        print(f"Parsing error: {e}")
        return []
```
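A usage sketch, with a hypothetical page and selector:

```python
browser = mechanicalsoup.StatefulBrowser()
browser.open("https://example.com")
links = safe_element_extraction(browser, "a.article-link")  # hypothetical selector
```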
Comprehensive Error Handling Strategy
1. Multi-Layer Exception Handling
Implement error handling at multiple levels for maximum robustness:
```python
import mechanicalsoup
import time
import logging
from requests.exceptions import Timeout, ConnectionError
from urllib.parse import urljoin

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class RobustScraper:
    def __init__(self, base_url, timeout=30, retries=3):
        self.base_url = base_url
        self.browser = mechanicalsoup.StatefulBrowser()
        self.timeout = timeout
        self.retries = retries

        # Configure browser settings
        self.browser.set_user_agent(
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        )

    def safe_request(self, url, method='GET', **kwargs):
        """Make a safe HTTP request with retry logic"""
        full_url = urljoin(self.base_url, url)
        # requests ignores a timeout set on the session, so pass it per request
        kwargs.setdefault('timeout', self.timeout)

        for attempt in range(self.retries + 1):
            try:
                if method.upper() == 'GET':
                    response = self.browser.open(full_url, **kwargs)
                else:
                    response = self.browser.session.request(method, full_url, **kwargs)

                # Validate response
                response.raise_for_status()
                logger.info(f"Successfully retrieved: {full_url}")
                return response

            except ConnectionError as e:
                logger.warning(f"Connection error (attempt {attempt + 1}): {e}")
                if attempt < self.retries:
                    time.sleep(2 ** attempt)  # Exponential backoff
                else:
                    logger.error(f"Failed to connect after {self.retries + 1} attempts")
                    raise
            except Timeout as e:
                logger.warning(f"Timeout error (attempt {attempt + 1}): {e}")
                if attempt < self.retries:
                    time.sleep(2 ** attempt)
                else:
                    logger.error(f"Request timed out after {self.retries + 1} attempts")
                    raise
            except Exception as e:
                logger.error(f"Unexpected error: {e}")
                raise

    def safe_extract_data(self, selectors):
        """Safely extract data using CSS selectors"""
        results = {}
        page = self.browser.get_current_page()

        if not page:
            raise RuntimeError("No page loaded")

        for key, selector in selectors.items():
            try:
                elements = page.select(selector)
                if elements:
                    # Extract text safely; return a list when several elements match
                    if len(elements) == 1:
                        results[key] = elements[0].get_text(strip=True)
                    else:
                        results[key] = [elem.get_text(strip=True) for elem in elements]
                else:
                    logger.warning(f"No elements found for selector '{selector}'")
                    results[key] = None
            except Exception as e:
                logger.error(f"Error extracting '{key}' with selector '{selector}': {e}")
                results[key] = None

        return results
```
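A usage sketch (the base URL and selectors are hypothetical):

```python
scraper = RobustScraper("https://example.com", timeout=15, retries=2)
scraper.safe_request("/products")  # resolved against base_url
data = scraper.safe_extract_data({
    "title": "h1",               # hypothetical selectors
    "prices": ".product-price",
})
print(data)
```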
2. Custom Exception Classes
Create custom exceptions for better error categorization:
```python
class ScrapingError(Exception):
    """Base exception for scraping errors"""
    pass

class PageNotFoundError(ScrapingError):
    """Raised when a page is not found"""
    pass

class ElementNotFoundError(ScrapingError):
    """Raised when required elements are not found"""
    pass

class RateLimitError(ScrapingError):
    """Raised when rate limiting is detected"""
    pass

def enhanced_scraper(url, required_selectors):
    browser = mechanicalsoup.StatefulBrowser()

    try:
        # Request the page
        response = browser.open(url)

        if response.status_code == 404:
            raise PageNotFoundError(f"Page not found: {url}")
        elif response.status_code == 429:
            raise RateLimitError("Rate limit exceeded")
        elif response.status_code >= 400:
            raise ScrapingError(f"HTTP error {response.status_code}: {url}")

        # Extract required data
        page = browser.get_current_page()
        results = {}

        for key, selector in required_selectors.items():
            elements = page.select(selector)
            if not elements:
                raise ElementNotFoundError(f"Required element '{key}' not found")
            results[key] = elements[0].get_text(strip=True)

        return results

    except ScrapingError:
        # Re-raise our own exceptions unchanged (covers all subclasses)
        raise
    except Exception as e:
        # Wrap unexpected exceptions
        raise ScrapingError(f"Unexpected error: {e}") from e
```
3. Form Submission Error Handling
Handle form-related errors specifically:
```python
def safe_form_submission(browser, form_selector, form_data):
    """Safely submit forms with error handling"""
    try:
        # Find the form
        page = browser.get_current_page()
        forms = page.select(form_selector)

        if not forms:
            raise ElementNotFoundError(f"Form not found: {form_selector}")

        form = forms[0]

        # Fill form fields safely
        for field_name, value in form_data.items():
            try:
                field = form.find('input', {'name': field_name}) or \
                        form.find('select', {'name': field_name}) or \
                        form.find('textarea', {'name': field_name})

                if not field:
                    logger.warning(f"Field '{field_name}' not found in form")
                    continue

                if field.name == 'input':
                    field['value'] = value
                elif field.name == 'textarea':
                    field.string = value
                # Handle select fields separately if needed

            except Exception as e:
                logger.error(f"Error setting field '{field_name}': {e}")
                continue

        # Submit the form; pass the page URL so relative form actions resolve
        response = browser.submit(form, browser.get_url())

        if response.status_code >= 400:
            raise HTTPError(f"Form submission failed: {response.status_code}")

        return response

    except ElementNotFoundError:
        raise
    except Exception as e:
        raise ScrapingError(f"Form submission error: {e}") from e
```
Advanced Error Handling Patterns
Circuit Breaker Pattern
Implement a circuit breaker to handle persistent failures:
```python
import time
from enum import Enum

class CircuitState(Enum):
    CLOSED = "closed"
    OPEN = "open"
    HALF_OPEN = "half_open"

class CircuitBreaker:
    def __init__(self, failure_threshold=5, timeout=60):
        self.failure_threshold = failure_threshold
        self.timeout = timeout
        self.failure_count = 0
        self.last_failure_time = None
        self.state = CircuitState.CLOSED

    def call(self, func, *args, **kwargs):
        if self.state == CircuitState.OPEN:
            if time.time() - self.last_failure_time > self.timeout:
                self.state = CircuitState.HALF_OPEN
            else:
                raise ScrapingError("Circuit breaker is OPEN")

        try:
            result = func(*args, **kwargs)
            self.on_success()
            return result
        except Exception:
            self.on_failure()
            raise

    def on_success(self):
        self.failure_count = 0
        self.state = CircuitState.CLOSED

    def on_failure(self):
        self.failure_count += 1
        self.last_failure_time = time.time()
        if self.failure_count >= self.failure_threshold:
            self.state = CircuitState.OPEN

# Usage example
circuit_breaker = CircuitBreaker()

def scrape_with_circuit_breaker(url):
    browser = mechanicalsoup.StatefulBrowser()

    def make_request():
        return browser.open(url)

    try:
        return circuit_breaker.call(make_request)
    except ScrapingError as e:
        logger.error(f"Circuit breaker prevented request: {e}")
        return None
```
Retry with Exponential Backoff
Implement sophisticated retry logic:
```python
import time
import random
from functools import wraps

def retry_with_backoff(max_retries=3, base_delay=1, max_delay=60, exceptions=(Exception,)):
    """Decorator for retry logic with exponential backoff"""
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries + 1):
                try:
                    return func(*args, **kwargs)
                except exceptions as e:
                    if attempt == max_retries:
                        logger.error(f"Failed after {max_retries + 1} attempts: {e}")
                        raise
                    # Exponential backoff with random jitter, capped at max_delay
                    delay = min(base_delay * (2 ** attempt) + random.uniform(0, 1), max_delay)
                    logger.warning(f"Attempt {attempt + 1} failed: {e}. Retrying in {delay:.2f}s")
                    time.sleep(delay)
        return wrapper
    return decorator

# Usage
@retry_with_backoff(max_retries=3, exceptions=(ConnectionError, Timeout))
def reliable_scrape(url):
    browser = mechanicalsoup.StatefulBrowser()
    return browser.open(url)
```
Best Practices for Error Handling
1. Logging and Monitoring
```python
import logging
from datetime import datetime

# Configure structured logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

def log_scraping_activity(url, success, error_msg=None, duration=None):
    """Log scraping activities for monitoring"""
    log_data = {
        'timestamp': datetime.now().isoformat(),
        'url': url,
        'success': success,
        'duration': duration,
        'error': error_msg
    }

    if success:
        logger.info(f"Scraping successful: {log_data}")
    else:
        logger.error(f"Scraping failed: {log_data}")
```
2. Graceful Degradation
```python
def scrape_with_fallbacks(url, primary_selectors, fallback_selectors):
    """Scrape with fallback selectors for resilience"""
    browser = mechanicalsoup.StatefulBrowser()

    try:
        browser.open(url)
        page = browser.get_current_page()
        results = {}

        for key, selector in primary_selectors.items():
            # Try the primary selector first
            try:
                elements = page.select(selector)
                if elements:
                    results[key] = elements[0].get_text(strip=True)
                    continue
            except Exception:
                pass

            # Fall back to an alternative selector
            if key in fallback_selectors:
                try:
                    elements = page.select(fallback_selectors[key])
                    if elements:
                        results[key] = elements[0].get_text(strip=True)
                        logger.info(f"Used fallback selector for '{key}'")
                    else:
                        results[key] = None
                        logger.warning(f"Both primary and fallback selectors failed for '{key}'")
                except Exception as e:
                    logger.error(f"Fallback selector also failed for '{key}': {e}")
                    results[key] = None
            else:
                # No fallback defined for this key; record the miss explicitly
                results[key] = None

        return results

    except Exception as e:
        logger.error(f"Complete scraping failure for {url}: {e}")
        return None
```
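Usage, with hypothetical selectors:

```python
data = scrape_with_fallbacks(
    "https://example.com",
    primary_selectors={"title": "h1.product-title"},  # hypothetical selectors
    fallback_selectors={"title": "h1"},
)
```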
Error Handling for Different Scenarios
Handling JavaScript-Heavy Sites
While MechanicalSoup doesn't execute JavaScript, you can implement fallbacks for dynamic content similar to how Puppeteer handles errors:
```python
def handle_dynamic_content(url, static_selectors, api_endpoints=None):
    """Handle sites that might have dynamic content"""
    browser = mechanicalsoup.StatefulBrowser()

    try:
        browser.open(url)
        page = browser.get_current_page()

        # Try static content extraction
        results = {}
        for key, selector in static_selectors.items():
            elements = page.select(selector)
            if elements:
                results[key] = elements[0].get_text(strip=True)
            else:
                results[key] = None

        # Check if we got meaningful data
        if all(value is None for value in results.values()):
            logger.warning("No static content found, site might be JavaScript-heavy")

            # Try API endpoints as a fallback
            if api_endpoints:
                results = try_api_fallback(api_endpoints)

        return results

    except Exception as e:
        logger.error(f"Error handling dynamic content: {e}")
        return None

def try_api_fallback(api_endpoints):
    """Try to get data from API endpoints"""
    browser = mechanicalsoup.StatefulBrowser()
    results = {}

    for key, endpoint in api_endpoints.items():
        try:
            response = browser.open(endpoint)
            if response.status_code == 200:
                # Assume a JSON response
                results[key] = response.json()
            else:
                results[key] = None
        except Exception as e:
            logger.error(f"API fallback failed for {key}: {e}")
            results[key] = None

    return results
```
Memory Management Error Handling
```python
import gc
import mechanicalsoup
import psutil  # third-party: pip install psutil

def memory_aware_scraping(urls, memory_threshold=80):
    """Scrape with memory monitoring"""
    browser = mechanicalsoup.StatefulBrowser()
    results = []

    for i, url in enumerate(urls):
        # Check memory usage
        memory_percent = psutil.virtual_memory().percent
        if memory_percent > memory_threshold:
            logger.warning(f"High memory usage ({memory_percent}%). Cleaning up...")

            # Recreate the browser instance to drop its session and parsed pages
            browser.close()
            browser = mechanicalsoup.StatefulBrowser()

            # Force garbage collection
            gc.collect()

        try:
            browser.open(url)
            # Process the page...
            results.append({"url": url, "status": "success"})
        except Exception as e:
            logger.error(f"Error scraping {url}: {e}")
            results.append({"url": url, "status": "error", "error": str(e)})

        # Progress logging
        if (i + 1) % 10 == 0:
            logger.info(f"Processed {i + 1}/{len(urls)} URLs")

    return results
```
Conclusion
Effective error handling in MechanicalSoup requires a multi-layered approach that addresses network issues, parsing errors, and application-specific failures. By implementing proper exception handling, retry logic, circuit breakers, and fallback mechanisms, you can build robust web scrapers that gracefully handle various failure scenarios.
Key takeaways for error handling in MechanicalSoup:
- Use specific exception types for different error categories
- Implement retry logic with exponential backoff for transient failures
- Log errors comprehensively for debugging and monitoring
- Design fallback mechanisms for critical data extraction
- Monitor resource usage to prevent memory-related issues
- Test error scenarios to ensure your handlers work as expected (see the sketch after this list)
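As a minimal testing sketch, unittest.mock can simulate transient failures to verify that the retry decorator defined earlier really retries (flaky_fetch and fetcher are hypothetical names used only for this test):

```python
from unittest import mock
from requests.exceptions import ConnectionError

@retry_with_backoff(max_retries=2, base_delay=0, exceptions=(ConnectionError,))
def flaky_fetch(fetcher):
    return fetcher()

def test_retries_then_succeeds():
    # Fail twice, then succeed; the decorator should absorb both failures
    fetcher = mock.Mock(side_effect=[ConnectionError(), ConnectionError(), "ok"])
    with mock.patch("time.sleep"):  # skip real backoff delays in the test
        assert flaky_fetch(fetcher) == "ok"
    assert fetcher.call_count == 3
```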
Remember that robust error handling is not just about catching exceptions—it's about building resilient systems that can adapt to changing conditions and provide meaningful feedback when things go wrong.