What are the best practices for error handling in Python web scraping?
Error handling is crucial for building reliable Python web scraping applications. Scrapers face many potential failure points, including network issues, server errors, rate limiting, and unexpected content changes. This guide covers practical error handling strategies that make your scraping projects more resilient and maintainable.
Understanding Common Web Scraping Errors
1. HTTP Status Code Errors
HTTP errors are the most common issues in web scraping. Here's how to handle them effectively:
import requests
from requests.exceptions import HTTPError, RequestException
import time
import logging
def fetch_page_with_error_handling(url, max_retries=3):
    """
    Fetch a web page with comprehensive HTTP error handling.
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=10)

            # Check for HTTP errors
            if response.status_code == 200:
                return response
            elif response.status_code == 429:
                # Rate limiting - wait and retry
                wait_time = 2 ** attempt  # Exponential backoff
                logging.warning(f"Rate limited. Waiting {wait_time} seconds...")
                time.sleep(wait_time)
                continue
            elif response.status_code == 404:
                logging.error(f"Page not found: {url}")
                return None
            elif response.status_code >= 500:
                # Server error - retry
                logging.warning(f"Server error {response.status_code}. Retrying...")
                time.sleep(2 ** attempt)
                continue
            else:
                # Other client errors
                logging.error(f"HTTP {response.status_code}: {url}")
                return None
        except RequestException as e:
            # HTTPError is a subclass of RequestException, so this covers both
            logging.error(f"Request failed: {e}")
            if attempt == max_retries - 1:
                raise
            # Wait before retry
            time.sleep(1)

    return None
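A minimal usage sketch, assuming the imports and function above; the URL is a placeholder:

logging.basicConfig(level=logging.INFO)

response = fetch_page_with_error_handling("https://example.com/products")  # placeholder URL
if response is not None:
    print(f"Fetched {len(response.text)} characters")
else:
    print("Giving up on this page")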
2. Network and Connection Errors
Network issues are inevitable in web scraping. Implement robust connection error handling:
from requests.exceptions import (
    ConnectionError,
    Timeout,
    TooManyRedirects,
    RequestException
)
import socket

def robust_request(url, session=None, **kwargs):
    """
    Make an HTTP request with comprehensive network error handling.
    """
    if session is None:
        session = requests.Session()

    # Set default timeouts: (connect, read)
    kwargs.setdefault('timeout', (10, 30))

    try:
        response = session.get(url, **kwargs)
        return response
    except ConnectionError as e:
        logging.error(f"Connection error for {url}: {e}")
        raise
    except Timeout as e:
        logging.error(f"Timeout for {url}: {e}")
        raise
    except TooManyRedirects as e:
        logging.error(f"Too many redirects for {url}: {e}")
        raise
    except socket.gaierror as e:
        logging.error(f"DNS resolution failed for {url}: {e}")
        raise
    except RequestException as e:
        logging.error(f"Request exception for {url}: {e}")
        raise
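Because robust_request logs and then re-raises, the caller decides how to recover. A minimal sketch of one calling pattern, reusing a single Session and assuming the definitions above (the URLs are placeholders):

session = requests.Session()

for url in ["https://example.com/a", "https://example.com/b"]:  # placeholder URLs
    try:
        response = robust_request(url, session=session)
        print(url, response.status_code)
    except (RequestException, socket.gaierror):
        # Already logged inside robust_request; skip this URL and move on
        continue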
Implementing Retry Logic with Exponential Backoff
Retry logic is essential for handling transient failures:
import random
from functools import wraps

def retry_with_backoff(max_retries=3, backoff_factor=1, max_backoff=60):
    """
    Decorator for implementing retry logic with exponential backoff.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries):
                try:
                    return func(*args, **kwargs)
                except (ConnectionError, Timeout, HTTPError) as e:
                    if attempt == max_retries - 1:
                        raise
                    # Calculate backoff time with jitter
                    backoff_time = min(
                        backoff_factor * (2 ** attempt) + random.uniform(0, 1),
                        max_backoff
                    )
                    logging.warning(
                        f"Attempt {attempt + 1} failed: {e}. "
                        f"Retrying in {backoff_time:.2f} seconds..."
                    )
                    time.sleep(backoff_time)
        return wrapper
    return decorator

# Usage example
@retry_with_backoff(max_retries=5, backoff_factor=2)
def scrape_with_retry(url):
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return response.text
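After the final attempt the original exception propagates, so the decorated function can still be wrapped in a try/except at the call site. A small sketch with a placeholder URL:

try:
    html = scrape_with_retry("https://example.com/catalog")  # placeholder URL
    print(f"Got {len(html)} characters")
except (ConnectionError, Timeout, HTTPError) as e:
    logging.error(f"All retries exhausted: {e}")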
Parsing and Data Extraction Error Handling
Handle errors during HTML parsing and data extraction:
from bs4 import BeautifulSoup
import json

def safe_extract_data(html_content):
    """
    Safely extract data with error handling for missing elements.
    """
    try:
        soup = BeautifulSoup(html_content, 'html.parser')

        # Safe text extraction with fallbacks
        title = safe_get_text(soup, 'h1.title', 'No title found')
        price = safe_get_attribute(soup, '.price', 'data-price', 'N/A')
        description = safe_get_text(soup, '.description', 'No description')

        # Safe JSON extraction
        script_data = safe_extract_json(soup, 'script[type="application/ld+json"]')

        return {
            'title': title,
            'price': price,
            'description': description,
            'structured_data': script_data
        }
    except Exception as e:
        logging.error(f"Error parsing HTML: {e}")
        return None

def safe_get_text(soup, selector, default=""):
    """
    Safely extract text from an element with a fallback value.
    """
    try:
        element = soup.select_one(selector)
        return element.get_text(strip=True) if element else default
    except (AttributeError, TypeError) as e:
        logging.warning(f"Error extracting text from {selector}: {e}")
        return default

def safe_get_attribute(soup, selector, attribute, default=""):
    """
    Safely extract an attribute value with a fallback value.
    """
    try:
        element = soup.select_one(selector)
        return element.get(attribute, default) if element else default
    except (AttributeError, TypeError) as e:
        logging.warning(f"Error extracting {attribute} from {selector}: {e}")
        return default

def safe_extract_json(soup, selector):
    """
    Safely extract and parse JSON from a script tag.
    """
    try:
        script_tag = soup.select_one(selector)
        if script_tag and script_tag.string:
            return json.loads(script_tag.string)
    except (json.JSONDecodeError, AttributeError) as e:
        logging.warning(f"Error parsing JSON from {selector}: {e}")
    return {}
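A quick sketch of these helpers against an inline HTML snippet; the markup and the selectors used above (h1.title, .price, .description) are illustrative assumptions, not a real site's structure:

sample_html = """
<html><body>
  <h1 class="title">Example Product</h1>
  <span class="price" data-price="19.99">$19.99</span>
  <p class="description">A sample description.</p>
</body></html>
"""

print(safe_extract_data(sample_html))
# -> {'title': 'Example Product', 'price': '19.99',
#     'description': 'A sample description.', 'structured_data': {}}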
Circuit Breaker Pattern for Rate Limiting
Implement the circuit breaker pattern to fail fast during sustained failures instead of repeatedly hitting a struggling or rate-limiting site:
import time
from enum import Enum

class CircuitState(Enum):
    CLOSED = "closed"        # Normal operation
    OPEN = "open"            # Failing fast
    HALF_OPEN = "half_open"  # Testing whether the service has recovered

class CircuitBreaker:
    def __init__(self, failure_threshold=5, recovery_timeout=60, expected_exception=Exception):
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
        self.expected_exception = expected_exception
        self.failure_count = 0
        self.last_failure_time = None
        self.state = CircuitState.CLOSED

    def call(self, func, *args, **kwargs):
        if self.state == CircuitState.OPEN:
            if self._should_attempt_reset():
                self.state = CircuitState.HALF_OPEN
            else:
                raise Exception("Circuit breaker is OPEN")

        try:
            result = func(*args, **kwargs)
            self._on_success()
            return result
        except self.expected_exception:
            self._on_failure()
            raise

    def _should_attempt_reset(self):
        return time.time() - self.last_failure_time >= self.recovery_timeout

    def _on_success(self):
        self.failure_count = 0
        self.state = CircuitState.CLOSED

    def _on_failure(self):
        self.failure_count += 1
        self.last_failure_time = time.time()
        if self.failure_count >= self.failure_threshold:
            self.state = CircuitState.OPEN

# Usage example
circuit_breaker = CircuitBreaker(failure_threshold=3, recovery_timeout=30)

def protected_scrape(url):
    return circuit_breaker.call(requests.get, url, timeout=10)
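In a crawl loop, the breaker fails fast once the threshold is reached instead of hammering a struggling site. A minimal sketch, assuming the definitions above (placeholder URLs):

for url in [f"https://example.com/page{i}" for i in range(1, 11)]:  # placeholder URLs
    try:
        response = protected_scrape(url)
        print(f"{url}: HTTP {response.status_code}")
    except Exception as e:
        # Covers both request failures and the "Circuit breaker is OPEN" error
        print(f"Skipped {url}: {e}")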
Logging and Monitoring Best Practices
Implement comprehensive logging for debugging and monitoring:
import logging
from datetime import datetime
import sys

def setup_scraping_logger(log_level=logging.INFO):
    """
    Set up comprehensive logging for web scraping.
    """
    # Create formatter
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    # Console handler
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(formatter)

    # File handler
    file_handler = logging.FileHandler(
        f'scraping_{datetime.now().strftime("%Y%m%d")}.log'
    )
    file_handler.setFormatter(formatter)

    # Configure logger; attach handlers only once so repeated calls
    # don't produce duplicate log lines
    logger = logging.getLogger('web_scraper')
    logger.setLevel(log_level)
    if not logger.handlers:
        logger.addHandler(console_handler)
        logger.addHandler(file_handler)

    return logger

class ScrapingMetrics:
    """
    Track scraping metrics for monitoring.
    """
    def __init__(self):
        self.requests_made = 0
        self.successful_requests = 0
        self.failed_requests = 0
        self.error_types = {}
        self.start_time = time.time()

    def record_success(self):
        self.requests_made += 1
        self.successful_requests += 1

    def record_failure(self, error_type):
        self.requests_made += 1
        self.failed_requests += 1
        self.error_types[error_type] = self.error_types.get(error_type, 0) + 1

    def get_stats(self):
        duration = time.time() - self.start_time
        success_rate = (self.successful_requests / self.requests_made * 100
                        if self.requests_made > 0 else 0)
        return {
            'total_requests': self.requests_made,
            'success_rate': f"{success_rate:.1f}%",
            'duration': f"{duration:.1f}s",
            'errors_by_type': self.error_types
        }
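These two pieces are usually wired together: record every outcome in the metrics object and log it. A minimal sketch, assuming the logger, metrics class, and earlier imports (placeholder URLs):

logger = setup_scraping_logger()
metrics = ScrapingMetrics()

for url in ["https://example.com/1", "https://example.com/2"]:  # placeholder URLs
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        metrics.record_success()
    except requests.RequestException as e:
        metrics.record_failure(type(e).__name__)
        logger.error(f"Request failed for {url}: {e}")

logger.info(f"Run summary: {metrics.get_stats()}")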
Complete Example: Robust Web Scraper
Here's a complete example that combines all best practices:
import requests
from bs4 import BeautifulSoup
import time
import logging
from urllib.parse import urljoin, urlparse

class RobustWebScraper:
    def __init__(self, base_url, delay=1, max_retries=3):
        self.base_url = base_url
        self.delay = delay
        self.max_retries = max_retries
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; WebScraper/1.0)'
        })
        self.metrics = ScrapingMetrics()
        self.logger = setup_scraping_logger()

    def scrape_page(self, url):
        """
        Scrape a single page with comprehensive error handling.
        """
        try:
            # Fetch page content
            html_content = self._fetch_with_retry(url)
            if not html_content:
                return None

            # Parse and extract data
            data = self._extract_data(html_content, url)

            self.metrics.record_success()
            self.logger.info(f"Successfully scraped: {url}")

            # Respect rate limiting
            time.sleep(self.delay)

            return data
        except Exception as e:
            self.metrics.record_failure(type(e).__name__)
            self.logger.error(f"Failed to scrape {url}: {e}")
            return None

    def _fetch_with_retry(self, url):
        """
        Fetch page content with retry logic.
        """
        for attempt in range(self.max_retries):
            try:
                response = self.session.get(url, timeout=(10, 30))

                if response.status_code == 200:
                    return response.text
                elif response.status_code == 429:
                    wait_time = 2 ** attempt * 5
                    self.logger.warning(f"Rate limited. Waiting {wait_time}s")
                    time.sleep(wait_time)
                    continue
                else:
                    self.logger.error(f"HTTP {response.status_code}: {url}")
                    return None
            except requests.RequestException as e:
                if attempt == self.max_retries - 1:
                    raise
                self.logger.warning(f"Attempt {attempt + 1} failed: {e}")
                time.sleep(2 ** attempt)

        return None

    def _extract_data(self, html_content, url):
        """
        Extract data with error handling.
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            return {
                'url': url,
                'title': safe_get_text(soup, 'title'),
                'headings': [h.get_text(strip=True) for h in soup.find_all(['h1', 'h2'])],
                'links': self._extract_links(soup, url),
                'scraped_at': time.time()
            }
        except Exception as e:
            self.logger.error(f"Error extracting data from {url}: {e}")
            return None

    def _extract_links(self, soup, base_url):
        """
        Extract and normalize links.
        """
        links = []
        for link in soup.find_all('a', href=True):
            try:
                absolute_url = urljoin(base_url, link['href'])
                if self._is_valid_url(absolute_url):
                    links.append(absolute_url)
            except Exception as e:
                self.logger.warning(f"Error processing link {link.get('href')}: {e}")
        return links

    def _is_valid_url(self, url):
        """
        Validate URL format.
        """
        try:
            result = urlparse(url)
            return all([result.scheme, result.netloc])
        except Exception:
            return False

    def get_metrics(self):
        """
        Get scraping metrics.
        """
        return self.metrics.get_stats()

# Usage example
if __name__ == "__main__":
    scraper = RobustWebScraper("https://example.com", delay=2)

    urls_to_scrape = [
        "https://example.com/page1",
        "https://example.com/page2",
        "https://example.com/page3"
    ]

    results = []
    for url in urls_to_scrape:
        result = scraper.scrape_page(url)
        if result:
            results.append(result)

    print("Scraping completed!")
    print("Metrics:", scraper.get_metrics())
Handling Rate Limiting and Anti-Bot Measures
Sophisticated websites may require additional strategies beyond retries, such as randomized delays and user-agent rotation. For JavaScript-heavy sites, consider browser automation tools like Puppeteer to handle dynamic content, and apply the same timeout and retry discipline there as you do with plain HTTP requests.
import random

def add_human_like_delays():
    """
    Return a randomized delay (in seconds) that mimics human browsing behavior.
    """
    base_delay = random.uniform(1, 3)
    variation = random.uniform(-0.5, 0.5)
    return max(0.1, base_delay + variation)

def rotate_user_agents():
    """
    Return a random user agent string to reduce the chance of being flagged.
    """
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
    ]
    return random.choice(user_agents)
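These helpers are typically applied per request: pick a fresh user agent and sleep a randomized amount between fetches. A minimal sketch, assuming the functions above and the earlier imports (placeholder URLs):

session = requests.Session()

for url in ["https://example.com/a", "https://example.com/b"]:  # placeholder URLs
    session.headers['User-Agent'] = rotate_user_agents()
    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        logging.warning(f"Request failed for {url}: {e}")
    time.sleep(add_human_like_delays())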
Conclusion
Implementing robust error handling in Python web scraping is essential for building reliable and maintainable applications. By following these best practices—including proper exception handling, retry logic, circuit breaker patterns, and comprehensive logging—you can create scrapers that gracefully handle failures and provide valuable insights into their operation.
Remember to always respect website terms of service, implement appropriate delays, and consider using professional scraping services when dealing with complex or large-scale requirements. For advanced scenarios involving JavaScript-heavy websites, exploring browser automation tools can provide additional reliability and capability.