What are the common HTTP errors in Python web scraping and how do I handle them?
HTTP errors are an inevitable part of web scraping. Whether you're dealing with missing pages, rate limits, or server issues, understanding how to handle these errors gracefully is crucial for building robust scraping applications. This guide covers the most common HTTP errors you'll encounter in Python web scraping and provides practical solutions for handling them.
Common HTTP Status Codes in Web Scraping
1. 404 Not Found
The 404 error occurs when the requested resource doesn't exist on the server. This is common when scraping dynamic URLs or when pages have been moved or deleted.
import requests
from requests.exceptions import HTTPError

def handle_404_error(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raises HTTPError for bad (4xx/5xx) responses
        return response
    except HTTPError as e:
        if e.response.status_code == 404:
            print(f"Page not found: {url}")
            return None
        else:
            raise

# Example usage
url = "https://example.com/non-existent-page"
result = handle_404_error(url)
2. 403 Forbidden
A 403 error indicates that the server understood the request but refuses to authorize it. This often happens when websites detect automated scraping behavior.
import requests
import time

def handle_403_with_retry(url, max_retries=3):
    session = requests.Session()
    # Add a User-Agent header to appear more like a real browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    session.headers.update(headers)

    for attempt in range(max_retries):
        try:
            response = session.get(url)
            if response.status_code == 403:
                print(f"Access forbidden (attempt {attempt + 1}). Waiting before retry...")
                time.sleep(2 ** attempt)  # Exponential backoff
                continue
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}")
            if attempt == max_retries - 1:
                raise
    return None
3. 429 Too Many Requests
The 429 status code means the server is rate-limiting your requests. Handling it correctly is crucial to avoid being blocked permanently.
import requests
import time
from requests.exceptions import HTTPError

def handle_rate_limiting(url, max_retries=5):
    for attempt in range(max_retries):
        try:
            response = requests.get(url)
            if response.status_code == 429:
                # Check for a Retry-After header
                retry_after = response.headers.get('Retry-After')
                if retry_after:
                    wait_time = int(retry_after)
                else:
                    # Fall back to exponential backoff if no Retry-After header
                    wait_time = (2 ** attempt) + 1
                print(f"Rate limited. Waiting {wait_time} seconds...")
                time.sleep(wait_time)
                continue
            response.raise_for_status()
            return response
        except HTTPError:
            if attempt == max_retries - 1:
                raise
            time.sleep(2 ** attempt)
    return None
4. 500 Internal Server Error
Server errors (5xx) indicate problems on the server side. These are often temporary and can be resolved with retry logic.
import requests
import time
from requests.exceptions import HTTPError

def handle_server_errors(url, max_retries=3):
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return response
        except HTTPError as e:
            status_code = e.response.status_code
            if 500 <= status_code < 600 and attempt < max_retries - 1:
                wait_time = (2 ** attempt) + 1
                print(f"Server error {status_code}. Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
                continue
            raise
        except requests.exceptions.Timeout:
            print(f"Request timeout (attempt {attempt + 1})")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)
                continue
            raise
    return None
Comprehensive Error Handling Strategy
Here's a robust error handling class that combines all the above strategies:
import requests
import time
import logging
from requests.exceptions import (
    HTTPError, ConnectionError, Timeout,
    RequestException, TooManyRedirects
)
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

class RobustScraper:
    def __init__(self, max_retries=3, backoff_factor=1, timeout=10):
        self.session = requests.Session()
        self.max_retries = max_retries
        self.backoff_factor = backoff_factor
        self.timeout = timeout

        # Configure retry strategy
        retry_strategy = Retry(
            total=max_retries,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"],  # "method_whitelist" in urllib3 < 1.26
            backoff_factor=backoff_factor
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)

        # Set default headers
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        # Set up logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def get(self, url, **kwargs):
        """Robust GET request with comprehensive error handling."""
        try:
            response = self.session.get(url, timeout=self.timeout, **kwargs)
            response.raise_for_status()
            return response
        except HTTPError as e:
            status_code = e.response.status_code
            if status_code == 404:
                self.logger.warning(f"Page not found: {url}")
                return None
            elif status_code == 403:
                self.logger.warning(f"Access forbidden: {url}")
                return None
            elif status_code == 429:
                self.logger.warning(f"Rate limited: {url}")
                return None
            else:
                self.logger.error(f"HTTP error {status_code}: {url}")
                raise
        except ConnectionError as e:
            self.logger.error(f"Connection error for {url}: {e}")
            raise
        except Timeout as e:
            self.logger.error(f"Timeout error for {url}: {e}")
            raise
        except TooManyRedirects as e:
            self.logger.error(f"Too many redirects for {url}: {e}")
            raise
        except RequestException as e:
            self.logger.error(f"Request exception for {url}: {e}")
            raise

# Usage example
scraper = RobustScraper(max_retries=3, backoff_factor=2)
urls = [
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/non-existent"
]

for url in urls:
    try:
        response = scraper.get(url)
        if response:
            print(f"Successfully scraped {url}: {len(response.content)} bytes")
    except Exception as e:
        print(f"Failed to scrape {url}: {e}")
Handling Specific Error Scenarios
Proxy-Related Errors
When using proxies, you might encounter additional connection issues:
import requests
from requests.exceptions import ProxyError, ConnectTimeout

def scrape_with_proxy(url, proxy_list):
    for proxy in proxy_list:
        try:
            proxies = {
                'http': f'http://{proxy}',
                'https': f'https://{proxy}'
            }
            response = requests.get(url, proxies=proxies, timeout=10)
            response.raise_for_status()
            return response
        except (ProxyError, ConnectTimeout) as e:
            print(f"Proxy {proxy} failed: {e}")
            continue
        except Exception as e:
            print(f"Unexpected error with proxy {proxy}: {e}")
            continue
    raise Exception("All proxies failed")
SSL Certificate Errors
Some websites have SSL certificate issues that need special handling:
import requests
from requests.exceptions import SSLError
import urllib3

# Disable SSL warnings (use cautiously)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def handle_ssl_errors(url):
    try:
        # Try with SSL verification first
        response = requests.get(url, verify=True, timeout=10)
        response.raise_for_status()
        return response
    except SSLError as e:
        print(f"SSL error encountered: {e}")
        try:
            # Retry without SSL verification (not recommended for production)
            response = requests.get(url, verify=False, timeout=10)
            response.raise_for_status()
            return response
        except Exception as e:
            print(f"Request failed even without SSL verification: {e}")
            raise
Best Practices for Error Handling
1. Implement Exponential Backoff
Always use exponential backoff when retrying failed requests to avoid overwhelming the server:
import time
import random

import requests

def exponential_backoff(attempt, base_delay=1, max_delay=60):
    """Calculate a delay with jitter to avoid the thundering herd problem."""
    delay = min(base_delay * (2 ** attempt), max_delay)
    jitter = random.uniform(0, 0.1 * delay)
    return delay + jitter

# Usage in a retry loop (url and max_retries defined elsewhere)
for attempt in range(max_retries):
    try:
        response = requests.get(url)
        break
    except Exception as e:
        if attempt < max_retries - 1:
            delay = exponential_backoff(attempt)
            time.sleep(delay)
2. Monitor Error Rates
Keep track of error rates to identify problematic websites or patterns:
from collections import defaultdict
import time

class ErrorTracker:
    def __init__(self):
        self.errors = defaultdict(int)
        self.total_requests = 0
        self.start_time = time.time()

    def record_error(self, error_type, url=None):
        self.errors[error_type] += 1

    def record_request(self):
        self.total_requests += 1

    def get_error_rate(self):
        if self.total_requests == 0:
            return 0
        return sum(self.errors.values()) / self.total_requests

    def print_summary(self):
        runtime = time.time() - self.start_time
        print(f"Runtime: {runtime:.2f} seconds")
        print(f"Total requests: {self.total_requests}")
        print(f"Error rate: {self.get_error_rate():.2%}")
        for error_type, count in self.errors.items():
            print(f"  {error_type}: {count}")
3. Respect robots.txt and Rate Limits
Always check the website's robots.txt file and add appropriate delays between requests. For complex authentication flows or JavaScript-heavy sites, consider tools like Puppeteer for handling authentication, which can offer more robust error handling for dynamic content.
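As a concrete starting point, Python's standard urllib.robotparser module can check whether a URL may be fetched and whether the site declares a crawl delay. The sketch below is a minimal example, assuming a hypothetical user agent string and example.com placeholder URLs, and falling back to a one-second delay when no crawl delay is declared:

import time
from urllib.robotparser import RobotFileParser

import requests

USER_AGENT = "MyScraperBot/1.0"  # hypothetical user agent string

# Load and parse the site's robots.txt
parser = RobotFileParser()
parser.set_url("https://example.com/robots.txt")
parser.read()

url = "https://example.com/some-page"
if parser.can_fetch(USER_AGENT, url):
    # Honor the declared crawl delay, or fall back to a conservative 1-second default
    delay = parser.crawl_delay(USER_AGENT) or 1
    time.sleep(delay)
    response = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=10)
else:
    print(f"robots.txt disallows fetching {url}")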
Connection and Network Errors
Beyond HTTP status codes, you'll encounter various network-level errors:
Connection Timeout Errors
import requests
from requests.exceptions import ConnectTimeout, ReadTimeout

def handle_timeouts(url, connect_timeout=5, read_timeout=30):
    try:
        response = requests.get(
            url,
            timeout=(connect_timeout, read_timeout)
        )
        return response
    except ConnectTimeout:
        print(f"Connection timeout for {url}")
        return None
    except ReadTimeout:
        print(f"Read timeout for {url}")
        return None
DNS Resolution Errors
import requests
from requests.exceptions import ConnectionError

def handle_dns_errors(url):
    try:
        response = requests.get(url, timeout=10)
        return response
    except ConnectionError as e:
        if "Name or service not known" in str(e):
            print(f"DNS resolution failed for {url}")
        elif "Connection refused" in str(e):
            print(f"Connection refused by {url}")
        else:
            print(f"Connection error: {e}")
        return None
Advanced Error Handling Patterns
Circuit Breaker Pattern
Implement a circuit breaker to stop making requests to a failing service:
import time
from enum import Enum

import requests

class CircuitState(Enum):
    CLOSED = 1
    OPEN = 2
    HALF_OPEN = 3

class CircuitBreaker:
    def __init__(self, failure_threshold=5, recovery_timeout=60):
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
        self.failure_count = 0
        self.last_failure_time = None
        self.state = CircuitState.CLOSED

    def call(self, func, *args, **kwargs):
        if self.state == CircuitState.OPEN:
            if time.time() - self.last_failure_time > self.recovery_timeout:
                self.state = CircuitState.HALF_OPEN
            else:
                raise Exception("Circuit breaker is OPEN")
        try:
            result = func(*args, **kwargs)
            self.on_success()
            return result
        except Exception:
            self.on_failure()
            raise

    def on_success(self):
        self.failure_count = 0
        self.state = CircuitState.CLOSED

    def on_failure(self):
        self.failure_count += 1
        self.last_failure_time = time.time()
        if self.failure_count >= self.failure_threshold:
            self.state = CircuitState.OPEN

# Usage
breaker = CircuitBreaker(failure_threshold=3, recovery_timeout=30)

def make_request(url):
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return response

try:
    response = breaker.call(make_request, "https://example.com")
except Exception as e:
    print(f"Request failed: {e}")
Logging and Monitoring
Proper logging is essential for debugging and monitoring your scrapers:
import logging
import sys
from datetime import datetime

import requests

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('scraper.log'),
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)

def scrape_with_logging(url):
    logger.info(f"Starting scrape for {url}")
    start_time = datetime.now()
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        duration = (datetime.now() - start_time).total_seconds()
        logger.info(f"Successfully scraped {url} in {duration:.2f}s")
        return response
    except requests.exceptions.HTTPError as e:
        logger.error(f"HTTP error for {url}: {e}")
        raise
    except requests.exceptions.ConnectionError as e:
        logger.error(f"Connection error for {url}: {e}")
        raise
    except requests.exceptions.Timeout as e:
        logger.error(f"Timeout error for {url}: {e}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error for {url}: {e}")
        raise
Conclusion
Effective error handling in Python web scraping requires a multi-layered approach. By implementing proper retry logic, exponential backoff, circuit breakers, and comprehensive error logging, you can build resilient scrapers that handle various HTTP errors gracefully. Remember to always respect website terms of service and implement reasonable delays between requests to maintain good web citizenship.
For more complex scenarios involving JavaScript-heavy websites, you might need to consider browser automation tools that can handle timeouts and errors more effectively when dealing with dynamic content loading.
The key is to anticipate errors, handle them gracefully, and always have fallback strategies in place to ensure your scraping operations remain robust and reliable.