Robust error handling is crucial when working with urllib3 for web scraping and HTTP requests. Proper error handling ensures your applications can gracefully handle network issues, server errors, and other failures that commonly occur in distributed systems.
Core Error Handling Strategies
1. Implement Comprehensive Retry Logic
The Retry class in urllib3 provides sophisticated retry mechanisms for handling transient failures:
import urllib3
from urllib3.util.retry import Retry
from urllib3.util import Timeout

# Configure retry strategy
retry_strategy = Retry(
    total=3,                                     # Total number of retries
    read=3,                                      # Read-error retries
    connect=3,                                   # Connection-error retries
    backoff_factor=1,                            # Exponential backoff multiplier
    status_forcelist=[429, 500, 502, 503, 504],  # HTTP status codes to retry
    allowed_methods=["HEAD", "GET", "OPTIONS"],  # HTTP methods to retry
)

# Create pool manager with retry configuration
http = urllib3.PoolManager(
    retries=retry_strategy,
    timeout=Timeout(connect=5.0, read=10.0),
    maxsize=10,
    block=True,
)

try:
    response = http.request('GET', 'https://api.example.com/data')
    print(f"Success: {response.status}")
except urllib3.exceptions.MaxRetryError as e:
    print(f"Max retries exceeded: {e}")
    # Handle permanent failure
except Exception as e:
    print(f"Unexpected error: {e}")
2. Handle Specific Exception Types
urllib3 provides specific exceptions for different error conditions. Note that when retries are enabled, most low-level failures are wrapped in MaxRetryError, so the example below disables retries to let the specific exceptions surface directly:
import urllib3
from urllib3.exceptions import (
    HTTPError, MaxRetryError, TimeoutError,  # TimeoutError shadows the builtin
    SSLError, ProxyError, ProtocolError,     # ProtocolError replaced the old ConnectionError alias
    ReadTimeoutError, ConnectTimeoutError
)

def make_request_with_error_handling(url):
    http = urllib3.PoolManager()
    try:
        # retries=False lets the specific exceptions below propagate
        # instead of being wrapped in MaxRetryError
        response = http.request('GET', url, timeout=10.0, retries=False)
        return response
    except ConnectTimeoutError:
        print("Connection timed out - server may be down")
        return None
    except ReadTimeoutError:
        print("Read timed out - server response too slow")
        return None
    except SSLError as e:
        print(f"SSL/TLS error: {e}")
        return None
    except ProxyError as e:
        print(f"Proxy error: {e}")
        return None
    except MaxRetryError as e:
        # Only reached if a caller re-enables retries
        print(f"Max retries exceeded for {url}: {e.reason}")
        return None
    except HTTPError as e:
        print(f"HTTP error: {e}")
        return None
    except Exception as e:
        print(f"Unexpected error: {e}")
        return None
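When retries stay enabled, the original error is preserved on MaxRetryError's reason attribute, so you can still branch on the underlying cause. A minimal sketch:

from urllib3.exceptions import (
    MaxRetryError, ConnectTimeoutError, ReadTimeoutError, SSLError
)

def classify_retry_failure(exc: MaxRetryError) -> str:
    """Map a MaxRetryError to its underlying cause via the .reason attribute."""
    reason = exc.reason
    if isinstance(reason, ConnectTimeoutError):
        return "connect-timeout"
    if isinstance(reason, ReadTimeoutError):
        return "read-timeout"
    if isinstance(reason, SSLError):
        return "ssl"
    return f"other ({type(reason).__name__})"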
3. Validate HTTP Response Status
Always check response status codes, even when no exceptions are raised:
def handle_response(response):
    """Process HTTP response with proper status code handling"""
    if response.status == 200:
        return response.data
    elif response.status == 404:
        print("Resource not found")
        return None
    elif response.status == 429:
        print("Rate limited - implement backoff")
        return None
    elif 400 <= response.status < 500:
        print(f"Client error: {response.status}")
        return None
    elif 500 <= response.status < 600:
        print(f"Server error: {response.status}")
        return None
    else:
        print(f"Unexpected status: {response.status}")
        return None
# Usage example, combining this with the error-handling wrapper from section 2
response = make_request_with_error_handling('https://api.example.com/users/123')
if response:
    data = handle_response(response)
    if data:
        print("Successfully retrieved data")
4. Implement Circuit Breaker Pattern
For high-reliability applications, implement a circuit breaker to prevent cascading failures:
import time
from enum import Enum

class CircuitState(Enum):
    CLOSED = "closed"
    OPEN = "open"
    HALF_OPEN = "half_open"

class CircuitBreaker:
    def __init__(self, failure_threshold=5, timeout=60):
        self.failure_threshold = failure_threshold
        self.timeout = timeout
        self.failure_count = 0
        self.last_failure_time = None
        self.state = CircuitState.CLOSED

    def call(self, func, *args, **kwargs):
        if self.state == CircuitState.OPEN:
            if time.time() - self.last_failure_time > self.timeout:
                self.state = CircuitState.HALF_OPEN
            else:
                raise Exception("Circuit breaker is OPEN")
        try:
            result = func(*args, **kwargs)
            self.on_success()
            return result
        except Exception:
            self.on_failure()
            raise

    def on_success(self):
        self.failure_count = 0
        self.state = CircuitState.CLOSED

    def on_failure(self):
        self.failure_count += 1
        self.last_failure_time = time.time()
        if self.failure_count >= self.failure_threshold:
            self.state = CircuitState.OPEN

# Usage with urllib3
circuit_breaker = CircuitBreaker(failure_threshold=3, timeout=30)

def protected_request(url):
    http = urllib3.PoolManager()
    return http.request('GET', url)

try:
    response = circuit_breaker.call(protected_request, 'https://api.example.com/data')
    print("Request successful")
except Exception as e:
    print(f"Request failed: {e}")
5. Structured Logging and Monitoring
Implement comprehensive logging for debugging and monitoring:
import json
import logging
import time
from datetime import datetime, timezone

import urllib3

# Configure structured logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

def make_monitored_request(url, method='GET', **kwargs):
    """Make HTTP request with comprehensive logging and monitoring"""
    start_time = time.time()
    request_id = f"req_{int(start_time * 1000)}"

    # Log request start
    logger.info(json.dumps({
        "event": "request_start",
        "request_id": request_id,
        "method": method,
        "url": url,
        "timestamp": datetime.now(timezone.utc).isoformat()
    }))

    http = urllib3.PoolManager()
    try:
        response = http.request(method, url, **kwargs)
        duration = time.time() - start_time

        # Log successful response
        logger.info(json.dumps({
            "event": "request_success",
            "request_id": request_id,
            "status_code": response.status,
            "duration_ms": round(duration * 1000, 2),
            "response_size": len(response.data),
            "timestamp": datetime.now(timezone.utc).isoformat()
        }))
        return response
    except Exception as e:
        duration = time.time() - start_time

        # Log error with context
        logger.error(json.dumps({
            "event": "request_error",
            "request_id": request_id,
            "error_type": type(e).__name__,
            "error_message": str(e),
            "duration_ms": round(duration * 1000, 2),
            "timestamp": datetime.now(timezone.utc).isoformat()
        }))
        raise
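Usage is the same as a plain request; failures are re-raised after the structured error event is logged (the URL is a placeholder):

try:
    response = make_monitored_request('https://api.example.com/data', timeout=10.0)
    print(f"Monitored request returned {response.status}")
except Exception:
    # The structured error event has already been logged above
    pass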
Resource Management Best Practices
6. Proper Connection Management
Always ensure proper cleanup of resources:
import contextlib

@contextlib.contextmanager
def http_pool_manager(**kwargs):
    """Context manager for urllib3 PoolManager"""
    pool = urllib3.PoolManager(**kwargs)
    try:
        yield pool
    finally:
        pool.clear()

# Usage
with http_pool_manager(maxsize=10, block=True) as http:
    response = http.request('GET', 'https://example.com')
    # Process response
    data = response.data
# Pool automatically cleaned up

# For streaming responses
def stream_response(url):
    http = urllib3.PoolManager()
    try:
        with http.request('GET', url, preload_content=False) as response:
            # Stream the response
            for chunk in response.stream(1024):
                yield chunk
    except Exception as e:
        logger.error(f"Streaming error: {e}")
        raise
    finally:
        # Connection is released by the response context manager
        pass
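The same preload_content=False pattern works for downloading large files without buffering them in memory, since the response object is file-like. A minimal sketch (URL and file path are illustrative):

import shutil
import urllib3

def download_file(url, path):
    """Stream a response body straight to disk."""
    http = urllib3.PoolManager()
    with http.request('GET', url, preload_content=False) as response:
        with open(path, 'wb') as out_file:
            shutil.copyfileobj(response, out_file)

download_file('https://example.com/large-file.bin', '/tmp/large-file.bin')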
7. Rate Limiting and Backoff
Implement respectful rate limiting to avoid overwhelming servers:
import random
import time

import urllib3
from urllib3.exceptions import TimeoutError, ProtocolError

class RateLimiter:
    def __init__(self, requests_per_second=1):
        self.requests_per_second = requests_per_second
        self.last_request_time = 0

    def wait_if_needed(self):
        current_time = time.time()
        time_since_last = current_time - self.last_request_time
        min_interval = 1.0 / self.requests_per_second
        if time_since_last < min_interval:
            sleep_time = min_interval - time_since_last
            time.sleep(sleep_time)
        self.last_request_time = time.time()

def exponential_backoff(attempt, base_delay=1, max_delay=60):
    """Calculate exponential backoff with jitter"""
    delay = min(base_delay * (2 ** attempt), max_delay)
    jitter = random.uniform(0, delay * 0.1)  # Add 10% jitter
    return delay + jitter

# Usage example
rate_limiter = RateLimiter(requests_per_second=2)

def resilient_request(url, max_attempts=3):
    http = urllib3.PoolManager()
    for attempt in range(max_attempts):
        try:
            rate_limiter.wait_if_needed()
            response = http.request('GET', url)
            if response.status == 429:  # Rate limited
                delay = exponential_backoff(attempt)
                logger.warning(f"Rate limited. Waiting {delay:.2f}s before retry")
                time.sleep(delay)
                continue
            return response
        except (TimeoutError, ProtocolError):
            # urllib3's TimeoutError (not the builtin); ProtocolError covers
            # dropped connections (the old ConnectionError alias)
            if attempt == max_attempts - 1:
                raise
            delay = exponential_backoff(attempt)
            logger.warning(f"Request failed. Retrying in {delay:.2f}s")
            time.sleep(delay)
    raise Exception("Max attempts exceeded")
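Note that the RateLimiter above is not thread-safe; if you share one PoolManager across worker threads, guard the timestamp with a lock. A sketch of one way to do that:

import threading
import time

class ThreadSafeRateLimiter:
    """RateLimiter variant that can be shared across worker threads."""
    def __init__(self, requests_per_second=1):
        self.min_interval = 1.0 / requests_per_second
        self.last_request_time = 0.0
        self._lock = threading.Lock()

    def wait_if_needed(self):
        # Holding the lock while sleeping paces waiters one interval apart
        with self._lock:
            wait = self.min_interval - (time.time() - self.last_request_time)
            if wait > 0:
                time.sleep(wait)
            self.last_request_time = time.time()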
Production-Ready Error Handling
8. Complete Error Handling Class
Here's a comprehensive example combining all best practices:
import logging
import time
from typing import Optional

import urllib3
from urllib3.util.retry import Retry
from urllib3.exceptions import (
    MaxRetryError, ConnectTimeoutError, ReadTimeoutError, SSLError
)

class RobustHttpClient:
    def __init__(self, timeout=30, retries=3, rate_limit=2):
        self.timeout = timeout
        self.rate_limit = rate_limit
        self.last_request_time = 0

        # Configure retry strategy
        retry_strategy = Retry(
            total=retries,
            status_forcelist=[429, 500, 502, 503, 504],
            backoff_factor=1,
            raise_on_status=False
        )

        # Initialize connection pool
        self.http = urllib3.PoolManager(
            retries=retry_strategy,
            timeout=urllib3.Timeout(connect=5.0, read=timeout),
            maxsize=10,
            block=True
        )

        # Setup logging
        self.logger = logging.getLogger(__name__)

    def _rate_limit(self):
        """Implement rate limiting"""
        current_time = time.time()
        time_since_last = current_time - self.last_request_time
        min_interval = 1.0 / self.rate_limit
        if time_since_last < min_interval:
            sleep_time = min_interval - time_since_last
            time.sleep(sleep_time)
        self.last_request_time = time.time()

    def request(self, method: str, url: str, **kwargs) -> Optional[urllib3.HTTPResponse]:
        """Make HTTP request with comprehensive error handling"""
        self._rate_limit()
        try:
            response = self.http.request(method, url, **kwargs)

            # Log request details
            self.logger.info(f"{method} {url} -> {response.status}")

            # Handle different status codes
            if 200 <= response.status < 300:
                return response
            elif response.status == 404:
                self.logger.warning(f"Resource not found: {url}")
                return None
            elif response.status == 429:
                self.logger.warning(f"Rate limited: {url}")
                return None
            elif 400 <= response.status < 500:
                self.logger.error(f"Client error {response.status}: {url}")
                return None
            elif 500 <= response.status < 600:
                self.logger.error(f"Server error {response.status}: {url}")
                return None
            else:
                self.logger.warning(f"Unexpected status {response.status}: {url}")
                return response
        except MaxRetryError as e:
            self.logger.error(f"Max retries exceeded for {url}: {e.reason}")
            return None
        except (ConnectTimeoutError, ReadTimeoutError) as e:
            self.logger.error(f"Timeout error for {url}: {e}")
            return None
        except SSLError as e:
            self.logger.error(f"SSL error for {url}: {e}")
            return None
        except Exception as e:
            self.logger.error(f"Unexpected error for {url}: {e}")
            return None

    def get(self, url: str, **kwargs) -> Optional[urllib3.HTTPResponse]:
        """GET request wrapper"""
        return self.request('GET', url, **kwargs)

    def post(self, url: str, **kwargs) -> Optional[urllib3.HTTPResponse]:
        """POST request wrapper"""
        return self.request('POST', url, **kwargs)

    def close(self):
        """Clean up resources"""
        self.http.clear()

# Usage example
client = RobustHttpClient(timeout=30, retries=3, rate_limit=2)
try:
    response = client.get('https://api.example.com/data')
    if response:
        data = response.data.decode('utf-8')
        print(f"Success: {len(data)} bytes received")
    else:
        print("Request failed or returned no data")
finally:
    client.close()
By implementing these error handling best practices, you'll create robust applications that can handle the various failure modes common in web scraping and API interactions. Remember to always test your error handling code under different failure scenarios to ensure it behaves as expected.
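One way to exercise these failure paths without a flaky network is to stub the pool manager. A minimal sketch using unittest.mock against the RobustHttpClient defined above (the URL is a placeholder):

import unittest.mock
from urllib3.exceptions import MaxRetryError

def test_get_returns_none_on_max_retries():
    client = RobustHttpClient()
    error = MaxRetryError(pool=None, url='https://api.example.com/data')
    # Force every request on this client to raise MaxRetryError
    with unittest.mock.patch.object(client.http, 'request', side_effect=error):
        assert client.get('https://api.example.com/data') is None
    client.close()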