When using urllib3 for web scraping, you'll encounter various errors that can interrupt your data collection. Understanding these common errors and how to handle them is crucial for building robust scrapers. Here's a comprehensive guide to the most frequent urllib3 errors and their solutions:
Connection-Related Errors
1. MaxRetryError
The most common error occurs when the maximum number of retries is exceeded. This typically happens due to network issues, server overload, or blocked requests.
Common causes:
- Server is temporarily unavailable
- Network connectivity issues
- Rate limiting by the target server
- DNS resolution failures
import urllib3
from urllib3.util.retry import Retry
from urllib3.exceptions import MaxRetryError

# Configure retry strategy
retry_strategy = Retry(
    total=3,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["HEAD", "GET", "OPTIONS"],  # 'method_whitelist' in urllib3 < 1.26
    backoff_factor=1
)

http = urllib3.PoolManager(retries=retry_strategy)

try:
    response = http.request('GET', 'https://example.com')
    print(f"Success: {response.status}")
except MaxRetryError as e:
    print(f"Max retries exceeded: {e}")
    # Implement fallback or logging
2. NewConnectionError
A lower-level exception raised when urllib3 cannot establish a new connection to the target server. Note that when retries are enabled (as configured above), these lower-level exceptions usually arrive wrapped inside a MaxRetryError as its reason attribute; the examples in this guide therefore either inspect e.reason or pass retries=False so the raw exception propagates.
Common causes:
- Target URL is unreachable
- Firewall blocking the connection
- DNS resolution failure
- Port is closed or filtered
from urllib3.exceptions import MaxRetryError, NewConnectionError

try:
    response = http.request('GET', 'https://nonexistent-domain.example')
except MaxRetryError as e:
    if isinstance(e.reason, NewConnectionError):
        print(f"Connection failed: {e.reason}")
        # Log the failed URL for later retry
        print(f"Failed URL: {e.url}")
3. ConnectTimeoutError
Occurs when the connection attempt takes longer than the specified timeout.
from urllib3.exceptions import ConnectTimeoutError

try:
    response = http.request(
        'GET',
        'https://example.com',
        timeout=urllib3.Timeout(connect=5.0, read=10.0),
        retries=False  # surface the raw exception instead of MaxRetryError
    )
except ConnectTimeoutError:
    print("Connection timeout - server may be slow or overloaded")
    # Implement retry with a longer timeout
SSL/TLS Errors
4. SSLError
SSL errors are common when scraping HTTPS sites with certificate issues or outdated SSL configurations.
Common causes:
- Expired or invalid SSL certificates
- Hostname mismatch
- Unsupported SSL protocol versions
- Certificate chain issues
from urllib3.exceptions import SSLError
import urllib3

# Disable SSL warnings (use cautiously)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

try:
    # For testing only - disable SSL verification
    http = urllib3.PoolManager(cert_reqs='CERT_NONE')
    response = http.request('GET', 'https://expired-ssl-site.example', retries=False)
except SSLError as e:
    print(f"SSL error: {e}")
    # Log SSL issues for manual review
Production-safe SSL handling:
# A separate pool with default settings, so certificates are verified
http = urllib3.PoolManager()

try:
    response = http.request('GET', 'https://example.com', retries=False)
except SSLError as e:
    print(f"SSL verification failed: {e}")
    # Don't disable SSL in production - fix the underlying issue (see the sketch below)
    raise
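If verification fails because the local CA bundle is missing or outdated, one common fix is to point urllib3 at the certifi bundle instead of disabling verification. A minimal sketch, assuming the certifi package is installed:

import certifi
import urllib3

# Verify certificates against certifi's curated CA bundle
http = urllib3.PoolManager(
    cert_reqs='CERT_REQUIRED',
    ca_certs=certifi.where()
)

response = http.request('GET', 'https://example.com')
print(response.status)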
Timeout Errors
5. ReadTimeoutError
Occurs when the server doesn't send data within the specified read timeout period.
from urllib3.exceptions import ReadTimeoutError

try:
    response = http.request(
        'GET',
        'https://slow-api.example.com',
        timeout=urllib3.Timeout(read=30.0),
        retries=False  # surface the raw exception instead of MaxRetryError
    )
except ReadTimeoutError:
    print("Server response timeout - may be processing a large request")
    # Retry with exponential backoff (see the sketch below)
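If you handle the retries yourself, a minimal exponential-backoff loop might look like the following sketch (the URL and attempt count are illustrative):

import time
import urllib3
from urllib3.exceptions import ReadTimeoutError

http = urllib3.PoolManager()
url = 'https://slow-api.example.com'  # illustrative URL

response = None
for attempt in range(3):
    try:
        response = http.request(
            'GET', url,
            timeout=urllib3.Timeout(read=30.0),
            retries=False,
        )
        break
    except ReadTimeoutError:
        wait = 2 ** attempt  # 1s, 2s, 4s between attempts
        print(f"Read timeout, retrying in {wait}s")
        time.sleep(wait)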
6. ConnectTimeoutError
The connect-side counterpart of ReadTimeoutError, already shown in #3 above: the connection could not be established within the connect timeout. Connect and read timeouts are configured independently, so a strict connect limit can coexist with a generous read limit.
from urllib3.exceptions import ConnectTimeoutError

try:
    response = http.request(
        'GET',
        'https://example.com',
        timeout=urllib3.Timeout(connect=10.0),
        retries=False
    )
except ConnectTimeoutError:
    print("Connection establishment timeout")
Protocol and Parsing Errors
7. ProtocolError
Occurs when there's an HTTP protocol violation or unexpected connection closure.
from urllib3.exceptions import ProtocolError

try:
    response = http.request('GET', 'https://example.com', retries=False)
except ProtocolError as e:
    print(f"Protocol error: {e}")
    # Often caused by the server closing the connection unexpectedly
    # Retry with a fresh connection (see the sketch below)
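One way to "retry with a fresh connection" is to drop the pooled connections before the second attempt. A rough sketch:

import urllib3
from urllib3.exceptions import ProtocolError

http = urllib3.PoolManager()
url = 'https://example.com'

try:
    response = http.request('GET', url, retries=False)
except ProtocolError:
    # Drop all pooled connections so the retry starts on a clean socket
    http.clear()
    response = http.request('GET', url, retries=False)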
8. HeaderParsingError
Raised when urllib3 cannot parse malformed HTTP headers. In practice urllib3 usually catches this internally and logs a warning rather than raising it to your code, so check the logs as well as the except block (see the logging sketch below).
from urllib3.exceptions import HeaderParsingError

try:
    response = http.request('GET', 'https://malformed-headers.example.com')
except HeaderParsingError as e:
    print(f"Invalid headers received: {e}")
    # Log the problematic response for debugging
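Because the problem is often reported through urllib3's own logger rather than an exception, enabling that logger is the more reliable way to spot it. A small sketch:

import logging

# Surface urllib3's internal warnings, including header-parsing complaints
logging.basicConfig(level=logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)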
HTTP Status Code Handling
While not exceptions, HTTP error status codes should be handled explicitly:
def handle_response(response):
    """Handle HTTP response with proper error checking"""
    if 200 <= response.status < 300:
        return response.data
    elif response.status == 404:
        print("Page not found")
        return None
    elif response.status == 429:
        print("Rate limited - implementing backoff")
        raise Exception("Rate limited")
    elif response.status >= 500:
        print(f"Server error: {response.status}")
        raise Exception(f"Server error: {response.status}")
    else:
        print(f"HTTP error: {response.status}")
        return None

# Usage
try:
    response = http.request('GET', 'https://example.com')
    data = handle_response(response)
except Exception as e:
    print(f"Request failed: {e}")
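For 429 responses specifically, many servers state how long to wait in a Retry-After header, and honoring it is gentler than a fixed delay. A sketch with a hypothetical wait_for_rate_limit helper, assuming the header carries a number of seconds (it can also be an HTTP date, which this sketch ignores):

import time
import urllib3

http = urllib3.PoolManager()

def wait_for_rate_limit(response, default_delay=5):
    """Sleep for the duration the server requests via Retry-After, if present."""
    retry_after = response.headers.get('Retry-After')
    if retry_after and retry_after.isdigit():
        time.sleep(int(retry_after))
    else:
        time.sleep(default_delay)  # fallback when the header is absent or not numeric

response = http.request('GET', 'https://example.com')
if response.status == 429:
    wait_for_rate_limit(response)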
Comprehensive Error Handling Strategy
Here's a robust error handling pattern for web scraping:
import urllib3
from urllib3.exceptions import (
    MaxRetryError, NewConnectionError, ConnectTimeoutError,
    ReadTimeoutError, SSLError, ProtocolError, HeaderParsingError
)
import time
import logging

class RobustScraper:
    def __init__(self):
        retry_strategy = urllib3.util.retry.Retry(
            total=3,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"],
            backoff_factor=2
        )
        self.http = urllib3.PoolManager(
            retries=retry_strategy,
            timeout=urllib3.Timeout(connect=10.0, read=30.0)
        )
        self.logger = logging.getLogger(__name__)

    def scrape_url(self, url, max_retries=3):
        """Scrape URL with comprehensive error handling"""
        for attempt in range(max_retries):
            try:
                response = self.http.request('GET', url)

                if 200 <= response.status < 300:
                    return response.data.decode('utf-8')
                elif response.status == 429:
                    wait_time = 2 ** attempt
                    self.logger.warning(f"Rate limited. Waiting {wait_time}s")
                    time.sleep(wait_time)
                    continue
                else:
                    self.logger.error(f"HTTP {response.status} for {url}")
                    return None

            except (NewConnectionError, ConnectTimeoutError) as e:
                self.logger.error(f"Connection failed for {url}: {e}")
                if attempt == max_retries - 1:
                    return None
                time.sleep(2 ** attempt)

            except ReadTimeoutError:
                self.logger.warning(f"Read timeout for {url}, attempt {attempt + 1}")
                if attempt == max_retries - 1:
                    return None

            except SSLError as e:
                self.logger.error(f"SSL error for {url}: {e}")
                return None  # Don't retry SSL errors

            except (ProtocolError, HeaderParsingError) as e:
                self.logger.error(f"Protocol/parsing error for {url}: {e}")
                if attempt == max_retries - 1:
                    return None
                time.sleep(1)

            except MaxRetryError as e:
                self.logger.error(f"Max retries exceeded for {url}: {e}")
                return None

        return None
# Usage
scraper = RobustScraper()
content = scraper.scrape_url('https://example.com')
Prevention Best Practices
1. Configure Appropriate Timeouts
# Separate connect and read timeouts
timeout = urllib3.Timeout(connect=10.0, read=30.0)
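The timeout can be set once as a pool-wide default and overridden per request; for example:

import urllib3

# Pool-wide default: every request inherits these limits
http = urllib3.PoolManager(timeout=urllib3.Timeout(connect=10.0, read=30.0))

# Per-request override for a known-slow endpoint
response = http.request(
    'GET',
    'https://example.com',
    timeout=urllib3.Timeout(connect=10.0, read=60.0)
)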
2. Implement Retry Logic with Backoff
from urllib3.util.retry import Retry

retry_strategy = Retry(
    total=5,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["HEAD", "GET", "OPTIONS"],
    backoff_factor=2  # exponential backoff: the wait roughly doubles with each retry
)
3. Use Connection Pooling Efficiently
# Reuse the same PoolManager instance
http = urllib3.PoolManager(
    num_pools=10,
    maxsize=10,
    retries=retry_strategy
)
4. Monitor and Log Errors
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Log all scraping activity (url and error stand in for your own variables)
logger.info(f"Scraping {url}")
logger.error(f"Failed to scrape {url}: {error}")
5. Respect Rate Limits
import time

class RateLimiter:
    def __init__(self, calls_per_second=1):
        self.calls_per_second = calls_per_second
        self.last_call_time = 0

    def wait_if_needed(self):
        elapsed = time.time() - self.last_call_time
        min_interval = 1.0 / self.calls_per_second
        if elapsed < min_interval:
            time.sleep(min_interval - elapsed)
        self.last_call_time = time.time()

# Usage
rate_limiter = RateLimiter(calls_per_second=2)
rate_limiter.wait_if_needed()
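In a scraping loop, the limiter sits just before each request, reusing the RateLimiter class above. A short sketch (the URL list is illustrative):

import urllib3

http = urllib3.PoolManager()
limiter = RateLimiter(calls_per_second=2)
urls = ['https://example.com/page1', 'https://example.com/page2']  # illustrative

for url in urls:
    limiter.wait_if_needed()  # never send more than 2 requests per second
    response = http.request('GET', url)
    print(url, response.status)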
By understanding these common errors and implementing robust error handling, you'll build more reliable web scrapers that can handle the unpredictable nature of web scraping. Always remember to scrape responsibly and respect the target website's terms of service.