When using urllib3, handling redirects is essential for robust web scraping and HTTP requests. The library provides several approaches to manage redirects, from automatic handling to fine-grained control.
Automatic Redirect Handling
By default, urllib3.PoolManager automatically follows redirects, making it the simplest approach for most use cases:
import urllib3
# A default PoolManager follows redirects by itself; no extra setup is needed.
http = urllib3.PoolManager()
redirect_url = 'http://httpbin.org/redirect/3'
response = http.request('GET', redirect_url)

# Report where the request ended up after the redirect chain.
print(f"Final status: {response.status}")
print(f"Final URL: {response.geturl()}")

# Only the first 100 characters of the decoded body are shown.
body_preview = response.data.decode('utf-8')[:100]
print(f"Data: {body_preview}...")
Checking Redirect History
You can access redirect information through the response object:
import urllib3
http = urllib3.PoolManager()
response = http.request('GET', 'http://httpbin.org/redirect/2')

# urllib3 responses have no `history` attribute (that is the `requests` API).
# The hops are recorded on the Retry object attached to the response, as
# RequestHistory entries (method, url, error, status, redirect_location).
history = response.retries.history if response.retries is not None else ()

# Keep only the entries that are redirects; the history can also contain
# retries caused by errors, which carry no redirect location.
redirects = [entry for entry in history if entry.redirect_location]

# Check if redirects occurred
if redirects:
    print(f"Number of redirects: {len(redirects)}")
    for i, redirect in enumerate(redirects, start=1):
        print(f"Redirect {i}: {redirect.status} -> {redirect.redirect_location}")
Configuring Redirect Behavior
Limiting Redirects
Control the maximum number of redirects using the Retry configuration:
import urllib3
from urllib3.util.retry import Retry
# Cap redirect hops at 3 while leaving the overall retry budget at 10.
redirect_limit = Retry(total=10, redirect=3)
http = urllib3.PoolManager(retries=redirect_limit)

# The target redirects 5 times, which is more than the limit allows.
try:
    response = http.request('GET', 'http://httpbin.org/redirect/5')
except urllib3.exceptions.MaxRetryError as err:
    print(f"Too many redirects: {err}")
else:
    print(f"Status: {response.status}")
Disabling Automatic Redirects
Disable automatic redirects to handle them manually:
import urllib3
from urllib3.util.retry import Retry
# With redirect=False the redirect response itself is returned to the caller
# so it can be inspected and handled manually.
no_redirects = Retry(redirect=False)
http = urllib3.PoolManager(retries=no_redirects)

response = http.request('GET', 'http://httpbin.org/redirect/1')
print(f"Status: {response.status}")

# The redirect target is carried in the Location header.
location = response.headers.get('Location')
print(f"Location header: {location}")
Manual Redirect Handling
For complete control over redirect behavior, implement manual handling:
import urllib3
from urllib3.util.retry import Retry
from urllib3.util.url import parse_url
def follow_redirects_manually(http, url, max_redirects=5):
    """Follow HTTP redirects by hand, one GET request per hop.

    Args:
        http: Object exposing ``request(method, url)`` that returns a response
            with ``status`` and ``headers`` attributes (e.g. a PoolManager
            configured not to follow redirects itself).
        url: Absolute URL to start from.
        max_redirects: Maximum number of redirect hops to follow.

    Returns:
        A ``(response, redirects_followed)`` tuple for the first
        non-redirect response.

    Raises:
        Exception: If more than ``max_redirects`` hops would be needed, or a
            redirect response carries no ``Location`` header.
    """
    from urllib.parse import urljoin

    redirect_statuses = (301, 302, 303, 307, 308)
    redirects_followed = 0
    current_url = url

    while True:
        response = http.request('GET', current_url)

        # Not a redirect: this is the final response.
        if response.status not in redirect_statuses:
            return response, redirects_followed

        location = response.headers.get('Location')
        if not location:
            raise Exception(
                f"Redirect response {response.status} has no Location header"
            )

        # Check the budget only once another hop is actually required, so a
        # chain of exactly max_redirects redirects still reaches its target.
        if redirects_followed >= max_redirects:
            raise Exception(f"Too many redirects ({max_redirects})")

        print(f"Redirect {redirects_followed + 1}: {response.status} -> {location}")

        # urljoin resolves every relative form ("/path", "path", "../path",
        # "//host/path") against the current URL and keeps the port, while
        # leaving absolute locations untouched.
        current_url = urljoin(current_url, location)
        redirects_followed += 1
# Usage: turn off automatic redirects so the helper sees every hop itself.
no_redirects = Retry(redirect=False)
http = urllib3.PoolManager(retries=no_redirects)

start_url = 'http://httpbin.org/redirect/3'
final_response, redirect_count = follow_redirects_manually(http, start_url)

print(f"Final status: {final_response.status}")
print(f"Total redirects: {redirect_count}")
Handling Different Redirect Types
Different HTTP status codes require different handling approaches:
import urllib3
from urllib3.util.retry import Retry
def handle_redirect_by_type(http, url):
    """Issue a GET request and print a note describing any redirect status.

    Args:
        http: Object exposing ``request(method, url)`` that returns a response
            with a ``status`` attribute.
        url: URL to request.

    Returns:
        The response returned by ``http.request``, unchanged.
    """
    # One note per redirect status; any other status prints nothing.
    status_notes = {
        301: "Permanent redirect - update bookmarks/cache",
        302: "Temporary redirect - don't cache",
        303: "See Other - use GET for next request",
        307: "Temporary redirect - preserve method",
        308: "Permanent redirect - preserve method",
    }

    response = http.request('GET', url)
    note = status_notes.get(response.status)
    if note is not None:
        print(note)
    return response
# Automatic redirects are switched off so the redirect response itself
# reaches handle_redirect_by_type.
no_redirects = Retry(redirect=False)
http = urllib3.PoolManager(retries=no_redirects)
response = handle_redirect_by_type(
    http,
    'http://httpbin.org/redirect/1',
)
Security Considerations
When handling redirects, especially manually, consider these security aspects:
import urllib3
from urllib3.util.url import parse_url
def secure_redirect_handler(http, url, allowed_schemes=('http', 'https'), max_redirects=5):
    """Follow redirects manually, validating every hop before requesting it.

    Args:
        http: Object exposing ``request(method, url)`` that returns a response
            with ``status`` and ``headers`` attributes (e.g. a PoolManager
            configured not to follow redirects itself).
        url: Absolute URL to start from.
        allowed_schemes: URL schemes a redirect target may use.
        max_redirects: Maximum number of redirect hops to follow.

    Returns:
        The first non-redirect response.

    Raises:
        Exception: On a redirect loop, a redirect to a scheme outside
            ``allowed_schemes``, a redirect response without a ``Location``
            header, or more than ``max_redirects`` hops.
    """
    from urllib.parse import urljoin, urlsplit

    redirect_statuses = (301, 302, 303, 307, 308)
    visited_urls = set()
    current_url = url

    # Following max_redirects hops takes max_redirects + 1 requests.
    for _ in range(max_redirects + 1):
        if current_url in visited_urls:
            raise Exception("Redirect loop detected")
        visited_urls.add(current_url)

        response = http.request('GET', current_url)
        if response.status not in redirect_statuses:
            return response

        location = response.headers.get('Location')
        if not location:
            raise Exception(
                f"Redirect response {response.status} has no Location header"
            )

        # Resolve relative locations first: a bare "/path" has no scheme of
        # its own and would otherwise be rejected as unsafe. Resolving also
        # makes loop detection compare absolute URLs.
        next_url = urljoin(current_url, location)

        # Validate redirect URL
        scheme = urlsplit(next_url).scheme.lower()
        if scheme not in allowed_schemes:
            raise Exception(f"Unsafe redirect scheme: {scheme}")

        current_url = next_url

    raise Exception("Too many redirects")
# Usage with security checks
# This snippet imports only `urllib3` and `parse_url`, so Retry is reached
# through the top-level package instead of as a bare name.
http = urllib3.PoolManager(retries=urllib3.Retry(redirect=False))
try:
    secure_response = secure_redirect_handler(http, 'http://httpbin.org/redirect/2')
    print(f"Secure redirect completed: {secure_response.status}")
except Exception as e:
    # secure_redirect_handler signals every failure with a plain Exception.
    print(f"Redirect error: {e}")
Best Practices
- Use PoolManager for simplicity: It handles most redirect scenarios automatically
- Set reasonable redirect limits: Prevent infinite redirect loops
- Validate redirect URLs: Check for malicious or unexpected redirects
- Preserve important headers: When handling redirects manually, maintain necessary headers
- Handle HTTPS properly: Ensure SSL certificate validation is enabled
import urllib3
# Best practice configuration
http = urllib3.PoolManager(
    retries=urllib3.util.retry.Retry(
        total=3,
        redirect=5,
        backoff_factor=0.3
    ),
    # Require certificate verification. No ca_certs is passed, so urllib3
    # uses the system trust store; pass ca_certs="/path/to/bundle.pem" to use
    # a specific CA bundle. (A cipher-suite string is not a CA bundle path.)
    cert_reqs='CERT_REQUIRED'
)
This comprehensive approach to handling redirects in urllib3 ensures robust and secure HTTP communication in your Python applications.