How do I handle HTTP redirects with custom logic in Requests?
HTTP redirects are a fundamental part of web communication, but sometimes you need more control over how they're handled than the default behavior provides. Python's Requests library offers several ways to implement custom redirect logic, from simple redirect prevention to sophisticated conditional handling based on URLs, status codes, or response headers.
Understanding Default Redirect Behavior
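By default, Requests automatically follows redirects for every HTTP method except HEAD (that is, for GET, OPTIONS, POST, PUT, PATCH, and DELETE requests); for HEAD requests you must pass allow_redirects=True explicitly. However, there are scenarios where you might want to: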
- Limit the number of redirects
- Inspect redirect URLs before following them
- Conditionally follow redirects based on domain or URL patterns
- Log or analyze redirect chains
- Prevent redirects entirely for security reasons
Disabling Automatic Redirects
The simplest form of custom redirect handling is to disable automatic redirects entirely:
import requests
# Request a URL that answers with a redirect, but do not follow it.
response = requests.get('https://httpbin.org/redirect/3', allow_redirects=False)
print(f"Status Code: {response.status_code}")
# The Location header holds the redirect target exactly as the server sent it,
# which may be a relative URL.
print(f"Location Header: {response.headers.get('Location')}")
# Expected output (as served by httpbin.org):
# Status Code: 302
# Location Header: /relative-redirect/2
With allow_redirects=False, Requests stops at the first redirect response, allowing you to examine the redirect location and decide whether to follow it.
Implementing Custom Redirect Logic with Sessions
For more sophisticated redirect handling, you can use a Session object with custom logic:
import requests
from urllib.parse import urljoin, urlparse
class CustomRedirectSession(requests.Session):
    """Session that follows redirects itself so every hop can be vetted.

    Automatic redirect handling is always disabled for the underlying
    request; the redirect loop in :meth:`request` decides hop by hop whether
    to continue.

    Args:
        max_redirects: Maximum number of redirects followed per request.
        allowed_domains: Host names redirects may point to. An empty or
            missing list allows every host.

    Attributes:
        redirect_count: Number of redirects followed by the most recent
            call to :meth:`request`.
    """

    # Status codes that are treated as redirects.
    REDIRECT_STATUS_CODES = (301, 302, 303, 307, 308)
    # Request keyword arguments that carry a body.
    _BODY_KWARGS = ('data', 'json', 'files')

    def __init__(self, max_redirects=5, allowed_domains=None):
        super().__init__()
        self.max_redirects = max_redirects
        self.allowed_domains = allowed_domains or []
        self.redirect_count = 0

    def request(self, method, url, **kwargs):
        """Send a request and follow redirects that pass the custom checks.

        The loop stops when the response is not a redirect, when the redirect
        has no Location header, when the target is not allowed, or when
        ``max_redirects`` redirects have been followed.

        Returns:
            The last response received (which is still a redirect response
            if the loop stopped early).
        """
        self.redirect_count = 0
        # Redirects are followed manually below, never by the base class.
        kwargs['allow_redirects'] = False
        response = super().request(method, url, **kwargs)

        while self._should_redirect(response) and self.redirect_count < self.max_redirects:
            next_url = self._get_redirect_url(response, url)
            if next_url is None:
                # Without a Location header there is nothing to follow.
                print("Redirect response without Location header")
                break
            if not self._is_allowed_redirect(next_url):
                print(f"Redirect to {next_url} blocked by custom logic")
                break

            print(f"Following redirect {self.redirect_count + 1}: {next_url}")
            self.redirect_count += 1

            # The Location URL is already complete; re-applying the caller's
            # query parameters would append them a second time.
            kwargs.pop('params', None)
            method = self._redirect_method(method, response.status_code)
            # Only 307/308 require the body to be re-sent unchanged.
            if response.status_code not in (307, 308):
                for body_key in self._BODY_KWARGS:
                    kwargs.pop(body_key, None)

            url = next_url
            response = super().request(method, url, **kwargs)
        return response

    @staticmethod
    def _redirect_method(method, status_code):
        """Return the method for the follow-up request.

        Mirrors the rules Requests applies itself: 303 and 302 turn every
        method except HEAD into GET, and 301 turns POST into GET.
        """
        upper = method.upper()
        if status_code in (302, 303) and upper != 'HEAD':
            return 'GET'
        if status_code == 301 and upper == 'POST':
            return 'GET'
        return method

    def _should_redirect(self, response):
        """Return True if the response status code is a redirect code."""
        return response.status_code in self.REDIRECT_STATUS_CODES

    def _get_redirect_url(self, response, current_url):
        """Return the absolute redirect target, or None without a Location."""
        location = response.headers.get('Location')
        if location:
            # Location may be relative; resolve it against the current URL.
            return urljoin(current_url, location)
        return None

    def _is_allowed_redirect(self, url):
        """Return True if *url* points at an allowed host.

        The host name is compared without port or user-info, so
        ``httpbin.org:443`` matches ``httpbin.org`` while
        ``httpbin.org@evil.example`` does not. Entries that include a port
        are still honoured through an exact network-location match.
        """
        if not self.allowed_domains:
            return True
        parsed_url = urlparse(url)
        if parsed_url.netloc in self.allowed_domains:
            return True
        hostname = parsed_url.hostname
        if not hostname:
            return False
        return hostname in {domain.lower() for domain in self.allowed_domains}
# Usage example: follow at most three redirects, and only to the listed hosts.
session = CustomRedirectSession(max_redirects=3, allowed_domains=['httpbin.org', 'example.com'])

# The session reports how many redirects it followed for this request.
response = session.get('https://httpbin.org/redirect/5')
print(f"Final URL: {response.url}")
print(f"Total redirects followed: {session.redirect_count}")
Advanced Redirect Filtering
You can implement more sophisticated filtering logic based on various criteria:
import requests
import re
from urllib.parse import urlparse
def custom_redirect_handler(url, max_redirects=10, url_patterns=None,
                            status_codes=None, header_filters=None,
                            timeout=None):
    """Follow redirects one hop at a time, applying custom filters.

    Args:
        url: Initial URL to request.
        max_redirects: Maximum number of redirects to follow.
        url_patterns: Regex patterns; a redirect target must match at least
            one of them (via ``re.search``) to be followed.
        status_codes: Status codes treated as redirects
            (default: [301, 302, 303, 307, 308]).
        header_filters: Mapping of header name to required value prefix,
            e.g. ``{'content-type': 'text/html'}``. The filters are checked
            case-insensitively against the headers of the redirect response
            itself.
        timeout: Timeout passed to every request; ``None`` waits
            indefinitely.

    Returns:
        A dict with ``final_response`` (the last response received),
        ``redirect_chain`` (one entry per request made, with URL, status
        code and headers) and ``total_redirects`` (redirects followed).

    Raises:
        ValueError: If ``max_redirects`` is negative.
    """
    if max_redirects < 0:
        raise ValueError("max_redirects must be non-negative")
    if status_codes is None:
        status_codes = [301, 302, 303, 307, 308]

    # Loop-invariant work: compile the patterns and lower-case the filter
    # values once instead of on every hop.
    compiled_patterns = [re.compile(pattern) for pattern in (url_patterns or [])]
    wanted_headers = {key: value.lower() for key, value in (header_filters or {}).items()}

    redirect_chain = []
    current_url = url
    for redirect_num in range(max_redirects + 1):
        response = requests.get(current_url, allow_redirects=False, timeout=timeout)
        redirect_chain.append({
            'url': current_url,
            'status_code': response.status_code,
            'headers': dict(response.headers),
        })

        # Anything that is not a redirect ends the chain.
        if response.status_code not in status_codes:
            break

        location = response.headers.get('Location')
        if not location:
            print("Redirect response without Location header")
            break

        # Location may be relative; resolve it against the current URL.
        next_url = requests.compat.urljoin(current_url, location)

        if compiled_patterns and not any(p.search(next_url) for p in compiled_patterns):
            print(f"Redirect to {next_url} blocked by URL pattern filter")
            break

        if not all(response.headers.get(key, '').lower().startswith(value)
                   for key, value in wanted_headers.items()):
            print("Redirect blocked by header filter")
            break

        # Check the limit before announcing the hop, so the log never claims
        # to follow a redirect that is not actually requested.
        if redirect_num == max_redirects:
            print(f"Maximum redirects ({max_redirects}) reached")
            break

        print(f"Following redirect {redirect_num + 1}: {next_url}")
        current_url = next_url

    return {
        'final_response': response,
        'redirect_chain': redirect_chain,
        # Every request after the first one was made by following a redirect.
        'total_redirects': len(redirect_chain) - 1,
    }
# Example usage: only redirects whose target URL matches an httpbin.org
# pattern are followed. header_filters is matched against the headers of each
# redirect response itself.
result = custom_redirect_handler(
    'https://httpbin.org/redirect/3',
    max_redirects=5,
    url_patterns=[r'https://httpbin\.org/.*'],
    header_filters={'content-type': 'application/json'},
)

# Report the outcome of the chain.
print(f"Final status: {result['final_response'].status_code}")
print(f"Redirects followed: {result['total_redirects']}")
Conditional Redirects Based on Response Content
Sometimes you need to examine the redirect response before deciding whether to follow it:
import requests
from urllib.parse import urljoin
def conditional_redirect_handler(url, content_checker=None):
    """Fetch *url*, following each redirect only while a checker approves.

    Args:
        url: Initial URL to request.
        content_checker: Optional callable that receives the redirect
            response and returns a truthy value to follow it, or a falsy
            value to stop.

    Returns:
        The last response received: the first non-redirect response, the
        redirect that was vetoed or lacked a Location header, or the
        response to the tenth request if all ten were redirects.
    """
    redirect_statuses = (301, 302, 303, 307, 308)
    target = url

    # At most ten requests are made.
    for _ in range(10):
        response = requests.get(target, allow_redirects=False)

        # A non-redirect response ends the chain.
        if response.status_code not in redirect_statuses:
            break

        # Give the caller a chance to veto this hop.
        if content_checker and not content_checker(response):
            print("Redirect blocked by content checker")
            break

        location = response.headers.get('Location')
        if not location:
            break

        # Location may be relative; resolve it against the current URL.
        target = urljoin(target, location)
        print(f"Following redirect to: {target}")

    return response
# Example content checker function
def safe_redirect_checker(response):
    """Template checker hook; as written it approves every redirect.

    The function shows where a real check would go: it recognises an HTML
    302 response but does not reject anything yet.

    Args:
        response: Object exposing ``headers`` (with ``get``) and
            ``status_code``.

    Returns:
        Always ``True``.
    """
    media_type = response.headers.get('content-type', '').lower()
    html_302 = 'text/html' in media_type and response.status_code == 302
    # Extension point: when html_302 is true, the HTML body could be parsed
    # for error indicators and False returned to stop the redirect.
    return True
# Follow the redirect chain, consulting the checker before every hop.
response = conditional_redirect_handler('https://httpbin.org/redirect/2', content_checker=safe_redirect_checker)
Handling Redirects with Authentication
When dealing with authenticated requests, you might need special redirect handling to preserve credentials:
import requests
from requests.auth import HTTPBasicAuth
def _is_trusted_host(url, trusted_domains):
    """Return True if the host of *url* is a trusted domain or a subdomain of one."""
    from urllib.parse import urlparse

    host = urlparse(url).hostname
    if not host:
        return False
    return any(host == domain or host.endswith('.' + domain)
               for domain in trusted_domains)


def authenticated_redirect_handler(url, auth, preserve_auth_domains=None):
    """Follow redirects manually, sending credentials only to trusted hosts.

    Args:
        url: Initial URL.
        auth: Authentication object passed to ``requests.get`` for trusted
            hosts.
        preserve_auth_domains: Domains that may receive the credentials. A
            URL qualifies when its host name equals one of these domains or
            is a subdomain of one. With no domains, credentials are never
            sent.

    Returns:
        The last response received: the first non-redirect response, a
        redirect without a Location header, or the response to the fifth
        request if all five were redirects.
    """
    # Normalise once: host names are compared in lower case.
    trusted_domains = [domain.lower().lstrip('.')
                       for domain in (preserve_auth_domains or []) if domain]
    current_url = url
    redirect_count = 0
    max_redirects = 5

    while redirect_count < max_redirects:
        # Decide on the parsed host name, not on a substring of the URL:
        # a substring test would also match 'https://evil.example/?x=httpbin.org'
        # or 'httpbin.org.evil.example' and leak the credentials.
        should_include_auth = _is_trusted_host(current_url, trusted_domains)
        request_auth = auth if should_include_auth else None

        response = requests.get(current_url, auth=request_auth, allow_redirects=False)
        if response.status_code not in [301, 302, 303, 307, 308]:
            return response

        location = response.headers.get('Location')
        if not location:
            return response

        # Location may be relative; resolve it against the current URL.
        next_url = requests.compat.urljoin(current_url, location)
        print(f"Redirect to: {next_url} (auth: {should_include_auth})")
        current_url = next_url
        redirect_count += 1

    return response
# Usage example: credentials are attached only for URLs that match
# preserve_auth_domains.
auth = HTTPBasicAuth('username', 'password')
response = authenticated_redirect_handler(
    'https://httpbin.org/redirect/2',
    auth=auth,
    preserve_auth_domains=['httpbin.org', 'trusted-domain.com'],
)
Best Practices for Custom Redirect Handling
1. Always Set Redirect Limits
Prevent infinite redirect loops by setting reasonable limits:
# Bad: relies on the implicit defaults (a limit of 30 redirects, no timeout)
response = requests.get(url, allow_redirects=True)
# Good: explicit redirect limit and timeout. The limit is a Session attribute;
# requests.get() does not accept a max_redirects argument. Exceeding the limit
# raises requests.exceptions.TooManyRedirects.
with requests.Session() as limited_session:
    limited_session.max_redirects = 10
    response = limited_session.get(url, allow_redirects=True, timeout=30)
2. Validate Redirect URLs
Always validate redirect destinations to prevent security issues:
def is_safe_redirect(url, allowed_schemes=('http', 'https')):
    """Validate that a redirect URL is safe to follow.

    Args:
        url: Candidate redirect URL.
        allowed_schemes: Collection of acceptable URL schemes (lower case).

    Returns:
        True only if *url* is a string that parses as an absolute URL with
        an allowed scheme and a host name. Relative URLs, host-less URLs
        such as ``https:///path`` and malformed URLs are rejected.
    """
    if not isinstance(url, str):
        return False
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises for malformed input such as an unclosed IPv6
        # literal; a URL that cannot be parsed is not safe to follow.
        return False
    # A scheme alone is not enough: without a host there is nothing to follow.
    return parsed.scheme in allowed_schemes and bool(parsed.hostname)
3. Log Redirect Chains
For debugging and monitoring, always log redirect activity:
import logging
# Configure the root logger at INFO level so the logger.info() calls in
# log_redirects are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
def log_redirects(url, max_redirects=30):
    """Log all redirects in a chain.

    Args:
        url: Initial URL to request.
        max_redirects: Maximum number of redirects to follow before giving
            up, so that a redirect loop cannot run forever.

    Returns:
        A list of ``(url, status_code)`` tuples, one per request made.
    """
    redirect_statuses = (301, 302, 303, 307, 308)
    redirect_chain = []
    current_url = url

    # One initial request plus at most max_redirects follow-ups.
    for _ in range(max_redirects + 1):
        response = requests.get(current_url, allow_redirects=False)
        redirect_chain.append((current_url, response.status_code))

        if response.status_code not in redirect_statuses:
            break

        location = response.headers.get('Location')
        if not location:
            break

        # Location may be relative; resolve it against the current URL.
        current_url = requests.compat.urljoin(current_url, location)
        # Lazy %-style arguments: the message is only formatted if emitted.
        logger.info("Redirect: %s -> %s", response.status_code, current_url)
    else:
        # The loop ran out without reaching a non-redirect response.
        logger.warning("Stopped after %d redirects; possible redirect loop",
                       max_redirects)

    return redirect_chain
Integration with Web Scraping Workflows
Custom redirect handling is particularly useful in web scraping workflows. The same concerns come up in browser automation, for example when you need to handle page redirections in Puppeteer or when building comprehensive solutions for monitoring network requests in Puppeteer.
Common Use Cases
- Security: Preventing redirects to untrusted domains
- Performance: Limiting redirect chains to avoid excessive requests
- Compliance: Logging all redirect activity for audit purposes
- Debugging: Understanding complex redirect flows
- Authentication: Managing credentials across redirect boundaries
Conclusion
Custom redirect handling in Python Requests provides powerful control over HTTP redirect behavior. Whether you need simple redirect prevention or sophisticated conditional logic, the techniques shown here allow you to implement robust, secure, and efficient redirect handling tailored to your specific requirements.
Remember to always validate redirect destinations, set appropriate limits, and log redirect activity for debugging and security purposes. These practices will help you build reliable web scraping and API integration solutions that handle redirects gracefully and securely.