What are the common HTTP status codes I should handle when scraping?
Understanding and properly handling HTTP status codes is crucial for building robust and reliable web scraping applications. HTTP status codes provide valuable information about the success or failure of your requests, helping you implement appropriate error handling, retry logic, and data validation strategies.
2xx Success Status Codes
200 OK
The most common success status code indicating that the request was successful and the server has returned the requested data.
import requests

response = requests.get('https://example.com')

if response.status_code == 200:
    print("Success! Scraping the content...")
    content = response.text
else:
    print(f"Request failed with status: {response.status_code}")
// Using the fetch API
fetch('https://example.com')
    .then(response => {
        if (response.status === 200) {
            console.log("Success! Processing content...");
            return response.text();
        } else {
            throw new Error(`HTTP ${response.status}: ${response.statusText}`);
        }
    })
    .catch(error => console.error('Error:', error));
201 Created
Indicates successful resource creation, commonly encountered when scraping APIs or submitting forms.
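For example, when submitting data via POST you can check for a 201 before continuing. A minimal sketch, assuming a hypothetical endpoint and payload:
import requests

# Hypothetical endpoint and payload, for illustration only
response = requests.post('https://api.example.com/items', json={'name': 'example'})

if response.status_code == 201:
    print("Resource created")
    # Many APIs return the new resource's URL in the Location header
    print(f"New resource: {response.headers.get('Location')}")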
204 No Content
The request was successful, but there's no content to return. This is important for API endpoints that confirm actions without returning data.
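Because a 204 response carries no body, avoid trying to parse it. A minimal sketch, using a hypothetical DELETE endpoint:
import requests

# Hypothetical endpoint, for illustration only
response = requests.delete('https://api.example.com/items/123')

if response.status_code == 204:
    print("Action confirmed - no body to parse")
else:
    data = response.json()  # Only parse when a body is expected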
3xx Redirection Status Codes
301 Moved Permanently
The resource has been permanently moved to a new URL. Most scraping libraries handle this automatically, but you should be aware of it for logging and URL management.
import requests

# requests follows redirects by default
response = requests.get('https://example.com/old-page')
print(f"Final URL: {response.url}")
print(f"Status: {response.status_code}")

# To disable automatic redirects
response = requests.get('https://example.com/old-page', allow_redirects=False)
if response.status_code == 301:
    new_url = response.headers['Location']
    print(f"Permanent redirect to: {new_url}")
302 Found (Temporary Redirect)
Indicates a temporary redirect. The original URL should be used for future requests.
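With requests, you can inspect response.history to see which redirects were followed and whether any were temporary. A minimal sketch:
import requests

response = requests.get('https://example.com/temporary-page')

# response.history lists the redirect responses that were followed
for hop in response.history:
    print(f"{hop.status_code}: {hop.url} -> {hop.headers.get('Location')}")

if any(hop.status_code == 302 for hop in response.history):
    print("Temporary redirect - keep requesting the original URL")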
304 Not Modified
Used with caching mechanisms. If you're implementing conditional requests, this indicates the content hasn't changed since your last request.
import requests

# Using the If-Modified-Since header for a conditional request
headers = {'If-Modified-Since': 'Wed, 21 Oct 2015 07:28:00 GMT'}
response = requests.get('https://example.com', headers=headers)

if response.status_code == 304:
    print("Content not modified, using cached version")
else:
    print("Content updated, processing new data")
4xx Client Error Status Codes
400 Bad Request
The server cannot process the request due to client error (malformed syntax, invalid parameters).
import requests
import time

def scrape_with_retry(url, max_retries=3):
    for attempt in range(max_retries):
        try:
            response = requests.get(url)
            if response.status_code == 400:
                print("Bad request - check URL and parameters")
                return None
            elif response.status_code == 200:
                return response.text
            else:
                print(f"Attempt {attempt + 1}: Status {response.status_code}")
        except requests.RequestException as e:
            print(f"Request error: {e}")
        time.sleep(2 ** attempt)  # Exponential backoff before the next attempt
    return None
401 Unauthorized
Authentication is required. You need to provide valid credentials.
import requests

# Basic authentication
response = requests.get('https://api.example.com/data',
                        auth=('username', 'password'))

if response.status_code == 401:
    print("Authentication failed - check credentials")
elif response.status_code == 200:
    data = response.json()
403 Forbidden
The server understands the request but refuses to authorize it. This often indicates:
- IP blocking
- Rate limiting
- Insufficient permissions
- Bot detection
import requests
import random
import time

def handle_forbidden_error(url):
    # Common strategies for 403 errors: rotate User-Agent and send browser-like headers
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
    ]
    headers = {
        'User-Agent': random.choice(user_agents),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 403:
        print("Still forbidden - may need a proxy or a different approach")
        time.sleep(60)  # Wait before retrying
    return response
404 Not Found
The requested resource doesn't exist. Important for handling broken links and outdated URLs.
async function scrapeWithErrorHandling(url) {
    try {
        const response = await fetch(url);
        switch (response.status) {
            case 200:
                return await response.text();
            case 404:
                console.log(`Page not found: ${url}`);
                return null;
            case 403:
                console.log(`Access forbidden: ${url}`);
                // Implement retry with different headers
                break;
            case 429:
                console.log(`Rate limited: ${url}`);
                // Implement exponential backoff
                break;
            default:
                console.log(`Unexpected status: ${response.status}`);
        }
    } catch (error) {
        console.error(`Network error: ${error.message}`);
    }
    return null;
}
429 Too Many Requests
Rate limiting is in effect. You need to slow down your requests.
import requests
import time

class RateLimitedScraper:
    def __init__(self, delay=1):
        self.delay = delay
        self.last_request_time = None

    def get(self, url):
        # Ensure a minimum delay between requests
        if self.last_request_time:
            elapsed = time.time() - self.last_request_time
            if elapsed < self.delay:
                time.sleep(self.delay - elapsed)

        response = requests.get(url)
        self.last_request_time = time.time()

        if response.status_code == 429:
            # Check the Retry-After header
            retry_after = response.headers.get('Retry-After')
            if retry_after:
                wait_time = int(retry_after)
                print(f"Rate limited. Waiting {wait_time} seconds...")
                time.sleep(wait_time)
                return self.get(url)  # Retry after waiting
            else:
                # Default exponential backoff
                wait_time = self.delay * 2
                print(f"Rate limited. Waiting {wait_time} seconds...")
                time.sleep(wait_time)
                self.delay = min(wait_time, 60)  # Cap the delay at 60 seconds
                return self.get(url)
        return response

# Usage
scraper = RateLimitedScraper(delay=2)
response = scraper.get('https://api.example.com/data')
5xx Server Error Status Codes
500 Internal Server Error
A generic server error. Often temporary and worth retrying.
502 Bad Gateway
The server received an invalid response from an upstream server. Common with CDNs and load balancers.
503 Service Unavailable
The server is temporarily unavailable, often due to maintenance or overload.
504 Gateway Timeout
The server didn't receive a response from an upstream server within the timeout period.
import requests
import time
import random

def robust_scraper(url, max_retries=5):
    retryable_codes = [500, 502, 503, 504]
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=30)
            if response.status_code == 200:
                return response.text
            elif response.status_code in retryable_codes:
                wait_time = (2 ** attempt) + random.uniform(0, 1)
                print(f"Server error {response.status_code}. Retrying in {wait_time:.1f}s...")
                time.sleep(wait_time)
            else:
                print(f"Non-retryable error: {response.status_code}")
                break
        except requests.Timeout:
            print(f"Timeout on attempt {attempt + 1}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)
        except requests.RequestException as e:
            print(f"Request exception: {e}")
            break
    return None
Comprehensive Error Handling Strategy
When dealing with complex authentication flows or handling errors in automated scraping, implementing a comprehensive status code handling strategy is essential:
import requests
import logging
from enum import Enum

class ScrapingResult(Enum):
    SUCCESS = "success"
    RETRY = "retry"
    SKIP = "skip"
    ABORT = "abort"

class StatusCodeHandler:
    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def handle_status_code(self, response):
        status_code = response.status_code

        # Success codes
        if 200 <= status_code < 300:
            return ScrapingResult.SUCCESS

        # Redirection codes
        elif 300 <= status_code < 400:
            self.logger.info(f"Redirect detected: {status_code}")
            return ScrapingResult.SUCCESS  # Most libraries handle these automatically

        # Client error codes
        elif status_code == 400:
            self.logger.error("Bad request - check parameters")
            return ScrapingResult.SKIP
        elif status_code == 401:
            self.logger.error("Authentication required")
            return ScrapingResult.ABORT
        elif status_code == 403:
            self.logger.warning("Access forbidden - possible bot detection")
            return ScrapingResult.RETRY
        elif status_code == 404:
            self.logger.warning("Page not found")
            return ScrapingResult.SKIP
        elif status_code == 429:
            self.logger.warning("Rate limited")
            return ScrapingResult.RETRY

        # Server error codes
        elif 500 <= status_code < 600:
            self.logger.warning(f"Server error: {status_code}")
            return ScrapingResult.RETRY
        else:
            self.logger.error(f"Unexpected status code: {status_code}")
            return ScrapingResult.SKIP

# Usage example
handler = StatusCodeHandler()
response = requests.get('https://example.com')
result = handler.handle_status_code(response)

if result == ScrapingResult.SUCCESS:
    # Process the content
    data = response.text
elif result == ScrapingResult.RETRY:
    # Implement retry logic
    pass
elif result == ScrapingResult.SKIP:
    # Log and move to the next URL
    pass
elif result == ScrapingResult.ABORT:
    # Stop scraping entirely
    pass
Best Practices for Status Code Handling
1. Implement Exponential Backoff
For retryable errors, use exponential backoff to avoid overwhelming the server:
import random
import time

def exponential_backoff_retry(func, max_retries=5, base_delay=1):
    for attempt in range(max_retries):
        try:
            result = func()
            if result:
                return result
        except Exception as e:
            if attempt == max_retries - 1:
                raise e
        delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
        time.sleep(min(delay, 60))  # Cap the delay at 60 seconds
2. Log Status Codes for Analysis
Keep track of status codes to identify patterns and optimize your scraping strategy:
import collections
from datetime import datetime

class StatusCodeTracker:
    def __init__(self):
        self.status_counts = collections.defaultdict(int)
        self.status_log = []

    def record_status(self, url, status_code):
        self.status_counts[status_code] += 1
        self.status_log.append({
            'url': url,
            'status_code': status_code,
            'timestamp': datetime.now()
        })

    def get_stats(self):
        total_requests = sum(self.status_counts.values())
        return {
            'total_requests': total_requests,
            'status_distribution': dict(self.status_counts),
            'success_rate': self.status_counts[200] / total_requests if total_requests > 0 else 0
        }
3. Respect Retry-After Headers
When encountering 429 or 503 status codes, check for Retry-After headers:
import time
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime

def handle_retry_after(response):
    if response.status_code in [429, 503]:
        retry_after = response.headers.get('Retry-After')
        if retry_after:
            try:
                # Retry-After can be a number of seconds...
                wait_time = int(retry_after)
            except ValueError:
                # ...or an HTTP date; compare against an aware "now" to avoid mixing naive and aware datetimes
                retry_date = parsedate_to_datetime(retry_after)
                wait_time = (retry_date - datetime.now(timezone.utc)).total_seconds()
            print(f"Waiting {wait_time} seconds as requested by server")
            time.sleep(max(0, wait_time))
            return True
    return False
Conclusion
Proper HTTP status code handling is fundamental to successful web scraping. By understanding the meaning of different status codes and implementing appropriate response strategies, you can build more reliable and respectful scraping applications. Remember to always respect rate limits, implement proper retry logic, and monitor your scraping operations for optimal performance.
The key is to treat different status codes appropriately: retry server errors and rate limits, skip client errors like 404, and abort on authentication issues. Combined with proper logging and monitoring, this approach will help you build robust scraping systems that can handle the various challenges of web scraping at scale.