What are the common HTTP errors in Python web scraping and how do I handle them?
HTTP errors are an inevitable part of web scraping. Whether you're dealing with missing pages, rate limits, or server issues, understanding how to handle these errors gracefully is crucial for building robust scraping applications. This guide covers the most common HTTP errors you'll encounter in Python web scraping and provides practical solutions for handling them.
Common HTTP Status Codes in Web Scraping
1. 404 Not Found
The 404 error occurs when the requested resource doesn't exist on the server. This is common when scraping dynamic URLs or when pages have been moved or deleted.
import requests
from requests.exceptions import HTTPError

def handle_404_error(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raises HTTPError for bad (4xx/5xx) responses
        return response
    except HTTPError as e:
        if e.response.status_code == 404:
            print(f"Page not found: {url}")
            return None
        else:
            raise

# Example usage
url = "https://example.com/non-existent-page"
result = handle_404_error(url)
2. 403 Forbidden
A 403 error indicates that the server understood the request but refuses to authorize it. This often happens when websites detect automated scraping behavior.
import requests
import time

def handle_403_with_retry(url, max_retries=3):
    session = requests.Session()
    # Add a User-Agent header to appear more like a real browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    session.headers.update(headers)

    for attempt in range(max_retries):
        try:
            response = session.get(url)
            if response.status_code == 403:
                print(f"Access forbidden (attempt {attempt + 1}). Waiting before retry...")
                time.sleep(2 ** attempt)  # Exponential backoff
                continue
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}")
            if attempt == max_retries - 1:
                raise
    return None
3. 429 Too Many Requests
The 429 status code means the server is rate-limiting your requests. Handling it correctly is crucial to avoid being blocked permanently.
import requests
import time
from requests.exceptions import HTTPError

def handle_rate_limiting(url, max_retries=5):
    for attempt in range(max_retries):
        try:
            response = requests.get(url)
            if response.status_code == 429:
                # Check for a Retry-After header
                retry_after = response.headers.get('Retry-After')
                if retry_after:
                    wait_time = int(retry_after)
                else:
                    # Fall back to exponential backoff if no Retry-After header
                    wait_time = (2 ** attempt) + 1
                print(f"Rate limited. Waiting {wait_time} seconds...")
                time.sleep(wait_time)
                continue
            response.raise_for_status()
            return response
        except HTTPError:
            if attempt == max_retries - 1:
                raise
            time.sleep(2 ** attempt)
    return None
4. 500 Internal Server Error
Server errors (5xx) indicate problems on the server side. These are often temporary and can be resolved with retry logic.
import requests
import time
from requests.exceptions import HTTPError

def handle_server_errors(url, max_retries=3):
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return response
        except HTTPError as e:
            status_code = e.response.status_code
            if 500 <= status_code < 600 and attempt < max_retries - 1:
                wait_time = (2 ** attempt) + 1
                print(f"Server error {status_code}. Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
                continue
            raise
        except requests.exceptions.Timeout:
            print(f"Request timeout (attempt {attempt + 1})")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)
                continue
            raise
    return None
Comprehensive Error Handling Strategy
Here's a robust error handling class that combines all the above strategies:
import requests
import time
import logging
from requests.exceptions import (
    HTTPError, ConnectionError, Timeout,
    RequestException, TooManyRedirects
)
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

class RobustScraper:
    def __init__(self, max_retries=3, backoff_factor=1, timeout=10):
        self.session = requests.Session()
        self.max_retries = max_retries
        self.backoff_factor = backoff_factor
        self.timeout = timeout

        # Configure retry strategy
        retry_strategy = Retry(
            total=max_retries,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"],  # "method_whitelist" in urllib3 < 1.26
            backoff_factor=backoff_factor
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)

        # Set default headers
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        # Set up logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def get(self, url, **kwargs):
        """Robust GET request with comprehensive error handling."""
        try:
            response = self.session.get(url, timeout=self.timeout, **kwargs)
            response.raise_for_status()
            return response
        except HTTPError as e:
            status_code = e.response.status_code
            if status_code == 404:
                self.logger.warning(f"Page not found: {url}")
                return None
            elif status_code == 403:
                self.logger.warning(f"Access forbidden: {url}")
                return None
            elif status_code == 429:
                self.logger.warning(f"Rate limited: {url}")
                return None
            else:
                self.logger.error(f"HTTP error {status_code}: {url}")
                raise
        except ConnectionError as e:
            self.logger.error(f"Connection error for {url}: {e}")
            raise
        except Timeout as e:
            self.logger.error(f"Timeout error for {url}: {e}")
            raise
        except TooManyRedirects as e:
            self.logger.error(f"Too many redirects for {url}: {e}")
            raise
        except RequestException as e:
            self.logger.error(f"Request exception for {url}: {e}")
            raise

# Usage example
scraper = RobustScraper(max_retries=3, backoff_factor=2)
urls = [
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/non-existent"
]

for url in urls:
    try:
        response = scraper.get(url)
        if response:
            print(f"Successfully scraped {url}: {len(response.content)} bytes")
    except Exception as e:
        print(f"Failed to scrape {url}: {e}")
Handling Specific Error Scenarios
Proxy-Related Errors
When using proxies, you might encounter additional connection issues:
import requests
from requests.exceptions import ProxyError, ConnectTimeout

def scrape_with_proxy(url, proxy_list):
    for proxy in proxy_list:
        try:
            proxies = {
                'http': f'http://{proxy}',
                'https': f'https://{proxy}'
            }
            response = requests.get(url, proxies=proxies, timeout=10)
            response.raise_for_status()
            return response
        except (ProxyError, ConnectTimeout) as e:
            print(f"Proxy {proxy} failed: {e}")
            continue
        except Exception as e:
            print(f"Unexpected error with proxy {proxy}: {e}")
            continue
    raise Exception("All proxies failed")
SSL Certificate Errors
Some websites have SSL certificate issues that need special handling:
import requests
from requests.exceptions import SSLError
import urllib3

# Disable SSL warnings (use cautiously)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def handle_ssl_errors(url):
    try:
        # Try with SSL verification first
        response = requests.get(url, verify=True, timeout=10)
        response.raise_for_status()
        return response
    except SSLError as e:
        print(f"SSL error encountered: {e}")
        try:
            # Retry without SSL verification (not recommended for production)
            response = requests.get(url, verify=False, timeout=10)
            response.raise_for_status()
            return response
        except Exception as e:
            print(f"Request failed even without SSL verification: {e}")
            raise
Best Practices for Error Handling
1. Implement Exponential Backoff
Always use exponential backoff when retrying failed requests to avoid overwhelming the server:
import time
import random

import requests

def exponential_backoff(attempt, base_delay=1, max_delay=60):
    """Calculate a delay with jitter to avoid the thundering herd problem."""
    delay = min(base_delay * (2 ** attempt), max_delay)
    jitter = random.uniform(0, 0.1 * delay)
    return delay + jitter

# Usage in a retry loop (url and max_retries defined elsewhere)
for attempt in range(max_retries):
    try:
        response = requests.get(url)
        break
    except Exception as e:
        if attempt < max_retries - 1:
            delay = exponential_backoff(attempt)
            time.sleep(delay)
2. Monitor Error Rates
Keep track of error rates to identify problematic websites or patterns:
from collections import defaultdict
import time

class ErrorTracker:
    def __init__(self):
        self.errors = defaultdict(int)
        self.total_requests = 0
        self.start_time = time.time()

    def record_error(self, error_type, url=None):
        self.errors[error_type] += 1

    def record_request(self):
        self.total_requests += 1

    def get_error_rate(self):
        if self.total_requests == 0:
            return 0
        return sum(self.errors.values()) / self.total_requests

    def print_summary(self):
        runtime = time.time() - self.start_time
        print(f"Runtime: {runtime:.2f} seconds")
        print(f"Total requests: {self.total_requests}")
        print(f"Error rate: {self.get_error_rate():.2%}")
        for error_type, count in self.errors.items():
            print(f"  {error_type}: {count}")
3. Respect robots.txt and Rate Limits
Always check the website's robots.txt file and add appropriate delays between requests. For complex authentication flows or JavaScript-heavy sites, consider tools like Puppeteer for handling authentication, which can offer more robust error handling for dynamic content.
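As a concrete starting point, Python's standard urllib.robotparser module can check whether a URL may be fetched and whether the site declares a crawl delay. The sketch below is a minimal example, assuming a hypothetical user agent string and example.com placeholder URLs, and falling back to a one-second delay when no crawl delay is declared:

import time
from urllib.robotparser import RobotFileParser

import requests

USER_AGENT = "MyScraperBot/1.0"  # hypothetical user agent string

# Load and parse the site's robots.txt
parser = RobotFileParser()
parser.set_url("https://example.com/robots.txt")
parser.read()

url = "https://example.com/some-page"
if parser.can_fetch(USER_AGENT, url):
    # Honor the declared crawl delay, or fall back to a conservative 1-second default
    delay = parser.crawl_delay(USER_AGENT) or 1
    time.sleep(delay)
    response = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=10)
else:
    print(f"robots.txt disallows fetching {url}")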
Connection and Network Errors
Beyond HTTP status codes, you'll encounter various network-level errors:
Connection Timeout Errors
import requests
from requests.exceptions import ConnectTimeout, ReadTimeout

def handle_timeouts(url, connect_timeout=5, read_timeout=30):
    try:
        response = requests.get(
            url,
            timeout=(connect_timeout, read_timeout)
        )
        return response
    except ConnectTimeout:
        print(f"Connection timeout for {url}")
        return None
    except ReadTimeout:
        print(f"Read timeout for {url}")
        return None
DNS Resolution Errors
import requests
from requests.exceptions import ConnectionError

def handle_dns_errors(url):
    try:
        response = requests.get(url, timeout=10)
        return response
    except ConnectionError as e:
        if "Name or service not known" in str(e):
            print(f"DNS resolution failed for {url}")
        elif "Connection refused" in str(e):
            print(f"Connection refused by {url}")
        else:
            print(f"Connection error: {e}")
        return None
Advanced Error Handling Patterns
Circuit Breaker Pattern
Implement a circuit breaker to stop making requests to a failing service:
import time
from enum import Enum

import requests

class CircuitState(Enum):
    CLOSED = 1
    OPEN = 2
    HALF_OPEN = 3

class CircuitBreaker:
    def __init__(self, failure_threshold=5, recovery_timeout=60):
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
        self.failure_count = 0
        self.last_failure_time = None
        self.state = CircuitState.CLOSED

    def call(self, func, *args, **kwargs):
        if self.state == CircuitState.OPEN:
            if time.time() - self.last_failure_time > self.recovery_timeout:
                self.state = CircuitState.HALF_OPEN
            else:
                raise Exception("Circuit breaker is OPEN")
        try:
            result = func(*args, **kwargs)
            self.on_success()
            return result
        except Exception:
            self.on_failure()
            raise

    def on_success(self):
        self.failure_count = 0
        self.state = CircuitState.CLOSED

    def on_failure(self):
        self.failure_count += 1
        self.last_failure_time = time.time()
        if self.failure_count >= self.failure_threshold:
            self.state = CircuitState.OPEN

# Usage
breaker = CircuitBreaker(failure_threshold=3, recovery_timeout=30)

def make_request(url):
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return response

try:
    response = breaker.call(make_request, "https://example.com")
except Exception as e:
    print(f"Request failed: {e}")
Logging and Monitoring
Proper logging is essential for debugging and monitoring your scrapers:
import logging
import sys
from datetime import datetime

import requests

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('scraper.log'),
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)

def scrape_with_logging(url):
    logger.info(f"Starting scrape for {url}")
    start_time = datetime.now()
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        duration = (datetime.now() - start_time).total_seconds()
        logger.info(f"Successfully scraped {url} in {duration:.2f}s")
        return response
    except requests.exceptions.HTTPError as e:
        logger.error(f"HTTP error for {url}: {e}")
        raise
    except requests.exceptions.ConnectionError as e:
        logger.error(f"Connection error for {url}: {e}")
        raise
    except requests.exceptions.Timeout as e:
        logger.error(f"Timeout error for {url}: {e}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error for {url}: {e}")
        raise
Conclusion
Effective error handling in Python web scraping requires a multi-layered approach. By implementing proper retry logic, exponential backoff, circuit breakers, and comprehensive error logging, you can build resilient scrapers that handle various HTTP errors gracefully. Remember to always respect website terms of service and implement reasonable delays between requests to maintain good web citizenship.
For more complex scenarios involving JavaScript-heavy websites, you might need to consider browser automation tools that can handle timeouts and errors more effectively when dealing with dynamic content loading.
The key is to anticipate errors, handle them gracefully, and always have fallback strategies in place to ensure your scraping operations remain robust and reliable.