Proper exception handling is essential for building robust web scraping applications with Beautiful Soup. This guide covers the most common exceptions you'll encounter and provides practical ways to handle them.
Common Exception Types in Beautiful Soup
Web scraping with Beautiful Soup involves several layers, including the network request, HTML parsing, and element access, and each layer can raise its own exceptions. Understanding these exception types helps you build more resilient scrapers.
1. Network and HTTP Exceptions
These exceptions come from the requests library when fetching web pages:
import requests
from requests.exceptions import HTTPError, ConnectionError, Timeout, RequestException
# Common HTTP exceptions
try:
response = requests.get('https://example.com', timeout=10)
response.raise_for_status()
except HTTPError as e:
print(f"HTTP error {e.response.status_code}: {e}")
except ConnectionError:
print("Network connection failed")
except Timeout:
print("Request timed out")
except RequestException as e:
print(f"Request failed: {e}")
2. Beautiful Soup Parsing Exceptions
These occur when parsing HTML content or accessing elements:
from bs4 import BeautifulSoup
# AttributeError when element doesn't exist
try:
soup = BeautifulSoup(html_content, 'html.parser')
title = soup.find('h1').get_text() # AttributeError if h1 not found
except AttributeError:
title = "No title found"
# IndexError when accessing list elements
try:
paragraphs = soup.find_all('p')
first_paragraph = paragraphs[0].get_text() # IndexError if no paragraphs
except IndexError:
first_paragraph = "No paragraphs found"
3. Parser-Specific Exceptions
Different parsers can raise different exceptions:
from bs4 import BeautifulSoup
from bs4.builder import ParserRejectedMarkup
try:
soup = BeautifulSoup(malformed_html, 'lxml')
except ParserRejectedMarkup:
# Fallback to a more permissive parser
soup = BeautifulSoup(malformed_html, 'html.parser')
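Beautiful Soup also raises FeatureNotFound when the requested parser library (such as lxml) is not installed, so a fallback helper can cover both cases by trying a list of parsers in order. The helper below is an illustrative sketch; the function name and parser order are just one reasonable choice:
from bs4 import BeautifulSoup, FeatureNotFound
from bs4.builder import ParserRejectedMarkup

def parse_with_fallback(markup, parsers=('lxml', 'html.parser')):
    """Try each parser in order; 'html.parser' ships with Python and needs no install."""
    last_error = None
    for parser in parsers:
        try:
            return BeautifulSoup(markup, parser)
        except (ParserRejectedMarkup, FeatureNotFound) as e:
            last_error = e  # remember why this parser failed and try the next one
    raise ValueError(f"No parser could handle the markup: {last_error}")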
Comprehensive Exception Handling Example
Here's a complete example demonstrating robust exception handling:
import requests
from bs4 import BeautifulSoup
from requests.exceptions import HTTPError, ConnectionError, Timeout, RequestException
import logging
import time
def scrape_with_error_handling(url, max_retries=3):
"""
Scrape a webpage with comprehensive error handling and retry logic.
"""
for attempt in range(max_retries):
try:
# Make HTTP request with timeout
response = requests.get(
url,
timeout=10,
headers={'User-Agent': 'Mozilla/5.0 (compatible; scraper)'}
)
response.raise_for_status()
# Parse HTML content
soup = BeautifulSoup(response.text, 'html.parser')
# Extract data with safe methods
data = extract_data_safely(soup)
return data
except HTTPError as e:
status_code = e.response.status_code
if status_code == 404:
logging.error(f"Page not found: {url}")
return None # Don't retry for 404s
elif status_code == 429:
logging.warning(f"Rate limited. Waiting before retry {attempt + 1}")
time.sleep(2 ** attempt) # Exponential backoff
else:
logging.error(f"HTTP error {status_code}: {e}")
except ConnectionError:
logging.error(f"Connection failed for {url}")
time.sleep(1) # Brief pause before retry
except Timeout:
logging.error(f"Request timeout for {url}")
time.sleep(1)
except RequestException as e:
logging.error(f"Request failed: {e}")
except Exception as e:
logging.error(f"Unexpected error: {e}")
# Wait before retry (except on last attempt)
if attempt < max_retries - 1:
time.sleep(1)
logging.error(f"Failed to scrape {url} after {max_retries} attempts")
return None
def extract_data_safely(soup):
"""
Extract data from soup with safe methods to avoid exceptions.
"""
data = {}
# Safe text extraction
title_elem = soup.find('h1')
data['title'] = title_elem.get_text().strip() if title_elem else 'No title'
# Safe attribute access
meta_desc = soup.find('meta', attrs={'name': 'description'})
data['description'] = meta_desc.get('content', '') if meta_desc else ''
# Safe list access
paragraphs = soup.find_all('p')
data['paragraphs'] = [p.get_text().strip() for p in paragraphs[:5]] # First 5 only
# Safe nested element access
nav_links = []
nav = soup.find('nav')
if nav:
links = nav.find_all('a', href=True)
nav_links = [{'text': link.get_text().strip(), 'href': link['href']}
for link in links if link.get_text().strip()]
data['nav_links'] = nav_links
return data
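Callers of the scraper above only need to check for the None value that signals a permanent failure. This usage example is hypothetical and assumes the two functions defined above:
result = scrape_with_error_handling('https://example.com/articles')
if result is None:
    logging.error("Scrape failed permanently; skipping this URL")
else:
    print(result['title'])
    print(f"Extracted {len(result['paragraphs'])} paragraphs")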
Advanced Exception Handling Patterns
1. Context Managers for Resource Management
from contextlib import contextmanager
import requests
@contextmanager
def safe_request_session():
"""Context manager for safe HTTP sessions."""
session = requests.Session()
try:
yield session
except Exception as e:
logging.error(f"Session error: {e}")
raise
finally:
session.close()
# Usage
with safe_request_session() as session:
response = session.get('https://example.com')
soup = BeautifulSoup(response.text, 'html.parser')
2. Defensive Programming with Beautiful Soup
def safe_find_text(soup, selector, default=''):
"""Safely find and extract text from an element."""
try:
element = soup.select_one(selector)
return element.get_text().strip() if element else default
except (AttributeError, TypeError):
return default
def safe_find_attribute(soup, selector, attr, default=''):
"""Safely extract attribute value from an element."""
try:
element = soup.select_one(selector)
return element.get(attr, default) if element else default
except (AttributeError, TypeError):
return default
# Usage
title = safe_find_text(soup, 'h1', 'No title found')
image_url = safe_find_attribute(soup, 'img.main', 'src', '')
3. Retry Logic with Exponential Backoff
import time
import random
import requests
from functools import wraps
from requests.exceptions import ConnectionError, Timeout
from bs4 import BeautifulSoup
def retry_with_backoff(max_retries=3, base_delay=1):
"""Decorator for retrying functions with exponential backoff."""
def decorator(func):
@wraps(func)
def wrapper(*args, **kwargs):
for attempt in range(max_retries):
try:
return func(*args, **kwargs)
except (ConnectionError, Timeout) as e:
if attempt == max_retries - 1:
raise e
# Exponential backoff with jitter
delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
time.sleep(delay)
return None
return wrapper
return decorator
@retry_with_backoff(max_retries=3)
def fetch_and_parse(url):
response = requests.get(url, timeout=10)
response.raise_for_status()
return BeautifulSoup(response.text, 'html.parser')
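Once the retries are exhausted, the decorator re-raises the last ConnectionError or Timeout, so callers should still guard the call. A minimal usage sketch for the decorated function above:
try:
    soup = fetch_and_parse('https://example.com')
    print(soup.title.get_text(strip=True) if soup.title else 'No title')
except (ConnectionError, Timeout):
    print("Giving up after repeated network failures")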
Best Practices for Exception Handling
1. Use Specific Exception Types
# Good: Specific exception handling
try:
soup.find('h1').get_text()
except AttributeError:
# Handle missing element specifically
pass
# Avoid: Catching all exceptions
try:
soup.find('h1').get_text()
except Exception:
# Too broad, masks other issues
pass
2. Implement Proper Logging
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
try:
    response = requests.get(url, timeout=10)
response.raise_for_status()
except HTTPError as e:
logger.error(f"HTTP error {e.response.status_code} for {url}: {e}")
except ConnectionError:
logger.error(f"Connection failed for {url}")
3. Graceful Degradation
def scrape_product(soup):
"""Extract product data with graceful degradation."""
product = {}
# Essential data - fail if missing
try:
product['name'] = soup.find('h1', class_='product-title').get_text().strip()
except AttributeError:
raise ValueError("Product name not found")
# Optional data - continue if missing
try:
product['price'] = soup.find('span', class_='price').get_text().strip()
except AttributeError:
product['price'] = 'Price not available'
return product
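With this split, a ValueError means the page is not a usable product page, while missing optional fields degrade quietly. A hypothetical call site:
try:
    product = scrape_product(soup)
    print(f"{product['name']}: {product['price']}")
except ValueError as e:
    print(f"Skipping page: {e}")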
4. Validate Data After Extraction
def validate_scraped_data(data):
"""Validate scraped data before processing."""
required_fields = ['title', 'url']
for field in required_fields:
if not data.get(field):
raise ValueError(f"Missing required field: {field}")
# Validate URL format
if not data['url'].startswith(('http://', 'https://')):
raise ValueError(f"Invalid URL format: {data['url']}")
return True
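Validation pairs naturally with extraction: run it right after scraping and treat a ValueError as a signal to discard or re-fetch the record. A hypothetical example with made-up data:
scraped = {'title': 'Example Article', 'url': 'https://example.com/article'}
try:
    validate_scraped_data(scraped)
    # The record passed validation; safe to store or process it here.
except ValueError as e:
    print(f"Discarding record: {e}")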
By implementing these exception handling patterns, you can build Beautiful Soup scrapers that are robust, maintainable, and capable of handling the unpredictable nature of web content. Remember to always log errors appropriately and implement retry logic for transient failures.