Robust error handling is essential when using lxml for web scraping. HTML documents can be malformed, elements may not exist, and network issues can occur. Here's how to handle common exceptions and build resilient scraping scripts.
Common lxml Exceptions
1. XMLSyntaxError - Malformed Documents
XMLSyntaxError occurs when parsing severely malformed HTML/XML documents.
from lxml import etree

def parse_html_safely(html_content):
    try:
        tree = etree.HTML(html_content)
        return tree
    except etree.XMLSyntaxError as e:
        print(f"Parse error: {e}")
        # Try again with the HTML parser's recover mode
        try:
            parser = etree.HTMLParser(recover=True)
            tree = etree.HTML(html_content, parser)
            return tree
        except etree.XMLSyntaxError:
            print("Unable to parse HTML even with recovery mode")
            return None
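A quick sanity check of the helper on a deliberately truncated snippet; note that lxml's default HTML parser already recovers from most damage, so the explicit fallback only matters for severe cases:

# Example usage: a truncated snippet still yields a usable tree
tree = parse_html_safely("<html><body><p>Hello")
if tree is not None:
    print(tree.findtext('.//p'))  # Hello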
2. XPathEvalError - Invalid XPath Expressions
XPath syntax errors can crash your script if not handled properly.
def safe_xpath(tree, xpath_expression):
    try:
        result = tree.xpath(xpath_expression)
        return result
    except etree.XPathEvalError as e:
        print(f"XPath error: {e}")
        return []

# Example usage
tree = etree.HTML("<html><body><div>Content</div></body></html>")
elements = safe_xpath(tree, '//div[@class="content"]')  # Safe XPath call
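To see the guard in action, a deliberately malformed expression simply returns an empty list instead of raising:

# Malformed predicate: the XPathEvalError is caught inside safe_xpath
broken = safe_xpath(tree, '//div[@class=')
print(broken)  # [] (the error message is printed by safe_xpath)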
3. Element Not Found Handling
lxml's find() returns None for missing elements, which causes an AttributeError if you access the result without checking.
def extract_text_safely(tree, xpath):
    """Extract text with proper None checking"""
    element = tree.find(xpath)
    if element is not None:
        return element.text or ""  # Handle None text
    return ""

def extract_attribute_safely(tree, xpath, attr):
    """Extract attribute with proper None checking"""
    element = tree.find(xpath)
    if element is not None:
        return element.get(attr, "")  # Default to empty string
    return ""

# Example usage
tree = etree.HTML("<html><body><h1>Title</h1></body></html>")
title = extract_text_safely(tree, './/h1')
link_href = extract_attribute_safely(tree, './/a', 'href')
Network-Related Error Handling
Complete HTTP Request Handling
import requests
from lxml import etree
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def create_session_with_retries():
    """Create session with retry strategy"""
    session = requests.Session()
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session

def scrape_url_safely(url):
    """Comprehensive error handling for web scraping"""
    session = create_session_with_retries()
    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()
        # Parse HTML
        tree = etree.HTML(response.content)
        return tree
    except requests.exceptions.Timeout:
        print(f"Timeout error for {url}")
    except requests.exceptions.HTTPError as e:
        print(f"HTTP error {e.response.status_code} for {url}")
    except requests.exceptions.ConnectionError:
        print(f"Connection error for {url}")
    except requests.exceptions.RequestException as e:
        print(f"Request error for {url}: {e}")
    except etree.XMLSyntaxError as e:
        print(f"Parse error for {url}: {e}")
    return None
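A minimal usage sketch; the URL is only an example (https://httpbin.org/html is a common test page), and a None result signals that one of the handled errors occurred:

# Example usage
tree = scrape_url_safely("https://httpbin.org/html")
if tree is not None:
    print(tree.findtext('.//h1'))
else:
    print("Scrape failed; see the printed error above")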
Encoding and Text Processing
Handling Encoding Issues
def parse_with_encoding_fallback(content):
    """Try different encodings if parsing fails"""
    # latin-1 decodes any byte sequence, so keep it last as a catch-all
    encodings = ['utf-8', 'cp1252', 'latin-1']
    for encoding in encodings:
        try:
            if isinstance(content, bytes):
                decoded_content = content.decode(encoding)
            else:
                decoded_content = content
            # The content is already a str here, so a plain parse is enough
            tree = etree.HTML(decoded_content)
            return tree
        except (UnicodeDecodeError, etree.ParserError, ValueError):
            continue
    print("Failed to parse content with any encoding")
    return None
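For illustration, here is the fallback applied to bytes that are not valid UTF-8: the first decode attempt fails and the loop falls through to a single-byte encoding.

# "café" encoded as Latin-1 contains a byte that is invalid UTF-8
raw = "<html><body><p>café</p></body></html>".encode("latin-1")
tree = parse_with_encoding_fallback(raw)
if tree is not None:
    print(tree.findtext('.//p'))  # café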
Safe Text Extraction
def extract_clean_text(element):
    """Extract and clean text from element"""
    if element is None:
        return ""
    try:
        # lxml.html elements provide text_content(); plain etree elements
        # need itertext() to collect text from descendants as well
        if hasattr(element, 'text_content'):
            text = element.text_content()
        else:
            text = ''.join(element.itertext())
        # Collapse runs of whitespace
        return ' '.join(text.split()) if text else ""
    except Exception as e:
        print(f"Error extracting text: {e}")
        return ""
Comprehensive Web Scraper with Error Handling
import time
import requests
from lxml import etree
import logging
from urllib.parse import urljoin

class RobustScraper:
    def __init__(self, delay=1):
        self.session = self.create_session()
        self.delay = delay
        self.setup_logging()

    def setup_logging(self):
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def create_session(self):
        session = requests.Session()
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        return session

    def scrape_page(self, url):
        """Scrape a single page with comprehensive error handling"""
        try:
            # Rate limiting
            time.sleep(self.delay)
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            tree = etree.HTML(response.content)
            if tree is None:
                self.logger.error(f"Failed to parse HTML for {url}")
                return None
            return self.extract_data(tree, url)
        except requests.exceptions.RequestException as e:
            self.logger.error(f"Request failed for {url}: {e}")
        except etree.XMLSyntaxError as e:
            self.logger.error(f"Parse error for {url}: {e}")
        except Exception as e:
            self.logger.error(f"Unexpected error for {url}: {e}")
        return None

    def extract_data(self, tree, base_url):
        """Extract data with safe methods"""
        data = {}
        # Safe title extraction
        title_elem = tree.find('.//title')
        data['title'] = title_elem.text if title_elem is not None else "No title"
        # Safe link extraction with URL joining
        links = []
        for link in tree.xpath('.//a[@href]'):
            href = link.get('href')
            if href:
                absolute_url = urljoin(base_url, href)
                links.append({
                    'text': extract_clean_text(link),
                    'url': absolute_url
                })
        data['links'] = links
        return data

# Usage example
scraper = RobustScraper(delay=1)
urls = ['https://example.com', 'https://httpbin.org/html']
for url in urls:
    result = scraper.scrape_page(url)
    if result:
        print(f"Successfully scraped {url}")
    else:
        print(f"Failed to scrape {url}")
Best Practices Summary
- Always check for None: elements, text, and attributes can be None
- Use try-except blocks: wrap parsing and XPath operations
- Implement retries: handle temporary network failures
- Set timeouts: prevent hanging requests
- Handle encoding: try multiple encodings if needed
- Log errors: track failures for debugging
- Rate-limit requests: respect server resources
- Degrade gracefully: continue processing even if some elements fail (see the sketch below)
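As one illustration of graceful degradation, the sketch below reuses the extract_text_safely and extract_attribute_safely helpers defined earlier and assumes a hypothetical div.item / data-price markup: any record that fails is logged and skipped rather than aborting the whole run.

import logging

def extract_items(tree):
    """Collect what can be parsed; skip records that fail instead of aborting."""
    items = []
    for node in tree.xpath('//div[@class="item"]'):  # hypothetical item markup
        try:
            items.append({
                'name': extract_text_safely(node, './/h2'),
                'price': extract_attribute_safely(node, './/span', 'data-price'),
            })
        except Exception as e:
            logging.warning(f"Skipping malformed item: {e}")
            continue
    return items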
By implementing these error handling patterns, your lxml-based web scrapers will be more reliable and maintainable in production environments.