How do I handle errors and exceptions when using lxml for web scraping?

Robust error handling is essential when using lxml for web scraping. HTML documents can be malformed, elements may not exist, and network issues can occur. Here's how to handle common exceptions and build resilient scraping scripts.

Common lxml Exceptions

1. XMLSyntaxError - Malformed Documents

When parsing XML strictly, XMLSyntaxError is raised for any malformed markup. The HTML parser used by etree.HTML recovers from most broken HTML by default, so for HTML the error typically appears only for unusable input (for example, an empty document) or when recover=False is set.

from lxml import etree

def parse_html_safely(html_content):
    """Parse HTML, returning None if no usable tree can be built."""
    try:
        # The default HTML parser already runs with recover=True,
        # so most malformed markup still produces a tree
        tree = etree.HTML(html_content)
        if tree is None:
            print("Parser could not build a tree from the input")
        return tree
    except etree.XMLSyntaxError as e:
        # Raised when the input cannot be parsed at all,
        # e.g. an empty document
        print(f"Parse error: {e}")
        return None
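
A quick check of both paths shows how this behaves:

# Broken but recoverable markup still produces a tree
tree = parse_html_safely("<div><p>Unclosed tags")
print(tree is not None)  # True

# Empty input cannot produce a tree
print(parse_html_safely(""))  # None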

2. XPathEvalError - Invalid XPath Expressions

An invalid XPath expression raises XPathEvalError, which will crash your script if not handled.

def safe_xpath(tree, xpath_expression):
    try:
        result = tree.xpath(xpath_expression)
        return result
    except etree.XPathEvalError as e:
        print(f"XPath error: {e}")
        return []

# Example usage
tree = etree.HTML("<html><body><div>Content</div></body></html>")
elements = safe_xpath(tree, '//div[@class="content"]')  # Safe XPath call
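
An intentionally invalid expression shows the handler in action: instead of raising, safe_xpath logs the error and returns an empty list.

# Unterminated predicate triggers XPathEvalError, which is caught
broken = safe_xpath(tree, '//div[@class="content"')
print(broken)  # []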

3. Element Not Found Handling

The find() method returns None for missing elements, so accessing .text or an attribute on the result without checking raises AttributeError (xpath() returns an empty list instead).

def extract_text_safely(tree, xpath):
    """Extract text with proper None checking"""
    element = tree.find(xpath)
    if element is not None:
        return element.text or ""  # Handle None text
    return ""

def extract_attribute_safely(tree, xpath, attr):
    """Extract attribute with proper None checking"""
    element = tree.find(xpath)
    if element is not None:
        return element.get(attr, "")  # Default to empty string
    return ""

# Example usage
tree = etree.HTML("<html><body><h1>Title</h1></body></html>")
title = extract_text_safely(tree, './/h1')
link_href = extract_attribute_safely(tree, './/a', 'href')
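
Note that xpath() returns a list rather than None, so indexing the result of a non-matching expression raises IndexError. A small helper along these lines keeps that safe (first_match is just an illustrative name, not an lxml API):

def first_match(tree, xpath_expression, default=None):
    """Return the first xpath() result, or a default when nothing matches."""
    results = tree.xpath(xpath_expression)
    return results[0] if results else default

# Example usage
first_link = first_match(tree, './/a/@href', default="")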

Network-Related Error Handling

Complete HTTP Request Handling

import requests
from lxml import etree
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import time

def create_session_with_retries():
    """Create session with retry strategy"""
    session = requests.Session()
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session

def scrape_url_safely(url):
    """Comprehensive error handling for web scraping"""
    session = create_session_with_retries()

    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()

        # Parse HTML
        tree = etree.HTML(response.content)
        return tree

    except requests.exceptions.Timeout:
        print(f"Timeout error for {url}")
    except requests.exceptions.HTTPError as e:
        print(f"HTTP error {e.response.status_code} for {url}")
    except requests.exceptions.ConnectionError:
        print(f"Connection error for {url}")
    except requests.exceptions.RequestException as e:
        print(f"Request error for {url}: {e}")
    except etree.XMLSyntaxError as e:
        print(f"Parse error for {url}: {e}")

    return None
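
Usage stays simple because every failure path above results in None:

tree = scrape_url_safely("https://httpbin.org/html")
if tree is not None:
    headings = tree.xpath('//h1/text()')
    print(headings)
else:
    print("Scrape failed")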

Encoding and Text Processing

Handling Encoding Issues

def parse_with_encoding_fallback(content):
    """Try different encodings if parsing fails"""
    # latin-1/iso-8859-1 accept any byte value, so keep them last as a catch-all
    encodings = ['utf-8', 'cp1252', 'latin-1', 'iso-8859-1']

    for encoding in encodings:
        try:
            if isinstance(content, bytes):
                # Decode in Python so a wrong guess raises UnicodeDecodeError
                decoded_content = content.decode(encoding)
            else:
                decoded_content = content

            tree = etree.HTML(decoded_content)
            if tree is not None:
                return tree

        except (UnicodeDecodeError, etree.ParserError, etree.XMLSyntaxError, ValueError):
            continue

    print("Failed to parse content with any encoding")
    return None
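
For example, bytes that are not valid UTF-8 (here, cp1252-encoded text) fall through to the next encoding in the list:

raw = "Café <b>menu</b>".encode("cp1252")
tree = parse_with_encoding_fallback(raw)
if tree is not None:
    print(tree.xpath('string(//body)'))  # "Café menu"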

Safe Text Extraction

def extract_clean_text(element):
    """Extract and clean text from element"""
    if element is None:
        return ""

    try:
        # Get text content, handling None values
        text = element.text_content() if hasattr(element, 'text_content') else (element.text or "")
        # Clean whitespace
        return ' '.join(text.split()) if text else ""
    except Exception as e:
        print(f"Error extracting text: {e}")
        return ""

Comprehensive Web Scraper with Error Handling

import requests
from lxml import etree
import logging
import time
from urllib.parse import urljoin

class RobustScraper:
    def __init__(self, delay=1):
        self.session = self.create_session()
        self.delay = delay
        self.setup_logging()

    def setup_logging(self):
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def create_session(self):
        session = requests.Session()
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        return session

    def scrape_page(self, url):
        """Scrape a single page with comprehensive error handling"""
        try:
            # Rate limiting
            time.sleep(self.delay)

            response = self.session.get(url, timeout=10)
            response.raise_for_status()

            tree = etree.HTML(response.content)
            if tree is None:
                self.logger.error(f"Failed to parse HTML for {url}")
                return None

            return self.extract_data(tree, url)

        except requests.exceptions.RequestException as e:
            self.logger.error(f"Request failed for {url}: {e}")
        except etree.XMLSyntaxError as e:
            self.logger.error(f"Parse error for {url}: {e}")
        except Exception as e:
            self.logger.error(f"Unexpected error for {url}: {e}")

        return None

    def extract_data(self, tree, base_url):
        """Extract data with safe methods"""
        data = {}

        # Safe title extraction
        title_elem = tree.find('.//title')
        data['title'] = title_elem.text.strip() if title_elem is not None and title_elem.text else "No title"

        # Safe link extraction with URL joining
        links = []
        for link in tree.xpath('.//a[@href]'):
            href = link.get('href')
            if href:
                absolute_url = urljoin(base_url, href)
                links.append({
                    'text': extract_clean_text(link),
                    'url': absolute_url
                })
        data['links'] = links

        return data

# Usage example
scraper = RobustScraper(delay=1)
urls = ['https://example.com', 'https://httpbin.org/html']

for url in urls:
    result = scraper.scrape_page(url)
    if result:
        print(f"Successfully scraped {url}")
    else:
        print(f"Failed to scrape {url}")

Best Practices Summary

  1. Always check for None: Elements and text can be None
  2. Use try-except blocks: Wrap parsing and XPath operations
  3. Implement retries: Handle temporary network failures
  4. Set timeouts: Prevent hanging requests
  5. Handle encoding: Try multiple encodings if needed
  6. Log errors: Track failures for debugging
  7. Rate-limit requests: Respect server resources
  8. Degrade gracefully: Continue processing even if some pages or elements fail (see the sketch below)
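
As a sketch of the last point, a wrapper like the one below (scrape_many is an illustrative helper built on the RobustScraper class above, not part of lxml) logs and skips per-URL failures instead of aborting the whole run:

def scrape_many(scraper, urls):
    """Collect results, skipping URLs that fail for any reason."""
    results = {}
    for url in urls:
        try:
            data = scraper.scrape_page(url)
        except Exception as e:
            scraper.logger.error(f"Skipping {url}: {e}")
            data = None
        if data is not None:
            results[url] = data
    return results

all_data = scrape_many(scraper, urls)
print(f"Scraped {len(all_data)} of {len(urls)} pages")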

By implementing these error handling patterns, your lxml-based web scrapers will be more reliable and maintainable in production environments.
