Table of contents

How do I use lxml to validate XML documents against a schema?

Overview

XML validation using lxml allows you to verify that XML documents conform to a specified schema (XSD). The lxml.etree module provides comprehensive tools for XML Schema validation with detailed error reporting.

Installation

pip install lxml

Basic Validation Process

1. Load XML Schema (XSD)

from lxml import etree

# Option 1: build the schema from an XSD file on disk
with open('schema.xsd', 'rb') as schema_file:
    xmlschema_doc = etree.parse(schema_file)
# The parsed tree is complete at this point, so the schema can be compiled
# after the file handle has been released.
xmlschema = etree.XMLSchema(xmlschema_doc)

# Option 2: build the schema from an in-memory XSD string
schema_string = """
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
    <xs:element name="book">
        <xs:complexType>
            <xs:sequence>
                <xs:element name="title" type="xs:string"/>
                <xs:element name="author" type="xs:string"/>
                <xs:element name="year" type="xs:int"/>
            </xs:sequence>
        </xs:complexType>
    </xs:element>
</xs:schema>
"""
xmlschema_doc = etree.fromstring(schema_string)
xmlschema = etree.XMLSchema(xmlschema_doc)

2. Parse XML Document

# From file
# (the path is handed directly to etree.parse; no explicit open() is used here)
xml_document = etree.parse('document.xml')

# From string
# NOTE: this rebinds xml_document, so when both variants are run in sequence
# only the string-based document is the one validated in the next step.
xml_string = """
<book>
    <title>Python Programming</title>
    <author>John Doe</author>
    <year>2023</year>
</book>
"""
xml_document = etree.fromstring(xml_string)

3. Validate Document

# Run the document through the compiled schema
is_valid = xmlschema.validate(xml_document)

if not is_valid:
    print("✗ XML document is invalid")
    # The schema object keeps the problems found by the last validate() call
    for error in xmlschema.error_log:
        print(f"Error: {error}")
else:
    print("✓ XML document is valid")

Complete Example with Error Handling

from lxml import etree

def validate_xml_with_schema(xml_file, schema_file):
    """
    Validate an XML document against an XSD schema.

    Both files are opened in binary mode so that lxml determines the
    encoding itself. No exception escapes this function: every failure is
    reported through the returned error list.

    Args:
        xml_file (str): Path to XML document
        schema_file (str): Path to XSD schema

    Returns:
        tuple: (is_valid, errors) where ``is_valid`` is the result of the
        schema check (False on any failure) and ``errors`` is a list of
        message strings (empty when the document is valid).
    """
    try:
        # Load and parse schema
        with open(schema_file, 'rb') as xsd_handle:
            schema_doc = etree.parse(xsd_handle)
            schema = etree.XMLSchema(schema_doc)

        # Load and parse XML document.
        # The handle gets its own name so the ``xml_file`` path argument is
        # not overwritten by the open file object.
        with open(xml_file, 'rb') as xml_handle:
            xml_doc = etree.parse(xml_handle)

        # Validate
        is_valid = schema.validate(xml_doc)
        errors = [str(error) for error in schema.error_log]

        return is_valid, errors

    except etree.XMLSchemaParseError as e:
        return False, [f"Schema parse error: {e}"]
    except etree.XMLSyntaxError as e:
        return False, [f"XML syntax error: {e}"]
    except FileNotFoundError as e:
        return False, [f"File not found: {e}"]
    except Exception as e:
        # Deliberate catch-all: callers rely on always getting a tuple back.
        return False, [f"Unexpected error: {e}"]

# Usage
is_valid, errors = validate_xml_with_schema('document.xml', 'schema.xsd')

if not is_valid:
    print("Validation failed:")
    # One bullet per collected message
    for error in errors:
        print(f"  - {error}")
else:
    print("Document is valid!")

Working with Different Schema Types

DTD Validation

from lxml import etree

# Load DTD
# Binary mode keeps this consistent with the other examples and leaves
# decoding to lxml instead of the platform's default text encoding.
with open('document.dtd', 'rb') as dtd_file:
    dtd = etree.DTD(dtd_file)

# Parse XML
xml_doc = etree.parse('document.xml')

# Validate against DTD
if dtd.validate(xml_doc):
    print("Valid according to DTD")
else:
    print("DTD validation errors:")
    print(dtd.error_log)

RelaxNG Validation

from lxml import etree

# Read the RelaxNG schema source, then compile it
with open('schema.rng', 'rb') as rng_file:
    rng_doc = etree.parse(rng_file)
rng = etree.RelaxNG(rng_doc)

# Check the document against the compiled schema
xml_doc = etree.parse('document.xml')
is_valid = rng.validate(xml_doc)

Advanced Validation Techniques

Custom Error Handling

from lxml import etree

def detailed_validation_report(xml_doc, schema):
    """Validate xml_doc with schema and return a structured report.

    Returns a dict with a "valid" flag and an "errors" list. When validation
    fails, each entry of schema.error_log is turned into a dict holding its
    line, column, message, level name and type name.
    """
    if schema.validate(xml_doc):
        return {"valid": True, "errors": []}

    issues = [
        {
            "line": entry.line,
            "column": entry.column,
            "message": entry.message,
            "level": entry.level_name,
            "type": entry.type_name,
        }
        for entry in schema.error_log
    ]
    return {"valid": False, "errors": issues}

# Usage
xml_doc = etree.parse('document.xml')
report = detailed_validation_report(xml_doc, xmlschema)

if not report["valid"]:
    # Show where each problem is and what it says
    for error in report["errors"]:
        line_no, text = error['line'], error['message']
        print(f"Line {line_no}: {text}")

Validating Multiple Documents

import os
from lxml import etree

def validate_directory(xml_dir, schema_file):
    """Validate all XML files in a directory against one XSD schema.

    Args:
        xml_dir (str): Directory whose entries ending in '.xml' are checked
        schema_file (str): Path to XSD schema

    Returns:
        dict: Maps each filename to {"valid": bool, "errors": [str, ...]}.
        Entries are inserted in sorted filename order. A file that cannot
        be parsed is reported as invalid with the exception text as its
        only error.
    """
    # Load schema once and reuse it for every document
    with open(schema_file, 'rb') as f:
        schema_doc = etree.parse(f)
        schema = etree.XMLSchema(schema_doc)

    results = {}

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # report order reproducible between runs and platforms.
    for filename in sorted(os.listdir(xml_dir)):
        if not filename.endswith('.xml'):
            continue

        filepath = os.path.join(xml_dir, filename)
        try:
            xml_doc = etree.parse(filepath)
            is_valid = schema.validate(xml_doc)
            results[filename] = {
                "valid": is_valid,
                "errors": [str(e) for e in schema.error_log]
            }
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run
            results[filename] = {
                "valid": False,
                "errors": [str(e)]
            }

    return results

# Usage
results = validate_directory('./xml_files', 'schema.xsd')
for filename, result in results.items():
    # Pick the marker for this file's outcome
    if result["valid"]:
        status = "✓"
    else:
        status = "✗"
    print(f"{status} {filename}")

Best Practices

  1. Reuse Schema Objects: Load schemas once and reuse them for multiple validations
  2. Handle Encoding: Always open files in binary mode ('rb') to let lxml handle encoding
  3. Error Logging: Always check error_log for detailed validation errors
  4. Exception Handling: Catch XMLSchemaParseError and XMLSyntaxError specifically
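  5. Performance: For large-scale validation, consider validating at parse time by passing the schema to the parser, e.g. etree.XMLParser(schema=xmlschema); an invalid document then raises XMLSyntaxError during parsing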

Common Issues and Solutions

  • Encoding Problems: Use binary mode when opening files
  • Namespace Issues: Ensure XML namespaces match schema definitions
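  • Memory Usage: For large documents, consider streaming validation, e.g. etree.iterparse(source, schema=xmlschema), which validates while the document is read; clear elements you have finished processing to keep memory low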
  • Schema Loading: Verify schema file paths and accessibility

This comprehensive approach to XML validation with lxml ensures robust document verification with proper error handling and reporting.

Try WebScraping.AI for Your Web Scraping Needs

Looking for a powerful web scraping solution? WebScraping.AI provides an LLM-powered API that combines Chromium JavaScript rendering with rotating proxies for reliable data extraction.

Key Features:

  • AI-powered extraction: Ask questions about web pages or extract structured data fields
  • JavaScript rendering: Full Chromium browser support for dynamic content
  • Rotating proxies: Datacenter and residential proxies from multiple countries
  • Easy integration: Simple REST API with SDKs for Python, Ruby, PHP, and more
  • Reliable & scalable: Built for developers who need consistent results

Getting Started:

Get page content with AI analysis:

curl "https://api.webscraping.ai/ai/question?url=https://example.com&question=What%20is%20the%20main%20topic%3F&api_key=YOUR_API_KEY"

Extract structured data:

curl -g "https://api.webscraping.ai/ai/fields?url=https://example.com&fields[title]=Page%20title&fields[price]=Product%20price&api_key=YOUR_API_KEY"

Try in request builder

Related Questions

Get Started Now

WebScraping.AI provides rotating proxies, Chromium rendering and built-in HTML parser for web scraping
Icon