Overview
XML validation using lxml
allows you to verify that XML documents conform to a specified schema (XSD). The lxml.etree
module provides comprehensive tools for XML Schema validation with detailed error reporting.
Installation
pip install lxml
Basic Validation Process
1. Load XML Schema (XSD)
from lxml import etree
# Load schema from file. Binary mode leaves encoding detection to lxml
# instead of Python's text layer.
with open('schema.xsd', 'rb') as schema_file:
    xmlschema_doc = etree.parse(schema_file)
# The file is only needed while parsing; build the schema object afterwards.
xmlschema = etree.XMLSchema(xmlschema_doc)

# Alternative: Load schema from string
# (this rebinds xmlschema_doc / xmlschema, so use one approach or the other)
schema_string = """
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
<xs:element name="book">
<xs:complexType>
<xs:sequence>
<xs:element name="title" type="xs:string"/>
<xs:element name="author" type="xs:string"/>
<xs:element name="year" type="xs:int"/>
</xs:sequence>
</xs:complexType>
</xs:element>
</xs:schema>
"""
xmlschema_doc = etree.fromstring(schema_string)
xmlschema = etree.XMLSchema(xmlschema_doc)
2. Parse XML Document
# From file: parse a document stored on disk
xml_document = etree.parse('document.xml')
# From string: parse XML held in memory.
# NOTE: this rebinds xml_document, replacing the file-based result above;
# in real code pick whichever source applies.
xml_string = """
<book>
<title>Python Programming</title>
<author>John Doe</author>
<year>2023</year>
</book>
"""
xml_document = etree.fromstring(xml_string)
3. Validate Document
# Perform validation; the result is used as a plain boolean
is_valid = xmlschema.validate(xml_document)
if is_valid:
    print("✓ XML document is valid")
else:
    print("✗ XML document is invalid")
    # Print validation errors collected by the schema during validate()
    for error in xmlschema.error_log:
        print(f"Error: {error}")
Complete Example with Error Handling
from lxml import etree
def validate_xml_with_schema(xml_file, schema_file):
    """
    Validate XML document against XSD schema

    Failures while loading either file are reported through the returned
    error list instead of being raised.

    Args:
        xml_file (str): Path to XML document
        schema_file (str): Path to XSD schema

    Returns:
        tuple: (is_valid, errors) where ``errors`` is a list of strings;
        it holds the schema's error log entries after validation, or a
        single message describing why loading/parsing failed.
    """
    try:
        # Load and parse schema
        with open(schema_file, 'rb') as xsd_file:
            schema_doc = etree.parse(xsd_file)
        schema = etree.XMLSchema(schema_doc)

        # Load and parse XML document. The handle gets its own name so the
        # xml_file path argument is not shadowed by the open file object.
        with open(xml_file, 'rb') as xml_handle:
            xml_doc = etree.parse(xml_handle)

        # Validate
        is_valid = schema.validate(xml_doc)
        errors = [str(error) for error in schema.error_log]
        return is_valid, errors
    except etree.XMLSchemaParseError as e:
        return False, [f"Schema parse error: {e}"]
    except etree.XMLSyntaxError as e:
        return False, [f"XML syntax error: {e}"]
    except FileNotFoundError as e:
        return False, [f"File not found: {e}"]
    except Exception as e:
        # Deliberate catch-all: this function reports every failure via the
        # returned tuple rather than propagating exceptions to the caller.
        return False, [f"Unexpected error: {e}"]
# Usage: unpack the (is_valid, errors) tuple and report each error
is_valid, errors = validate_xml_with_schema('document.xml', 'schema.xsd')
if is_valid:
    print("Document is valid!")
else:
    print("Validation failed:")
    for error in errors:
        print(f" - {error}")
Working with Different Schema Types
DTD Validation
from lxml import etree
# Load DTD. Opened in binary mode, like the other schema files in this
# guide, so that encoding handling is left to lxml.
with open('document.dtd', 'rb') as dtd_file:
    dtd = etree.DTD(dtd_file)
# Parse XML
xml_doc = etree.parse('document.xml')
# Validate against DTD
if dtd.validate(xml_doc):
    print("Valid according to DTD")
else:
    print("DTD validation errors:")
    print(dtd.error_log)
RelaxNG Validation
from lxml import etree
# Load RelaxNG schema
with open('schema.rng', 'rb') as rng_file:
    rng_doc = etree.parse(rng_file)
# The file is only needed while parsing; build the validator afterwards.
rng = etree.RelaxNG(rng_doc)
# Validate
xml_doc = etree.parse('document.xml')
is_valid = rng.validate(xml_doc)
Advanced Validation Techniques
Custom Error Handling
from lxml import etree
def detailed_validation_report(xml_doc, schema):
    """Generate detailed validation report

    Args:
        xml_doc: Parsed document passed to ``schema.validate``.
        schema: Validator object exposing ``validate(doc)`` and, after a
            failed validation, an iterable ``error_log``.

    Returns:
        dict: ``{"valid": True, "errors": []}`` on success; otherwise
        ``{"valid": False, "errors": [...]}`` with one dict per log entry
        holding its ``line``, ``column``, ``message``, ``level`` and
        ``type``.
    """
    if schema.validate(xml_doc):
        return {"valid": True, "errors": []}

    # Flatten each log entry into a plain dict so callers do not need to
    # know the attribute names of the underlying log entry objects.
    errors = [
        {
            "line": error.line,
            "column": error.column,
            "message": error.message,
            "level": error.level_name,
            "type": error.type_name,
        }
        for error in schema.error_log
    ]
    return {"valid": False, "errors": errors}
# Usage (xmlschema is the XMLSchema object built in the schema-loading step)
xml_doc = etree.parse('document.xml')
report = detailed_validation_report(xml_doc, xmlschema)
if not report["valid"]:
    for error in report["errors"]:
        print(f"Line {error['line']}: {error['message']}")
Validating Multiple Documents
import os
from lxml import etree
def validate_directory(xml_dir, schema_file):
    """Validate all XML files in a directory

    Only entries directly inside ``xml_dir`` whose names end in ``.xml``
    are checked; subdirectories are not traversed.

    Args:
        xml_dir (str): Directory containing the XML documents.
        schema_file (str): Path to the XSD schema.

    Returns:
        dict: Maps each checked filename to
        ``{"valid": bool, "errors": list[str]}``. A file that cannot be
        parsed is recorded as invalid with the exception text as its
        only error.
    """
    # Load schema once and reuse it for every document
    with open(schema_file, 'rb') as f:
        schema_doc = etree.parse(f)
    schema = etree.XMLSchema(schema_doc)

    results = {}
    for filename in os.listdir(xml_dir):
        if not filename.endswith('.xml'):
            continue
        filepath = os.path.join(xml_dir, filename)
        try:
            xml_doc = etree.parse(filepath)
            is_valid = schema.validate(xml_doc)
            results[filename] = {
                "valid": is_valid,
                "errors": [str(e) for e in schema.error_log],
            }
        except Exception as e:
            # Best-effort batch run: one unreadable or malformed file is
            # recorded as a failure and must not abort the remaining files.
            results[filename] = {
                "valid": False,
                "errors": [str(e)],
            }
    return results
# Usage: print a one-line pass/fail summary per file
results = validate_directory('./xml_files', 'schema.xsd')
for filename, result in results.items():
    status = "✓" if result["valid"] else "✗"
    print(f"{status} {filename}")
Best Practices
- Reuse Schema Objects: Load schemas once and reuse them for multiple validations
- Handle Encoding: Always open files in binary mode ('rb') to let lxml handle encoding
- Error Logging: Always check error_log for detailed validation errors
- Exception Handling: Catch XMLSchemaParseError and XMLSyntaxError specifically
- Performance: For large-scale validation, consider using XMLParser with schema validation enabled
Common Issues and Solutions
- Encoding Problems: Use binary mode when opening files
- Namespace Issues: Ensure XML namespaces match schema definitions
- Memory Usage: For large documents, consider streaming validation
- Schema Loading: Verify schema file paths and accessibility
This comprehensive approach to XML validation with lxml ensures robust document verification with proper error handling and reporting.