How do I use lxml to parse and manipulate RSS or Atom feeds?
lxml is a powerful Python library that excels at parsing and manipulating XML documents, making it an excellent choice for working with RSS and Atom feeds. This guide will show you how to effectively parse, extract data from, and manipulate RSS/Atom feeds using lxml.
Understanding RSS and Atom Feed Structure
Before diving into code, it's important to understand the basic structure of RSS and Atom feeds:
RSS 2.0 Structure:
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>Feed Title</title>
<description>Feed Description</description>
<link>https://example.com</link>
<item>
<title>Article Title</title>
<description>Article content</description>
<link>https://example.com/article</link>
<pubDate>Mon, 01 Jan 2024 00:00:00 GMT</pubDate>
</item>
</channel>
</rss>
Atom 1.0 Structure:
<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Feed Title</title>
<link href="https://example.com"/>
<updated>2024-01-01T00:00:00Z</updated>
<entry>
<title>Article Title</title>
<link href="https://example.com/article"/>
<updated>2024-01-01T00:00:00Z</updated>
<summary>Article summary</summary>
</entry>
</feed>
Installing and Setting Up lxml
First, install lxml using pip:
pip install lxml
For systems that require compilation, you might need additional dependencies:
# Ubuntu/Debian
sudo apt-get install libxml2-dev libxslt-dev python3-dev
# macOS with Homebrew
brew install libxml2 libxslt
# Then install lxml
pip install lxml
Basic RSS Feed Parsing
Here's a comprehensive example of parsing an RSS feed:
import requests
from lxml import etree
from datetime import datetime
import re
def parse_rss_feed(url):
    """Fetch and parse an RSS 2.0 feed.

    Args:
        url: URL of the RSS feed to download.

    Returns:
        dict with keys 'feed' (channel metadata) and 'items' (list of
        article dicts), or None if fetching or parsing fails.
    """
    try:
        # Fetch the RSS feed
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        # Parse XML with lxml. Pass raw bytes so lxml can honor the
        # encoding declared in the XML prolog.
        root = etree.fromstring(response.content)

        # Guard against non-RSS XML: previously a missing <channel>
        # caused an uncaught AttributeError below.
        channel = root.find('.//channel')
        if channel is None:
            print("Error parsing XML: document has no <channel> element")
            return None

        # Extract channel information
        feed_info = {
            'title': get_text(channel, 'title'),
            'description': get_text(channel, 'description'),
            'link': get_text(channel, 'link'),
            'language': get_text(channel, 'language'),
            'lastBuildDate': get_text(channel, 'lastBuildDate')
        }

        # Extract items
        items = []
        for item in root.xpath('//item'):
            article = {
                'title': get_text(item, 'title'),
                'description': get_text(item, 'description'),
                'link': get_text(item, 'link'),
                'pubDate': get_text(item, 'pubDate'),
                'author': get_text(item, 'author'),
                'category': get_text(item, 'category'),
                'guid': get_text(item, 'guid')
            }
            # Clean HTML from description
            if article['description']:
                article['description'] = clean_html(article['description'])
            items.append(article)

        return {
            'feed': feed_info,
            'items': items
        }
    except requests.RequestException as e:
        print(f"Error fetching RSS feed: {e}")
        return None
    except etree.XMLSyntaxError as e:
        print(f"Error parsing XML: {e}")
        return None
def get_text(element, tag):
    """Return the stripped text of *element*'s child *tag*, or None.

    None is returned when the child is absent or has empty/missing text.
    """
    node = element.find(tag)
    if node is None or not node.text:
        return None
    return node.text.strip()
def clean_html(text):
    """Strip anything that looks like an HTML/XML tag from *text*.

    Crude regex-based stripping; adequate for short feed descriptions.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', text).strip()
# Example usage: fetch a live feed and print the first few headlines.
# (Performs a real network request when run.)
rss_data = parse_rss_feed('https://feeds.feedburner.com/TechCrunch')
if rss_data:
    print(f"Feed: {rss_data['feed']['title']}")
    for item in rss_data['items'][:5]:  # Show first 5 items
        print(f"- {item['title']}")
Advanced RSS Parsing with Namespaces
Many RSS feeds use namespaces for extended functionality. Here's how to handle them:
from lxml import etree
def parse_rss_with_namespaces(url):
    """Parse an RSS feed, including common namespaced extension elements.

    Args:
        url: URL of the RSS feed.

    Returns:
        list of article dicts with core RSS fields plus content:encoded,
        Dublin Core creator/subject, media thumbnail URL and enclosures.

    Raises:
        requests.HTTPError: if the server responds with an error status.
    """
    # Prefix -> URI map for the extension modules commonly seen in feeds.
    namespaces = {
        'content': 'http://purl.org/rss/1.0/modules/content/',
        'dc': 'http://purl.org/dc/elements/1.1/',
        'media': 'http://search.yahoo.com/mrss/',
        'atom': 'http://www.w3.org/2005/Atom'
    }
    response = requests.get(url, timeout=10)
    # Fail fast on HTTP errors -- previously missing, unlike parse_rss_feed,
    # so a 404 page would have been fed straight into the XML parser.
    response.raise_for_status()
    root = etree.fromstring(response.content)

    items = []
    for item in root.xpath('//item'):
        article = {
            'title': get_text(item, 'title'),
            'link': get_text(item, 'link'),
            'pubDate': get_text(item, 'pubDate'),
            'description': get_text(item, 'description'),
            # Extended content using namespaces
            'content_encoded': get_namespaced_text(item, 'content:encoded', namespaces),
            'dc_creator': get_namespaced_text(item, 'dc:creator', namespaces),
            'dc_subject': get_namespaced_text(item, 'dc:subject', namespaces),
            # Media elements
            'media_thumbnail': get_media_thumbnail(item, namespaces),
            'enclosures': get_enclosures(item)
        }
        items.append(article)
    return items
def get_namespaced_text(element, xpath, namespaces):
    """Evaluate a namespaced XPath and return the first match's stripped text.

    Returns None when nothing matches or the matched node has no text.
    """
    matches = element.xpath(xpath, namespaces=namespaces)
    if not matches:
        return None
    text = matches[0].text
    return text.strip() if text else None
def get_media_thumbnail(element, namespaces):
    """Return the first media:thumbnail URL beneath *element*, or None."""
    for url in element.xpath('.//media:thumbnail/@url', namespaces=namespaces):
        return url
    return None
def get_enclosures(element):
    """Collect the url/type/length attributes of every <enclosure> child.

    Used for podcast audio, video attachments, etc. Missing attributes
    come back as None.
    """
    attr_names = ('url', 'type', 'length')
    return [
        {name: enclosure.get(name) for name in attr_names}
        for enclosure in element.xpath('.//enclosure')
    ]
Parsing Atom Feeds
Atom feeds have a different structure and always use namespaces:
def parse_atom_feed(url):
    """Fetch and parse an Atom 1.0 feed.

    Args:
        url: URL of the Atom feed.

    Returns:
        dict with keys 'feed' (feed-level metadata) and 'entries'
        (list of entry dicts).

    Raises:
        requests.HTTPError: if the server responds with an error status.
    """
    # Atom elements always live in this namespace.
    atom_ns = {'atom': 'http://www.w3.org/2005/Atom'}
    response = requests.get(url, timeout=10)
    # Fail fast on HTTP errors -- previously missing, so an error page
    # would have been handed to the XML parser.
    response.raise_for_status()
    root = etree.fromstring(response.content)

    # Extract feed information
    feed_info = {
        'title': get_atom_text(root, './/atom:title', atom_ns),
        'link': get_atom_link(root, atom_ns),
        'updated': get_atom_text(root, './/atom:updated', atom_ns),
        'subtitle': get_atom_text(root, './/atom:subtitle', atom_ns),
        'id': get_atom_text(root, './/atom:id', atom_ns)
    }

    # Extract entries
    entries = []
    for entry in root.xpath('.//atom:entry', namespaces=atom_ns):
        article = {
            'title': get_atom_text(entry, './/atom:title', atom_ns),
            'link': get_atom_entry_link(entry, atom_ns),
            'updated': get_atom_text(entry, './/atom:updated', atom_ns),
            'published': get_atom_text(entry, './/atom:published', atom_ns),
            'summary': get_atom_text(entry, './/atom:summary', atom_ns),
            'content': get_atom_content(entry, atom_ns),
            'author': get_atom_author(entry, atom_ns),
            'id': get_atom_text(entry, './/atom:id', atom_ns)
        }
        entries.append(article)

    return {
        'feed': feed_info,
        'entries': entries
    }
def get_atom_text(element, xpath, namespaces):
    """Return the stripped text of the first node matching *xpath*, or None."""
    nodes = element.xpath(xpath, namespaces=namespaces)
    if nodes and nodes[0].text:
        return nodes[0].text.strip()
    return None
def get_atom_link(element, namespaces):
    """Return the feed/entry link, preferring rel="alternate" over any link."""
    for query in ('.//atom:link[@rel="alternate"]/@href', './/atom:link/@href'):
        hrefs = element.xpath(query, namespaces=namespaces)
        if hrefs:
            return hrefs[0]
    return None
def get_atom_entry_link(entry, namespaces):
    """Return an entry's link, trying rel="alternate" before any link."""
    preferred = entry.xpath('.//atom:link[@rel="alternate"]/@href', namespaces=namespaces)
    if preferred:
        return preferred[0]
    # No alternate link -- settle for whatever link exists.
    fallback = entry.xpath('.//atom:link/@href', namespaces=namespaces)
    if fallback:
        return fallback[0]
    return None
def get_atom_content(entry, namespaces):
    """Return the stripped text of the first <atom:content>, or None."""
    nodes = entry.xpath('.//atom:content', namespaces=namespaces)
    if not nodes:
        return None
    text = nodes[0].text
    return text.strip() if text else None
def get_atom_author(entry, namespaces):
    """Format the entry author as 'Name' or 'Name (email)'.

    Returns None when no author name is present.
    """
    names = entry.xpath('.//atom:author/atom:name/text()', namespaces=namespaces)
    if not names:
        return None
    emails = entry.xpath('.//atom:author/atom:email/text()', namespaces=namespaces)
    if emails:
        return f"{names[0]} ({emails[0]})"
    return names[0]
Feed Manipulation and Creation
You can also create and modify feeds using lxml:
from lxml import etree
from datetime import datetime, timezone
def create_rss_feed(title, description, link, items):
    """Build an RSS 2.0 document and return it as pretty-printed UTF-8 bytes.

    Args:
        title, description, link: channel-level metadata.
        items: iterable of dicts with optional keys
            title/description/link/pubDate/guid.
    """
    rss = etree.Element("rss", version="2.0")
    channel = etree.SubElement(rss, "channel")

    # Channel-level metadata; lastBuildDate in RFC 822 style with UTC offset.
    build_stamp = datetime.now(timezone.utc).strftime('%a, %d %b %Y %H:%M:%S %z')
    for tag, value in (("title", title),
                       ("description", description),
                       ("link", link),
                       ("lastBuildDate", build_stamp)):
        etree.SubElement(channel, tag).text = value

    # One <item> per input dict; missing fields become empty strings.
    for item_data in items:
        item = etree.SubElement(channel, "item")
        for tag in ("title", "description", "link", "pubDate"):
            etree.SubElement(item, tag).text = item_data.get(tag, '')
        if item_data.get('guid'):
            etree.SubElement(item, "guid").text = item_data['guid']

    return etree.tostring(rss, pretty_print=True, xml_declaration=True, encoding='UTF-8')
def modify_feed_items(feed_content, filter_func=None, transform_func=None):
    """Filter and/or transform the <item> elements of an RSS document.

    Args:
        feed_content: RSS XML as bytes/str.
        filter_func: optional predicate; items for which it returns a
            falsy value are removed from the tree.
        transform_func: optional in-place mutator applied to kept items.

    Returns:
        The modified document as pretty-printed UTF-8 bytes.
    """
    root = etree.fromstring(feed_content)

    # Iterate over a snapshot so removals don't disturb the traversal.
    for item in list(root.xpath('//item')):
        if filter_func is not None and not filter_func(item):
            item.getparent().remove(item)
        elif transform_func is not None:
            transform_func(item)

    return etree.tostring(root, pretty_print=True, xml_declaration=True, encoding='UTF-8')
# Example: Filter items by date and modify titles
def recent_items_only(item):
    """Predicate: keep items whose pubDate mentions 2024.

    Items with no pubDate pass through. A substring check is crude;
    swap in real date parsing for production use.
    """
    node = item.find('pubDate')
    if node is None or not node.text:
        return True
    return '2024' in node.text
def add_prefix_to_title(item):
    """Mutate *item* in place, prepending a [MODIFIED] marker to its title."""
    node = item.find('title')
    if node is not None and node.text:
        node.text = "[MODIFIED] " + node.text
Error Handling and Validation
Robust feed parsing requires proper error handling:
def safe_feed_parser(url, feed_type='auto'):
    """Safely parse feed with comprehensive error handling.

    Fetches *url*, auto-detects RSS vs Atom from the root tag (unless
    *feed_type* forces one), and dispatches to a format-specific parser.
    Returns the parser's result, or None on any handled error.

    NOTE(review): parse_rss_feed_from_root / parse_atom_feed_from_root are
    not defined anywhere in this document -- they must be supplied by the
    caller's module for this function to work.
    """
    try:
        # Identify ourselves; some servers reject requests with no UA.
        response = requests.get(url, timeout=15, headers={
            'User-Agent': 'Mozilla/5.0 (compatible; FeedParser/1.0)'
        })
        response.raise_for_status()
        # Validate content type (warn only -- many feeds mislabel it).
        content_type = response.headers.get('content-type', '').lower()
        if 'xml' not in content_type and 'rss' not in content_type and 'atom' not in content_type:
            print(f"Warning: Unexpected content type: {content_type}")
        # Parse with recovery: tolerate minor XML errors, keep CDATA sections.
        parser = etree.XMLParser(recover=True, strip_cdata=False)
        root = etree.fromstring(response.content, parser)
        # Detect feed type from the root tag. Atom roots are usually
        # namespace-qualified ('{http://www.w3.org/2005/Atom}feed'),
        # hence the endswith check.
        if feed_type == 'auto':
            if root.tag == 'rss':
                feed_type = 'rss'
            elif root.tag.endswith('feed'):
                feed_type = 'atom'
            else:
                raise ValueError(f"Unknown feed format: {root.tag}")
        # Parse based on type
        if feed_type == 'rss':
            return parse_rss_feed_from_root(root)
        elif feed_type == 'atom':
            return parse_atom_feed_from_root(root)
        else:
            raise ValueError(f"Unsupported feed type: {feed_type}")
    except requests.exceptions.Timeout:
        print("Error: Feed request timed out")
    except requests.exceptions.ConnectionError:
        print("Error: Could not connect to feed URL")
    except etree.XMLSyntaxError as e:
        print(f"Error: Invalid XML syntax: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")
    # Every handled error path falls through to this None return.
    return None
def validate_feed_structure(root):
    """Validate basic feed structure.

    For RSS roots, require a <channel> element and warn about missing
    required children. For Atom roots, warn about missing required
    elements.

    Args:
        root: parsed XML root element.

    Raises:
        ValueError: if an RSS document has no <channel> element.
    """
    if root.tag == 'rss':
        channel = root.find('.//channel')
        if channel is None:
            raise ValueError("RSS feed missing channel element")
        required_elements = ['title', 'description', 'link']
        for element in required_elements:
            if channel.find(element) is None:
                print(f"Warning: Missing required element: {element}")
    elif root.tag.endswith('feed'):
        # Derive the '{uri}' prefix from the root tag itself. The previous
        # root.nsmap[None] lookup raised KeyError whenever the Atom
        # namespace was bound to a prefix instead of being the default
        # namespace (and nsmap is lxml-only besides).
        if root.tag.startswith('{'):
            ns_prefix = root.tag[:root.tag.index('}') + 1]
        else:
            ns_prefix = ''
        required_elements = ['title', 'id', 'updated']
        for element in required_elements:
            if root.find(f'.//{ns_prefix}{element}') is None:
                print(f"Warning: Missing required Atom element: {element}")
Performance Optimization
For processing large feeds or multiple feeds, consider these optimizations:
import asyncio
import aiohttp
from concurrent.futures import ThreadPoolExecutor
async def fetch_multiple_feeds(urls, max_concurrent=5):
    """Fetch multiple feeds concurrently.

    Returns a list of (url, content_bytes) tuples, in the same order as
    *urls*; content is None for any URL that failed to download.
    """
    # Bound the number of in-flight requests.
    semaphore = asyncio.Semaphore(max_concurrent)

    async def fetch_single_feed(session, url):
        # Acquire a semaphore slot before hitting the network.
        async with semaphore:
            try:
                # NOTE(review): recent aiohttp versions expect
                # timeout=aiohttp.ClientTimeout(total=10) rather than a bare
                # number -- confirm against the pinned aiohttp version.
                async with session.get(url, timeout=10) as response:
                    content = await response.read()
                    return url, content
            except Exception as e:
                # Best-effort: report and carry on with the other URLs.
                print(f"Error fetching {url}: {e}")
                return url, None

    # One shared session for all requests; gather preserves input order.
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_single_feed(session, url) for url in urls]
        results = await asyncio.gather(*tasks)
        return results
def parse_feeds_parallel(feed_contents):
    """Parse multiple (url, xml_bytes) payloads using a small thread pool.

    Entries with falsy content are skipped; entries whose XML fails to
    parse are reported and omitted from the result.

    Returns:
        list of (url, parsed_root) tuples.
    """
    results = []
    with ThreadPoolExecutor(max_workers=4) as executor:
        # Submit one parse job per non-empty payload.
        pending = [(url, executor.submit(etree.fromstring, content))
                   for url, content in feed_contents if content]
        for url, future in pending:
            try:
                results.append((url, future.result()))
            except Exception as e:
                print(f"Error parsing {url}: {e}")
    return results
Practical Applications
Here are some practical use cases for RSS/Atom feed parsing:
1. Feed Aggregator
class FeedAggregator:
    """Collects feed URLs and merges their items into one sorted stream."""

    def __init__(self):
        # Each entry: {'url': ..., 'category': ...}
        self.feeds = []

    def add_feed(self, url, category=None):
        """Register a feed URL with an optional category label."""
        self.feeds.append({'url': url, 'category': category})

    def aggregate_all(self):
        """Fetch every registered feed and return all items, newest first."""
        collected = []
        for source in self.feeds:
            parsed = safe_feed_parser(source['url'])
            if not parsed or 'items' not in parsed:
                continue
            for entry in parsed['items']:
                # Tag each item with its origin so consumers can group them.
                entry['source_category'] = source['category']
                entry['source_url'] = source['url']
                collected.append(entry)
        # Sort by publication-date string, newest first.
        return sorted(collected, key=lambda e: e.get('pubDate', ''), reverse=True)
2. Content Filter
def filter_feed_content(url, keywords, exclude_keywords=None):
    """Return feed items whose title/description match any of *keywords*.

    Items also matching any of *exclude_keywords* are dropped even when
    they match *keywords*. Matching is case-insensitive substring search.
    """
    data = safe_feed_parser(url)
    if not data:
        return []

    wanted = [kw.lower() for kw in keywords]
    blocked = [kw.lower() for kw in exclude_keywords] if exclude_keywords else []

    matches = []
    for item in data.get('items', []):
        # Search both title and description as one lowercase haystack.
        haystack = "{} {}".format(
            (item.get('title', '') or '').lower(),
            (item.get('description', '') or '').lower(),
        )
        if not any(kw in haystack for kw in wanted):
            continue
        if any(kw in haystack for kw in blocked):
            continue
        matches.append(item)
    return matches
Best Practices and Tips
- Always handle encoding properly: RSS feeds can have various encodings
- Use recovery parsing: Many feeds have minor XML errors
- Implement caching: Avoid fetching the same feed repeatedly
- Respect robots.txt: Check the website's robots.txt file
- Add rate limiting: Don't overwhelm servers with requests
- Validate feed URLs: Ensure URLs are properly formatted
- Handle redirects: Many feeds redirect to new URLs
When working with pages whose feeds or content are rendered dynamically by JavaScript, you may need to fetch the page with a headless browser first and then parse the resulting markup with lxml.
Conclusion
lxml provides excellent support for parsing and manipulating RSS and Atom feeds with its powerful XPath support, namespace handling, and recovery parsing capabilities. The examples above demonstrate comprehensive techniques for handling real-world feeds, including error handling, performance optimization, and practical applications. Whether you're building a feed reader, content aggregator, or monitoring system, lxml gives you the tools needed for robust feed processing.
Remember to always test your feed parsing code with various feed formats and handle edge cases gracefully, as feed quality can vary significantly across different sources.