How to Extract Structured Data Like JSON-LD or Microdata Using Cheerio
Structured data is essential for SEO and providing rich information about web content. JSON-LD (JavaScript Object Notation for Linked Data) and microdata are two popular formats for embedding structured data in HTML. Cheerio, a server-side jQuery implementation for Node.js, provides powerful tools for extracting this valuable information from web pages.
Understanding Structured Data Formats
Before diving into extraction techniques, it's important to understand the different structured data formats you'll encounter:
JSON-LD
JSON-LD is the most common and Google-recommended format for structured data. It's typically embedded in `<script>` tags carrying the attribute `type="application/ld+json"`:
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Product",
"name": "Example Product",
"description": "A great product for developers",
"brand": "TechCorp",
"offers": {
"@type": "Offer",
"price": "29.99",
"priceCurrency": "USD"
}
}
</script>
Microdata
Microdata uses HTML attributes to embed structured data directly into HTML elements:
<div itemscope itemtype="https://schema.org/Product">
<h1 itemprop="name">Example Product</h1>
<p itemprop="description">A great product for developers</p>
<span itemprop="brand">TechCorp</span>
<div itemprop="offers" itemscope itemtype="https://schema.org/Offer">
<span itemprop="price">29.99</span>
<span itemprop="priceCurrency">USD</span>
</div>
</div>
Setting Up Cheerio for Structured Data Extraction
First, install the necessary dependencies:
npm install cheerio axios
Here's a basic setup for loading and parsing HTML with Cheerio:
const cheerio = require('cheerio');
const axios = require('axios');
/**
 * Fetch a URL and return a Cheerio instance for its HTML.
 *
 * @param {string} url - Page to fetch.
 * @returns {Promise<cheerio.CheerioAPI|null>} Loaded document, or null
 *   when the request fails (the error is logged, not rethrown).
 */
async function loadPage(url) {
  let response;
  try {
    response = await axios.get(url);
  } catch (error) {
    console.error('Error loading page:', error);
    return null;
  }
  return cheerio.load(response.data);
}
Extracting JSON-LD Data
JSON-LD is the easiest structured data format to extract because it's contained within script tags. Here's how to extract all JSON-LD data from a page:
/**
 * Collect every JSON-LD payload embedded in the document.
 *
 * @param {cheerio.CheerioAPI} $ - Loaded Cheerio document.
 * @returns {Array} Parsed JSON-LD values, one per well-formed
 *   <script type="application/ld+json"> block; malformed blocks are
 *   logged and skipped.
 */
function extractJsonLd($) {
  const parsed = [];
  $('script[type="application/ld+json"]').each((_, node) => {
    try {
      const raw = $(node).html();
      parsed.push(JSON.parse(raw));
    } catch (error) {
      // A malformed block must not abort extraction of the others.
      console.error('Error parsing JSON-LD:', error);
    }
  });
  return parsed;
}
// Usage example
/**
 * Convenience wrapper: fetch a page and return its JSON-LD payloads.
 *
 * @param {string} url - Page to scrape.
 * @returns {Promise<Array|null>} Parsed JSON-LD data, or null when the
 *   page could not be loaded.
 */
async function scrapeJsonLd(url) {
  const $ = await loadPage(url);
  return $ ? extractJsonLd($) : null;
}
Advanced JSON-LD Extraction with Type Filtering
Often, you'll want to extract specific types of structured data. Here's an enhanced function that filters by schema type:
/**
 * Extract JSON-LD items of a specific schema.org type.
 *
 * Handles all common JSON-LD layouts: a single object, an array of
 * objects, and wrapper objects whose entities are nested under `@graph`
 * (a very common publisher pattern).
 *
 * @param {cheerio.CheerioAPI} $ - Loaded Cheerio document.
 * @param {string} targetType - Schema type to match, e.g. 'Product'.
 * @returns {Array<object>} Matching items; malformed blocks are skipped.
 */
function extractJsonLdByType($, targetType) {
  const filteredData = [];

  // True when the node declares targetType directly or in an @type array.
  const matchesType = (node) =>
    node['@type'] === targetType ||
    (Array.isArray(node['@type']) && node['@type'].includes(targetType));

  $('script[type="application/ld+json"]').each((index, element) => {
    try {
      const data = JSON.parse($(element).html());
      // Handle both single objects and arrays
      const items = Array.isArray(data) ? data : [data];
      items.forEach((item) => {
        if (matchesType(item)) {
          filteredData.push(item);
        }
        // Entities are often nested under @graph — search there too.
        if (Array.isArray(item['@graph'])) {
          item['@graph'].forEach((node) => {
            if (matchesType(node)) {
              filteredData.push(node);
            }
          });
        }
      });
    } catch (error) {
      console.error('Error parsing JSON-LD:', error);
    }
  });
  return filteredData;
}
// Usage: extract only Product entries. `$` is a document previously
// obtained via loadPage() / cheerio.load().
const products = extractJsonLdByType($, 'Product');
console.log('Products found:', products);
Extracting Microdata
Microdata extraction is more complex because the data is embedded within HTML attributes. Here's a comprehensive function to extract microdata:
/**
 * Extract all microdata items from the document.
 *
 * Walks every element carrying `itemscope` and gathers its `itemprop`
 * values. Nested scopes are returned as structured objects (recursion)
 * instead of being flattened to concatenated text, and their properties
 * are not duplicated onto the parent item.
 *
 * @param {cheerio.CheerioAPI} $ - Loaded Cheerio document.
 * @returns {Array<{type: (string|undefined), properties: Object}>}
 */
function extractMicrodata($) {
  // Read one itemscope element into { type, properties }. Defined as a
  // closure so the recursion can reuse `$` from this function's scope.
  const readScope = ($scope) => {
    const item = {
      type: $scope.attr('itemtype'),
      properties: {}
    };
    $scope.find('[itemprop]').addBack('[itemprop]').each((propIndex, propElement) => {
      const $prop = $(propElement);
      const propName = $prop.attr('itemprop');
      if (!propName) return;
      // A property whose closest itemscope ancestor is a *nested* scope
      // belongs to that nested item, not to this one — skip it here.
      if ($prop[0] !== $scope[0] &&
          $prop.parent().closest('[itemscope]')[0] !== $scope[0]) {
        return;
      }
      let propValue;
      // Determine the property value based on element type.
      if ($prop.is('meta')) {
        propValue = $prop.attr('content');
      } else if ($prop.is('link, a')) {
        propValue = $prop.attr('href');
      } else if ($prop.is('img')) {
        propValue = $prop.attr('src');
      } else if ($prop.is('time')) {
        propValue = $prop.attr('datetime') || $prop.text().trim();
      } else if ($prop.is('[itemscope]') && $prop[0] !== $scope[0]) {
        // Nested item: recurse. Note that `attr('itemscope')` yields ''
        // (falsy) for boolean attributes, so the selector form
        // `.is('[itemscope]')` is required to detect nesting.
        propValue = readScope($prop);
      } else {
        propValue = $prop.text().trim();
      }
      // Handle multiple properties with the same name.
      if (item.properties[propName]) {
        if (!Array.isArray(item.properties[propName])) {
          item.properties[propName] = [item.properties[propName]];
        }
        item.properties[propName].push(propValue);
      } else {
        item.properties[propName] = propValue;
      }
    });
    return item;
  };

  const microdataItems = [];
  $('[itemscope]').each((index, element) => {
    microdataItems.push(readScope($(element)));
  });
  return microdataItems;
}
/**
 * Extract the properties of a single (nested) microdata scope.
 *
 * Fix: the previous version called the document-level `$` function,
 * which is not in scope inside this helper (ReferenceError at runtime).
 * Iterating with `.eq(i)` stays within the passed-in Cheerio selection,
 * so no `$` is needed.
 *
 * @param {cheerio.Cheerio} $element - Element carrying `itemscope`.
 * @returns {{type: (string|undefined), properties: Object}}
 */
function extractNestedMicrodata($element) {
  const nestedItem = {
    type: $element.attr('itemtype'),
    properties: {}
  };
  const $props = $element.find('[itemprop]');
  for (let i = 0; i < $props.length; i += 1) {
    const $prop = $props.eq(i); // wrapped selection — no `$` required
    const propName = $prop.attr('itemprop');
    if (!propName) continue;
    let propValue;
    if ($prop.is('meta')) {
      propValue = $prop.attr('content');
    } else if ($prop.is('link, a')) {
      propValue = $prop.attr('href');
    } else if ($prop.is('img')) {
      propValue = $prop.attr('src');
    } else {
      propValue = $prop.text().trim();
    }
    nestedItem.properties[propName] = propValue;
  }
  return nestedItem;
}
Extracting RDFa Data
RDFa (Resource Description Framework in Attributes) is another structured data format. Here's how to extract RDFa data:
/**
 * Extract RDFa items (elements with `typeof`) and their `property` values.
 *
 * Value resolution follows RDFa conventions: explicit `content`
 * attribute, then `href`, then `src`, then the element's text. Presence
 * is tested with `!== undefined` so a valid empty `content=""` is used
 * as-is instead of silently falling through to the text content.
 *
 * @param {cheerio.CheerioAPI} $ - Loaded Cheerio document.
 * @returns {Array<{type: string, properties: Object}>}
 */
function extractRdfa($) {
  const rdfaItems = [];
  $('[typeof]').each((index, element) => {
    const $element = $(element);
    const item = {
      type: $element.attr('typeof'),
      properties: {}
    };
    $element.find('[property]').addBack('[property]').each((propIndex, propElement) => {
      const $prop = $(propElement);
      const propName = $prop.attr('property');
      if (!propName) return;
      const content = $prop.attr('content');
      const href = $prop.attr('href');
      const src = $prop.attr('src');
      let propValue;
      if (content !== undefined) {
        propValue = content;
      } else if (href !== undefined) {
        propValue = href;
      } else if (src !== undefined) {
        propValue = src;
      } else {
        propValue = $prop.text().trim();
      }
      item.properties[propName] = propValue;
    });
    rdfaItems.push(item);
  });
  return rdfaItems;
}
Complete Structured Data Extraction Function
Here's a comprehensive function that extracts all types of structured data:
/**
 * Fetch a page and extract every supported structured data format.
 *
 * @param {string} url - Page to scrape.
 * @returns {Promise<object|null>} Object with `jsonLd`, `microdata`,
 *   `rdfa`, the source `url`, and an `extractedAt` ISO timestamp — or
 *   null when the page could not be loaded.
 */
async function extractAllStructuredData(url) {
  const $ = await loadPage(url);
  if (!$) {
    return null;
  }
  return {
    jsonLd: extractJsonLd($),
    microdata: extractMicrodata($),
    rdfa: extractRdfa($),
    url,
    extractedAt: new Date().toISOString()
  };
}
// Usage example
/**
 * Demo entry point: scrape one product page and report any Product
 * entries found in its JSON-LD.
 */
async function main() {
  const url = 'https://example.com/product-page';
  const data = await extractAllStructuredData(url);
  // extractAllStructuredData returns null when loadPage fails; without
  // this guard, `data.jsonLd` below would throw a TypeError.
  if (!data) {
    console.error('No structured data extracted from', url);
    return;
  }
  console.log('Extracted structured data:', JSON.stringify(data, null, 2));
  // Filter for specific schema types
  const products = data.jsonLd.filter(item => item['@type'] === 'Product');
  console.log('Products found:', products);
}
main().catch(console.error);
Error Handling and Best Practices
When extracting structured data, it's important to implement robust error handling:
/**
 * Run all three extractors, isolating failures so one broken format
 * does not prevent the others from being collected.
 *
 * @param {cheerio.CheerioAPI} $ - Loaded Cheerio document.
 * @returns {{jsonLd: Array, microdata: Array, rdfa: Array,
 *   errors: Array<{type: string, error: string}>}}
 */
function safeExtractStructuredData($) {
  const results = {
    jsonLd: [],
    microdata: [],
    rdfa: [],
    errors: []
  };
  const extractors = [
    ['jsonLd', extractJsonLd],
    ['microdata', extractMicrodata],
    ['rdfa', extractRdfa]
  ];
  for (const [type, extract] of extractors) {
    try {
      results[type] = extract($);
    } catch (error) {
      results.errors.push({ type, error: error.message });
    }
  }
  return results;
}
Performance Optimization Tips
- Selective Parsing: If you only need specific types of structured data, use targeted selectors
- Caching: Cache extracted data when processing multiple pages
- Parallel Processing: Use Promise.all() for processing multiple URLs
/**
 * Extract structured data from several pages concurrently.
 *
 * @param {string[]} urls - Pages to process.
 * @returns {Promise<Array>} One result (or null) per input URL, in order.
 */
async function extractFromMultipleUrls(urls) {
  return Promise.all(urls.map((url) => extractAllStructuredData(url)));
}
Working with Dynamic Content
For pages with JavaScript-rendered content, Cheerio alone might not be sufficient. In such cases, you might need to use tools like Puppeteer for handling dynamic content before applying Cheerio for structured data extraction.
Validation and Schema Verification
After extracting structured data, you should validate it against schema.org specifications:
/**
 * Sanity-check extracted structured data against the minimal schema.org
 * requirement that every JSON-LD item carries @context and @type.
 *
 * @param {{jsonLd?: Array}} data - Output of an extraction run.
 * @returns {{isValid: boolean, errors: string[], warnings: string[]}}
 */
function validateStructuredData(data) {
  const errors = [];
  const warnings = [];
  (data.jsonLd || []).forEach((item, index) => {
    if (!item['@context'] || !item['@type']) {
      errors.push(`JSON-LD item ${index} missing @context or @type`);
    }
  });
  return {
    isValid: errors.length === 0,
    errors,
    warnings
  };
}
Working with Web Scraping APIs
For production applications, consider using specialized web scraping APIs that can handle both static and dynamic content extraction. These services often provide built-in structured data extraction capabilities and handle complex scenarios like browser session management automatically.
Python Alternative
If you prefer Python, here's an equivalent approach using BeautifulSoup:
from bs4 import BeautifulSoup
import json
import requests
def extract_json_ld(soup):
    """Collect every JSON-LD payload embedded in the parsed document.

    Args:
        soup: BeautifulSoup document.

    Returns:
        list: Parsed JSON-LD values, one per well-formed
        ``<script type="application/ld+json">`` block. Empty and
        malformed blocks are skipped.
    """
    json_ld_data = []
    scripts = soup.find_all('script', type='application/ld+json')
    for script in scripts:
        # .string is None for empty or multi-child <script> tags;
        # json.loads(None) would raise TypeError (not JSONDecodeError),
        # so skip those blocks explicitly.
        if script.string is None:
            continue
        try:
            data = json.loads(script.string)
            json_ld_data.append(data)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON-LD: {e}")
    return json_ld_data
def extract_microdata(soup):
    """Extract microdata items (elements carrying ``itemscope``).

    Args:
        soup: BeautifulSoup document.

    Returns:
        list[dict]: One ``{'type': ..., 'properties': {...}}`` dict per
        itemscope element. When a property name repeats, the last value
        wins (matching the original behavior).
    """
    results = []
    for scope in soup.find_all(attrs={'itemscope': True}):
        properties = {}
        for prop in scope.find_all(attrs={'itemprop': True}):
            name = prop.get('itemprop')
            # Pick the value source by tag type, mirroring how browsers
            # resolve microdata values.
            if prop.name == 'meta':
                value = prop.get('content')
            elif prop.name in ['a', 'link']:
                value = prop.get('href')
            elif prop.name == 'img':
                value = prop.get('src')
            else:
                value = prop.get_text(strip=True)
            properties[name] = value
        results.append({'type': scope.get('itemtype'), 'properties': properties})
    return results
# Usage: fetch a page, parse it, and run both extractors.
url = 'https://example.com'
response = requests.get(url)
# response.content (raw bytes) lets BeautifulSoup sniff the encoding itself.
soup = BeautifulSoup(response.content, 'html.parser')
json_ld = extract_json_ld(soup)
microdata = extract_microdata(soup)
print("JSON-LD:", json_ld)
print("Microdata:", microdata)
Conclusion
Extracting structured data with Cheerio is a powerful technique for web scraping and SEO analysis. By combining JSON-LD, microdata, and RDFa extraction methods, you can capture comprehensive structured information from web pages. Remember to handle errors gracefully, validate extracted data, and consider using browser automation tools for JavaScript-heavy sites.
The techniques outlined in this guide will help you build robust scrapers that can extract valuable structured data for various applications, from SEO auditing to competitive analysis and content enrichment.