How to Extract Data from Embedded JSON-LD Scripts
JSON-LD (JSON for Linking Data) is a widely used format for embedding structured data directly into web pages. This structured data provides valuable information about the page content, including product details, reviews, articles, events, and much more. Extracting this data is often more reliable than scraping visible HTML content since it's specifically designed for machine consumption.
What is JSON-LD?
JSON-LD is a method of encoding linked data using JSON. It is commonly embedded in `<script>` tags whose type attribute is `application/ld+json`, and it contains structured data about the webpage content. Major websites like e-commerce platforms, news sites, and blogs use JSON-LD to provide rich snippets to search engines.
Example JSON-LD script:
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Product",
"name": "Example Product",
"image": "https://example.com/product.jpg",
"description": "A great product for developers",
"sku": "12345",
"offers": {
"@type": "Offer",
"priceCurrency": "USD",
"price": "29.99",
"availability": "https://schema.org/InStock"
}
}
</script>
Extracting JSON-LD with Simple HTML DOM (PHP)
Simple HTML DOM Parser is a popular PHP library for parsing HTML. Here's how to extract JSON-LD data:
Basic Extraction
<?php
require_once('simple_html_dom.php');

// Load the HTML content. file_get_html() returns false on failure, so we
// must check before calling methods on the result — otherwise PHP raises
// a fatal "call to a member function on bool" error.
$html = file_get_html('https://example.com/product-page');
if (!$html) {
    die("Failed to load the page\n");
}

// Find all script tags with JSON-LD type
$jsonLdScripts = $html->find('script[type="application/ld+json"]');

foreach ($jsonLdScripts as $script) {
    // Raw JSON payload of the <script> tag
    $jsonContent = $script->innertext;

    // Decode into an associative array; json_decode() returns null on invalid JSON
    $data = json_decode($jsonContent, true);

    if ($data !== null) {
        // Process the structured data; '@type' may be absent, so guard it
        echo "Type: " . ($data['@type'] ?? 'unknown') . "\n";
        if (isset($data['name'])) {
            echo "Name: " . $data['name'] . "\n";
        }
        if (isset($data['offers']['price'])) {
            echo "Price: " . $data['offers']['price'] . "\n";
        }
    }
}

// Free the DOM — simple_html_dom leaks memory without an explicit clear()
$html->clear();
?>
Advanced JSON-LD Processing
<?php
/**
 * Fetch a page and return every JSON-LD object embedded in it.
 *
 * @param string $url Page to fetch.
 * @return array|false Array of decoded JSON-LD structures, or false if the
 *                     page could not be loaded.
 */
function extractJsonLdData($url) {
    $html = file_get_html($url);
    if (!$html) {
        return false;
    }
    $allJsonLdData = [];
    $jsonLdScripts = $html->find('script[type="application/ld+json"]');
    foreach ($jsonLdScripts as $script) {
        $jsonContent = trim($script->innertext);
        // Strip ASCII control characters only. Do NOT strip \x80-\xFF:
        // those bytes are parts of multi-byte UTF-8 sequences, and removing
        // them corrupts every non-ASCII character in the JSON.
        $jsonContent = preg_replace('/[\x00-\x1F]/', '', $jsonContent);
        $data = json_decode($jsonContent, true);
        if (json_last_error() === JSON_ERROR_NONE && $data) {
            $allJsonLdData[] = $data;
        }
    }
    $html->clear();
    return $allJsonLdData;
}

// Usage: extractJsonLdData() returns false on fetch failure — guard
// before iterating, since foreach over false raises a warning.
$structuredData = extractJsonLdData('https://example.com');
if ($structuredData !== false) {
    foreach ($structuredData as $data) {
        switch ($data['@type'] ?? '') {
            case 'Product':
                processProductData($data);
                break;
            case 'Article':
                processArticleData($data);
                break;
            case 'Review':
                processReviewData($data);
                break;
        }
    }
}

/**
 * Print the key fields of a Product JSON-LD structure.
 *
 * @param array $product Decoded JSON-LD Product node.
 */
function processProductData($product) {
    echo "Product: " . ($product['name'] ?? 'N/A') . "\n";
    echo "Description: " . ($product['description'] ?? 'N/A') . "\n";
    echo "Price: " . ($product['offers']['price'] ?? 'N/A') . "\n";
    echo "Currency: " . ($product['offers']['priceCurrency'] ?? 'N/A') . "\n";
}
?>
Python Implementation
Using BeautifulSoup
import json
import requests
from bs4 import BeautifulSoup
def extract_json_ld(url):
    """Extract all JSON-LD data from a webpage.

    Args:
        url: The page to fetch.

    Returns:
        A list of decoded JSON-LD structures; scripts that fail to parse
        are skipped with a diagnostic message.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the request takes longer than 10 seconds.
    """
    # Use a timeout so the call cannot hang forever, and fail fast on
    # HTTP errors instead of silently parsing an error page.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all script tags with JSON-LD type
    json_scripts = soup.find_all('script', type='application/ld+json')

    json_ld_data = []
    for script in json_scripts:
        try:
            # script.string is None for empty tags -> TypeError, caught below
            data = json.loads(script.string)
            json_ld_data.append(data)
        except (json.JSONDecodeError, TypeError) as e:
            print(f"Error parsing JSON-LD: {e}")
            continue

    return json_ld_data
# Usage example
url = "https://example.com/product"
structured_data = extract_json_ld(url)

for data in structured_data:
    if data.get('@type') != 'Product':
        continue
    # Read the nested offer once instead of re-fetching it per field.
    offer = data.get('offers', {})
    print(f"Product: {data.get('name', 'N/A')}")
    print(f"Price: {offer.get('price', 'N/A')}")
    print(f"Availability: {offer.get('availability', 'N/A')}")
Advanced Python Processing
import json
import re
from typing import List, Dict, Any
from bs4 import BeautifulSoup
import requests
class JsonLdExtractor:
    """Fetches web pages and extracts embedded JSON-LD structured data."""

    def __init__(self):
        # Reuse one session (connection pooling) and send a browser-like
        # User-Agent so fewer sites reject the request outright.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def extract_from_url(self, url: str) -> List[Dict[Any, Any]]:
        """Extract JSON-LD data from a URL.

        Returns an empty list on any network failure (a message is printed).
        """
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return self.extract_from_html(response.text)
        except requests.RequestException as e:
            print(f"Error fetching URL: {e}")
            return []

    def extract_from_html(self, html_content: str) -> List[Dict[Any, Any]]:
        """Extract JSON-LD data from HTML content.

        Top-level JSON arrays are flattened into the result list; scripts
        that fail to parse are skipped with a diagnostic.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        json_ld_data: List[Dict[Any, Any]] = []

        scripts = soup.find_all('script', type='application/ld+json')
        for script in scripts:
            if not script.string:
                continue
            try:
                json_str = self._clean_json_string(script.string)
                data = json.loads(json_str)
                # A single <script> may hold an array of JSON-LD objects.
                if isinstance(data, list):
                    json_ld_data.extend(data)
                else:
                    json_ld_data.append(data)
            except json.JSONDecodeError as e:
                print(f"JSON parsing error: {e}")
                continue
        return json_ld_data

    def _clean_json_string(self, json_str: str) -> str:
        """Clean a raw <script> payload before JSON parsing.

        Decodes the HTML entities that commonly leak into JSON-LD blocks.
        NOTE: &quot; is deliberately NOT decoded — turning it into a raw
        double quote would break the JSON string delimiters.
        """
        # Decode &amp; last so that e.g. "&amp;lt;" becomes the literal "&lt;".
        json_str = json_str.replace('&lt;', '<').replace('&gt;', '>')
        json_str = json_str.replace('&amp;', '&')
        # Remove control characters that make json.loads fail.
        json_str = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', json_str)
        return json_str.strip()

    def filter_by_type(self, data: List[Dict], schema_type: str) -> List[Dict]:
        """Filter JSON-LD data by @type.

        Handles both string values and list values — JSON-LD permits
        '@type': ["Product", "Thing"].
        """
        matched = []
        for item in data:
            item_type = item.get('@type')
            if item_type == schema_type or (
                isinstance(item_type, list) and schema_type in item_type
            ):
                matched.append(item)
        return matched

    def extract_product_info(self, data: Dict) -> Dict:
        """Extract a flat dict of product fields from a Product JSON-LD node.

        Returns {} if the node is not a Product; None-valued fields are
        dropped from the result.
        """
        item_type = data.get('@type')
        is_product = item_type == 'Product' or (
            isinstance(item_type, list) and 'Product' in item_type
        )
        if not is_product:
            return {}
        product_info = {
            'name': data.get('name'),
            'description': data.get('description'),
            'sku': data.get('sku'),
            # "brand" may be a plain string or a nested Brand object.
            'brand': data.get('brand', {}).get('name') if isinstance(data.get('brand'), dict) else data.get('brand'),
            'image': data.get('image'),
        }
        # Extract offer information; "offers" may be a dict or a list of dicts.
        offers = data.get('offers', {})
        if offers:
            if isinstance(offers, list):
                offers = offers[0]  # Take first offer
            product_info.update({
                'price': offers.get('price'),
                'currency': offers.get('priceCurrency'),
                'availability': offers.get('availability'),
                'url': offers.get('url')
            })
        return {k: v for k, v in product_info.items() if v is not None}
# Usage
extractor = JsonLdExtractor()
json_ld_data = extractor.extract_from_url('https://example.com/product')

# Filter and process product data
products = extractor.filter_by_type(json_ld_data, 'Product')
for product_data in products:
    # Pretty-print each product's flattened field dict.
    print(json.dumps(extractor.extract_product_info(product_data), indent=2))
JavaScript Implementation
Browser-based Extraction
/**
 * Collect every JSON-LD object embedded in the current document.
 * Top-level arrays are flattened; unparseable scripts are logged and skipped.
 * @returns {Array<Object>} the parsed JSON-LD entries
 */
function extractJsonLd() {
    const results = [];
    const scripts = document.querySelectorAll('script[type="application/ld+json"]');

    for (const script of scripts) {
        try {
            const parsed = JSON.parse(script.textContent);
            // One <script> may hold a single object or an array of objects.
            if (Array.isArray(parsed)) {
                results.push(...parsed);
            } else {
                results.push(parsed);
            }
        } catch (error) {
            console.error('Error parsing JSON-LD:', error);
        }
    }

    return results;
}
// Usage
const structuredData = extractJsonLd();
console.log('Found JSON-LD data:', structuredData);

// Keep only Product entries and print their key fields.
const products = structuredData.filter((entry) => entry['@type'] === 'Product');
for (const product of products) {
    console.log('Product:', product.name);
    console.log('Price:', product.offers?.price);
}
Node.js with Puppeteer
For dynamic content or when handling JavaScript-heavy websites, you can use Puppeteer:
const puppeteer = require('puppeteer');

/**
 * Load a page in headless Chrome and return its JSON-LD data.
 * Using a real browser means JSON-LD injected by page JavaScript is
 * captured too.
 *
 * @param {string} url Page to load.
 * @returns {Promise<Array<Object>>} Parsed JSON-LD entries.
 */
async function extractJsonLdWithPuppeteer(url) {
    const browser = await puppeteer.launch();
    try {
        // newPage() must be INSIDE the try block: if it throws, the finally
        // clause still closes the browser. The original version created the
        // page before the try, leaking the browser process on failure.
        const page = await browser.newPage();
        await page.goto(url, { waitUntil: 'networkidle0' });

        // Extract JSON-LD data in the page context
        const jsonLdData = await page.evaluate(() => {
            const scripts = document.querySelectorAll('script[type="application/ld+json"]');
            const data = [];
            scripts.forEach(script => {
                try {
                    const parsed = JSON.parse(script.textContent);
                    if (Array.isArray(parsed)) {
                        data.push(...parsed);
                    } else {
                        data.push(parsed);
                    }
                } catch (error) {
                    console.error('JSON parsing error:', error);
                }
            });
            return data;
        });
        return jsonLdData;
    } finally {
        await browser.close();
    }
}

// Usage
(async () => {
    const data = await extractJsonLdWithPuppeteer('https://example.com');
    console.log('Extracted data:', JSON.stringify(data, null, 2));
})();
Using Command Line Tools
jq for JSON Processing
After extracting JSON-LD with curl and grep:
# Extract JSON-LD from a webpage.
# Collapse newlines first: JSON-LD blocks are usually pretty-printed across
# many lines, and `grep -o` only matches within a single line, so without
# `tr` the pipeline silently returns nothing for multi-line scripts.
curl -s "https://example.com" | tr '\n' ' ' | grep -o '<script type="application/ld+json">[^<]*</script>' | sed 's/<script[^>]*>//g; s/<\/script>//g' | jq '.'
# Extract specific product information
curl -s "https://example.com" | tr '\n' ' ' | grep -o '<script type="application/ld+json">[^<]*</script>' | sed 's/<script[^>]*>//g; s/<\/script>//g' | jq 'select(.["@type"] == "Product") | {name: .name, price: .offers.price}'
Best Practices and Tips
Error Handling
Always implement robust error handling when parsing JSON-LD:
def safe_json_parse(json_string):
    """Parse JSON, retrying once after stripping trailing commas.

    Returns the decoded value, or None (after printing the original
    parse error) if both attempts fail.
    """
    try:
        return json.loads(json_string)
    except json.JSONDecodeError as e:
        # Retry after repairing the most common authoring mistakes.
        repaired = json_string.strip()
        repaired = re.sub(r',\s*}', '}', repaired)  # trailing comma in object
        repaired = re.sub(r',\s*]', ']', repaired)  # trailing comma in array
        try:
            return json.loads(repaired)
        except json.JSONDecodeError:
            print(f"Unable to parse JSON: {e}")
            return None
Data Validation
Validate the extracted data structure:
def validate_product_data(product):
    """Return True iff *product* has both '@type' and 'name' keys and its
    '@type' is exactly 'Product'."""
    if any(key not in product for key in ('@type', 'name')):
        return False
    return product['@type'] == 'Product'
Performance Considerations
- Cache parsed JSON-LD data to avoid repeated parsing
- Use appropriate request headers to avoid being blocked
- Implement rate limiting for large-scale scraping
- Consider using headless browsers for dynamic content
Common JSON-LD Types
- Product: E-commerce product information
- Article: News articles and blog posts
- Review: User reviews and ratings
- Event: Events and activities
- Organization: Company and organization data
- Person: Individual profiles
- Recipe: Cooking recipes
- Movie: Film information
Conclusion
Extracting data from embedded JSON-LD scripts is an efficient way to gather structured information from websites. Whether you're using Simple HTML DOM in PHP, BeautifulSoup in Python, or browser-based JavaScript, the key is to properly locate the script tags, parse the JSON content safely, and handle errors gracefully. JSON-LD provides a reliable alternative to traditional HTML scraping and often contains richer, more structured data than what's visible on the page.
Remember to respect robots.txt files and implement appropriate rate limiting when scraping at scale. For complex scenarios involving dynamic content, consider using advanced browser automation tools that can handle JavaScript-rendered content effectively.