How do you use Cheerio to extract data from CSS stylesheets?
Cheerio provides several methods to extract CSS-related data from HTML documents, including inline styles, internal `<style>` blocks, and external stylesheet references. While Cheerio doesn't execute JavaScript or compute final styles the way browser automation tools do, it excels at parsing static HTML and extracting CSS information efficiently.
Understanding CSS Data Extraction with Cheerio
Cheerio can extract CSS data from three main sources:
- Inline styles - CSS defined directly in HTML elements via the `style` attribute
- Internal stylesheets - CSS defined within `<style>` tags in the HTML document
- External stylesheet references - Links to external CSS files via `<link>` tags
Extracting Inline Styles
Basic Inline Style Extraction
The most straightforward approach is extracting the `style` attribute from HTML elements:
const cheerio = require('cheerio');

// Sample markup: three elements that each carry an inline style attribute.
const html = `
<div class="container">
<p style="color: red; font-size: 16px; margin: 10px;">Styled paragraph</p>
<div style="background-color: blue; height: 100px; width: 200px;">Blue box</div>
<span style="font-weight: bold; text-decoration: underline;">Bold text</span>
</div>
`;
const $ = cheerio.load(html);

// Visit every element that has a style attribute and print
// "<lowercased tag name>: <raw style attribute>".
$('[style]').each((_position, node) => {
  const $node = $(node);
  const tagName = $node.prop('tagName').toLowerCase();
  const rawStyle = $node.attr('style');
  console.log(`${tagName}: ${rawStyle}`);
});
// Output:
// p: color: red; font-size: 16px; margin: 10px;
// div: background-color: blue; height: 100px; width: 200px;
// span: font-weight: bold; text-decoration: underline;
Parsing Individual Style Properties
To extract specific CSS properties, you can parse the style attribute:
/**
 * Parse the text of an inline `style` attribute into a property/value map.
 *
 * Declarations are separated on `;`. Each declaration is split at its FIRST
 * colon only, so values that themselves contain colons (for example
 * `url(http://example.com/a.png)`) are kept intact.
 *
 * Limitation: a `;` inside a value (e.g. inside `url(data:...;base64,...)`
 * or a quoted string) is still treated as a declaration separator.
 *
 * @param {string|null|undefined} styleString - Raw style attribute text.
 * @returns {Object<string, string>} Map of trimmed property names to trimmed
 *   values. Empty when the input is empty or missing. Declarations without a
 *   property name or without a value are skipped; a later duplicate property
 *   overwrites an earlier one.
 */
function parseStyleAttribute(styleString) {
  const styles = {};
  if (!styleString) return styles;

  for (const declaration of styleString.split(';')) {
    const separatorIndex = declaration.indexOf(':');
    // No colon means this fragment is not a "property: value" pair.
    if (separatorIndex === -1) continue;

    const property = declaration.slice(0, separatorIndex).trim();
    // Everything after the first colon belongs to the value.
    const value = declaration.slice(separatorIndex + 1).trim();
    if (property && value) {
      styles[property] = value;
    }
  }
  return styles;
}
const $ = cheerio.load(html);

// For each styled element, print its tag name and its parsed declarations,
// followed by a separator line.
$('[style]').each((_position, node) => {
  const $node = $(node);
  const declarations = parseStyleAttribute($node.attr('style'));
  console.log('Element:', $node.prop('tagName').toLowerCase());
  console.log('Parsed styles:', declarations);
  console.log('---');
});
Extracting Specific CSS Properties
You can create utility functions to extract specific CSS properties:
/**
 * Look up one CSS property in an element's inline `style` attribute.
 *
 * Relies on the surrounding script's `$` (the loaded Cheerio document) and
 * on `parseStyleAttribute`.
 *
 * @param {*} element - Element (or selection) to wrap with `$`.
 * @param {string} property - CSS property name, e.g. `background-color`.
 * @returns {*} The parsed value for `property`, or `null` when the element
 *   has no style attribute or the lookup yields a falsy value.
 */
function getCSSProperty(element, property) {
  const inlineStyle = $(element).attr('style');
  if (inlineStyle) {
    const declarations = parseStyleAttribute(inlineStyle);
    const found = declarations[property];
    return found ? found : null;
  }
  return null;
}
const $ = cheerio.load(html);

// Extract specific properties: report only elements that define at least
// one of color, background-color or font-size inline.
$('[style]').each((_position, node) => {
  const color = getCSSProperty(node, 'color');
  const backgroundColor = getCSSProperty(node, 'background-color');
  const fontSize = getCSSProperty(node, 'font-size');

  // Nothing of interest on this element; move on to the next one.
  if (!color && !backgroundColor && !fontSize) return;

  const tag = $(node).prop('tagName').toLowerCase();
  console.log({ tag, color, backgroundColor, fontSize });
});
Extracting Internal Stylesheets
Internal CSS defined within `<style>` tags can be extracted and parsed:
// Sample document with one internal stylesheet in <head>.
const htmlWithStyles = `
<!DOCTYPE html>
<html>
<head>
<style>
.header {
background-color: #333;
color: white;
padding: 20px;
}
.content {
margin: 15px;
font-family: Arial, sans-serif;
}
#main-title {
font-size: 24px;
font-weight: bold;
}
</style>
</head>
<body>
<div class="header">Header</div>
<div class="content">Content</div>
</body>
</html>
`;
const $ = cheerio.load(htmlWithStyles);

// Extract all internal stylesheets: print a 1-based heading, the raw CSS text
// of each <style> element, then a separator line.
$('style').each((position, styleNode) => {
  const rawCss = $(styleNode).html();
  const ordinal = position + 1;
  console.log(`Internal stylesheet ${ordinal}:`);
  console.log(rawCss);
  console.log('---');
});
Parsing CSS Rules from Internal Stylesheets
For more advanced CSS parsing, you can extract and analyze CSS rules:
/**
 * Parse flat CSS text into a list of `{ selector, properties }` rules.
 *
 * This is a lightweight regex-based parser, not a full CSS parser:
 * - CSS comments are removed before parsing so they cannot leak into
 *   selectors or property names.
 * - Rules with an empty body (`.a {}`) are reported with empty `properties`
 *   instead of corrupting the selector of the following rule.
 * - Each declaration is split at its FIRST colon only, so values containing
 *   colons (e.g. `url(http://...)`) stay intact.
 *
 * Limitations: nested blocks (such as rules inside `@media`) are not
 * understood, and a `;` or `}` inside a value is treated as a delimiter.
 *
 * @param {string|null|undefined} cssText - Stylesheet text.
 * @returns {Array<{selector: string, properties: Object<string, string>}>}
 *   Rules in source order; empty when the input is empty or missing.
 */
function parseCSS(cssText) {
  const rules = [];
  if (!cssText) return rules;

  // Drop /* ... */ comments up front.
  const source = cssText.replace(/\/\*[\s\S]*?\*\//g, '');

  // Selector: run of characters containing no braces.
  // Body: possibly empty run of characters up to the next closing brace.
  const ruleRegex = /([^{}]+)\{([^}]*)\}/g;
  let match;
  while ((match = ruleRegex.exec(source)) !== null) {
    const selector = match[1].trim();
    const properties = {};

    for (const declaration of match[2].split(';')) {
      const separatorIndex = declaration.indexOf(':');
      // Fragments without a colon (e.g. trailing whitespace) are not declarations.
      if (separatorIndex === -1) continue;

      const property = declaration.slice(0, separatorIndex).trim();
      const value = declaration.slice(separatorIndex + 1).trim();
      if (property && value) {
        properties[property] = value;
      }
    }
    rules.push({ selector, properties });
  }
  return rules;
}
const $ = cheerio.load(htmlWithStyles);

// Parse each <style> element and print every rule's selector and properties.
$('style').each((position, styleNode) => {
  const rules = parseCSS($(styleNode).html());
  console.log(`Stylesheet ${position + 1} rules:`);
  for (const { selector, properties } of rules) {
    console.log(`Selector: ${selector}`);
    console.log('Properties:', properties);
    console.log('---');
  }
});
Extracting External Stylesheet References
Cheerio can extract references to external CSS files:
// Sample document that references three external stylesheets.
const htmlWithLinks = `
<!DOCTYPE html>
<html>
<head>
<link rel="stylesheet" href="/css/main.css">
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto">
<link rel="stylesheet" href="/css/responsive.css" media="screen and (max-width: 768px)">
</head>
<body>
<h1>Page with external stylesheets</h1>
</body>
</html>
`;
const $ = cheerio.load(htmlWithLinks);

// Extract all stylesheet links as { href, media } records; a missing (or
// empty) media attribute is reported as 'all'.
const stylesheets = $('link[rel="stylesheet"]')
  .toArray()
  .map((linkNode) => {
    const $link = $(linkNode);
    return {
      href: $link.attr('href'),
      media: $link.attr('media') || 'all',
    };
  });
console.log('External stylesheets:', stylesheets);
Advanced CSS Data Extraction
Extracting CSS Custom Properties (CSS Variables)
/**
 * Collect CSS custom property declarations (`--name: value`) from CSS text.
 *
 * Compared with a naive `--name: value;` pattern this:
 * - accepts a final declaration that has no trailing `;` (the value stops at
 *   `;`, `}` or end of input instead of running into the following rules);
 * - ignores `--` that is part of a longer identifier, such as the BEM
 *   selector `.btn--primary:hover`;
 * - tolerates whitespace before the colon.
 *
 * `var(--name)` usages are not declarations and are not reported.
 *
 * @param {string|null|undefined} cssText - Stylesheet text.
 * @returns {Object<string, string>} Map from `--name` to its trimmed value;
 *   a later declaration of the same name overwrites an earlier one.
 */
function extractCSSVariables(cssText) {
  const variables = {};
  if (!cssText) return variables;

  // Group 1: start of input or a character that cannot be part of an
  //          identifier, so "--" must begin the property name.
  // Group 2: the name without the leading dashes.
  // Group 3: the value, up to (not including) the next ';' or '}'.
  const variableRegex = /(^|[^\w-])--([\w-]+)\s*:\s*([^;}]+)/g;
  let match;
  while ((match = variableRegex.exec(cssText)) !== null) {
    variables[`--${match[2]}`] = match[3].trim();
  }
  return variables;
}
// Sample stylesheet that declares custom properties on :root and uses them.
const cssWithVariables = `
<style>
:root {
--primary-color: #007bff;
--secondary-color: #6c757d;
--border-radius: 4px;
--font-size-base: 16px;
}
.button {
background-color: var(--primary-color);
border-radius: var(--border-radius);
}
</style>
`;
const $ = cheerio.load(cssWithVariables);

// Read the text of the <style> element, then collect its custom properties.
const $styleTag = $('style');
const cssContent = $styleTag.html();
const cssVariables = extractCSSVariables(cssContent);
console.log('CSS Variables:', cssVariables);
Building a CSS Analyzer
Here's a comprehensive example that combines multiple extraction techniques:
/**
 * Gathers CSS-related data from an HTML document loaded with Cheerio:
 * inline `style` attributes, internal `<style>` blocks and external
 * `<link rel="stylesheet">` references.
 *
 * Relies on `cheerio`, `parseStyleAttribute`, `parseCSS` and
 * `extractCSSVariables` being in scope.
 */
class CheerioStyleExtractor {
  /**
   * @param {string} html - Markup to load with `cheerio.load`.
   */
  constructor(html) {
    this.$ = cheerio.load(html);
  }

  /**
   * Describe every element that carries a `style` attribute.
   *
   * @returns {Array<Object>} One record per element, in document order, with
   *   `tag` (lowercased tag name), `id`, `classes` (raw class attribute),
   *   `styles` (parsed declarations) and `selector` (see `buildSelector`).
   */
  extractInlineStyles() {
    const inlineStyles = [];
    this.$('[style]').each((index, element) => {
      const $el = this.$(element);
      inlineStyles.push({
        tag: $el.prop('tagName').toLowerCase(),
        id: $el.attr('id'),
        classes: $el.attr('class'),
        styles: parseStyleAttribute($el.attr('style')),
        selector: this.buildSelector($el),
      });
    });
    return inlineStyles;
  }

  /**
   * Parse every `<style>` element in the document.
   *
   * @returns {Array<Object>} One record per `<style>` element with its
   *   `index`, parsed `rules`, custom-property `variables` and `rawContent`.
   */
  extractInternalStylesheets() {
    const stylesheets = [];
    this.$('style').each((index, element) => {
      const cssContent = this.$(element).html();
      stylesheets.push({
        index,
        rules: parseCSS(cssContent),
        variables: extractCSSVariables(cssContent),
        rawContent: cssContent,
      });
    });
    return stylesheets;
  }

  /**
   * List `<link rel="stylesheet">` references.
   *
   * @returns {Array<Object>} One record per link with `href`, `media`
   *   (`'all'` when the attribute is missing or empty), `integrity` and
   *   `crossorigin`.
   */
  extractExternalStylesheets() {
    const external = [];
    this.$('link[rel="stylesheet"]').each((index, element) => {
      const $link = this.$(element);
      external.push({
        href: $link.attr('href'),
        media: $link.attr('media') || 'all',
        integrity: $link.attr('integrity'),
        crossorigin: $link.attr('crossorigin'),
      });
    });
    return external;
  }

  /**
   * Build a simple `tag#id.class1.class2` selector for an element.
   *
   * The class attribute is split on any run of whitespace and empty tokens
   * are discarded, so repeated, leading or trailing whitespace never produces
   * `..` or a dangling `.` in the result. Ids and class names are not
   * CSS-escaped.
   *
   * @param {Object} element - Cheerio-style wrapper exposing `prop(name)` and
   *   `attr(name)`.
   * @returns {string} The selector string.
   */
  buildSelector(element) {
    const tag = element.prop('tagName').toLowerCase();
    const id = element.attr('id');
    const classes = element.attr('class');

    let selector = tag;
    if (id) selector += `#${id}`;
    if (classes) {
      const classNames = classes.split(/\s+/).filter(Boolean);
      if (classNames.length > 0) {
        selector += `.${classNames.join('.')}`;
      }
    }
    return selector;
  }

  /**
   * Run all three extractions.
   *
   * @returns {{inlineStyles: Array, internalStylesheets: Array, externalStylesheets: Array}}
   */
  getStyleSummary() {
    return {
      inlineStyles: this.extractInlineStyles(),
      internalStylesheets: this.extractInternalStylesheets(),
      externalStylesheets: this.extractExternalStylesheets(),
    };
  }
}
// Usage example
// Usage example: a document combining an external link, an internal
// stylesheet and inline styles.
const complexHTML = `
<!DOCTYPE html>
<html>
<head>
<link rel="stylesheet" href="/css/main.css">
<style>
:root { --primary: #007bff; }
.header { background: var(--primary); padding: 20px; }
.content { margin: 15px; }
</style>
</head>
<body>
<div class="header" style="color: white; font-weight: bold;">Header</div>
<div class="content" id="main-content" style="background: #f8f9fa;">Content</div>
</body>
</html>
`;
const extractor = new CheerioStyleExtractor(complexHTML);
const styleSummary = extractor.getStyleSummary();

// Pretty-print the summary with a two-space indent.
const prettySummary = JSON.stringify(styleSummary, null, 2);
console.log('Complete style analysis:', prettySummary);
Integration with Web Scraping Workflows
When working with dynamic content that requires JavaScript execution, you might need to combine Cheerio with browser automation tools. For scenarios involving complex styling or JavaScript-heavy websites with dynamic content, consider using Puppeteer first to render the page, then pass the HTML to Cheerio for efficient CSS extraction.
Performance Considerations and Best Practices
Memory Management: When processing large stylesheets, parse and process CSS content incrementally rather than storing everything in memory.
Regex Performance: For complex CSS parsing, consider using dedicated CSS parsing libraries like `css-tree` alongside Cheerio.
Caching: Cache parsed CSS rules when processing multiple pages with similar stylesheets.
Error Handling: Always validate CSS content and handle malformed stylesheets gracefully.
/**
 * Template for extraction wrapped in error handling.
 *
 * `extractedData` is now declared before it is returned. Previously it was
 * never defined, so the success path always raised a ReferenceError that the
 * catch block reported as an extraction failure.
 *
 * @param {string} html - Markup to load with `cheerio.load`.
 * @returns {Object} On success, the `extractedData` object (empty until the
 *   extraction logic is filled in). On failure,
 *   `{ error: <message>, extractedData: null }`.
 */
function safeStyleExtraction(html) {
  try {
    const $ = cheerio.load(html);
    const extractedData = {};
    // Your extraction logic here: populate extractedData using $.
    return extractedData;
  } catch (error) {
    console.error('CSS extraction failed:', error.message);
    return { error: error.message, extractedData: null };
  }
}
Conclusion
Cheerio provides robust capabilities for extracting CSS data from HTML documents, making it an excellent choice for static content analysis. While it doesn't execute JavaScript or compute final styles like browsers do, it efficiently handles inline styles, internal stylesheets, and external CSS references. For comprehensive web scraping projects that require both CSS analysis and DOM element interaction, combining Cheerio with browser automation tools provides the best of both worlds: performance and functionality.
By implementing the techniques and patterns shown in this guide, you can build sophisticated CSS extraction systems that handle various stylesheet formats and provide detailed style analysis for your web scraping projects.