To extract all links from a webpage with Cheerio, you first fetch the HTML (Cheerio parses markup but does not make HTTP requests, which is why axios is used here), then use Cheerio's jQuery-like selectors to find anchor tags and read their `href` attributes.
## Installation

Install the required packages using npm:

```bash
npm install axios cheerio
```
## Basic Link Extraction

Here's a simple function to extract all links from a webpage:
```javascript
const axios = require('axios');
const cheerio = require('cheerio');

const extractLinks = async (url) => {
  try {
    const response = await axios.get(url);
    const $ = cheerio.load(response.data);
    const links = [];

    // 'a[href]' matches only anchors that actually carry an href attribute
    $('a[href]').each((index, element) => {
      const href = $(element).attr('href');
      if (href && href.trim()) {
        links.push(href);
      }
    });

    return links;
  } catch (error) {
    console.error('Error extracting links:', error.message);
    return [];
  }
};

// Usage
extractLinks('https://example.com')
  .then(links => console.log('Found links:', links));
```
## Enhanced Link Extraction with Additional Data

Extract links along with their text content and other attributes:
```javascript
const extractDetailedLinks = async (url) => {
  try {
    const response = await axios.get(url);
    const $ = cheerio.load(response.data);
    const links = [];

    $('a[href]').each((index, element) => {
      const $link = $(element);
      const href = $link.attr('href');
      if (href && href.trim()) {
        links.push({
          url: href,
          text: $link.text().trim(),
          title: $link.attr('title') || '',
          target: $link.attr('target') || ''
        });
      }
    });

    return links;
  } catch (error) {
    console.error('Error extracting detailed links:', error.message);
    return [];
  }
};
```
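Each entry in the returned array is a plain object, so the results can be filtered or mapped directly. A quick usage sketch, reusing `extractDetailedLinks` from above:

```javascript
// Log only links that open in a new tab
extractDetailedLinks('https://example.com').then(links => {
  const newTab = links.filter(link => link.target === '_blank');
  console.log(`${newTab.length} of ${links.length} links open in a new tab`);
});
```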
## Handling Relative URLs

Scraped hrefs are often relative (`/about`, `../index.html`). Convert them to absolute URLs with the built-in `URL` constructor, which resolves an href against a base URL:
```javascript
// Note: URL is a global in modern Node.js, so this require is optional
const { URL } = require('url');

const extractAbsoluteLinks = async (baseUrl) => {
  try {
    const response = await axios.get(baseUrl);
    const $ = cheerio.load(response.data);
    const links = [];

    $('a[href]').each((index, element) => {
      const href = $(element).attr('href');
      if (href && href.trim()) {
        try {
          // Resolve relative hrefs against the page's own URL
          const absoluteUrl = new URL(href, baseUrl).toString();
          links.push(absoluteUrl);
        } catch (urlError) {
          // Skip hrefs that can't be parsed as URLs
          console.warn(`Invalid URL: ${href}`);
        }
      }
    });

    return links;
  } catch (error) {
    console.error('Error extracting absolute links:', error.message);
    return [];
  }
};
```
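The second argument to `new URL()` is the base against which relative hrefs are resolved. A few representative cases:

```javascript
// Absolute path: replaces the base's path entirely
new URL('/about', 'https://example.com/blog/post').toString();
// => 'https://example.com/about'

// Relative path: resolved against the base's directory
new URL('page2', 'https://example.com/blog/post').toString();
// => 'https://example.com/blog/page2'

// Already-absolute href: the base is ignored
new URL('https://other.com/x', 'https://example.com').toString();
// => 'https://other.com/x'
```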
## Filtering Links by Type

Classify links as internal, external, email (`mailto:`), or phone (`tel:`):
```javascript
const extractFilteredLinks = async (url) => {
  try {
    const response = await axios.get(url);
    const $ = cheerio.load(response.data);
    const hostname = new URL(url).hostname; // parse the base hostname once, not per link

    const result = {
      internal: [],
      external: [],
      email: [],
      phone: []
    };

    $('a[href]').each((index, element) => {
      const href = $(element).attr('href');
      if (!href || !href.trim()) return;

      if (href.startsWith('mailto:')) {
        result.email.push(href);
      } else if (href.startsWith('tel:')) {
        result.phone.push(href);
      } else if (href.startsWith('http')) {
        // Compare hostnames exactly; a substring check would misclassify
        // external URLs that merely mention the host in a path or query string
        try {
          const linkHost = new URL(href).hostname;
          (linkHost === hostname ? result.internal : result.external).push(href);
        } catch (urlError) {
          console.warn(`Invalid URL: ${href}`);
        }
      } else {
        result.internal.push(href);
      }
    });

    return result;
  } catch (error) {
    console.error('Error filtering links:', error.message);
    return { internal: [], external: [], email: [], phone: [] };
  }
};
```
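The grouped result makes downstream handling straightforward, for example:

```javascript
extractFilteredLinks('https://example.com').then(({ internal, external, email, phone }) => {
  console.log(`internal: ${internal.length}, external: ${external.length}`);
  console.log(`email: ${email.length}, phone: ${phone.length}`);
});
```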
## Complete Example with Error Handling

A production-ready function with comprehensive error handling:
```javascript
const axios = require('axios');
const cheerio = require('cheerio');
const { URL } = require('url');

const scrapeLinks = async (targetUrl, options = {}) => {
  const {
    includeText = false,
    makeAbsolute = true,
    filterDuplicates = true,
    timeout = 10000
  } = options;

  try {
    const response = await axios.get(targetUrl, {
      timeout,
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; LinkExtractor/1.0)'
      }
    });

    const $ = cheerio.load(response.data);
    const links = new Set(); // Use Set to avoid duplicates
    const results = [];

    $('a[href]').each((index, element) => {
      const $element = $(element);
      const href = $element.attr('href');
      if (!href || !href.trim()) return;

      let processedUrl = href;

      // Convert to absolute URL if requested
      if (makeAbsolute) {
        try {
          processedUrl = new URL(href, targetUrl).toString();
        } catch (urlError) {
          console.warn(`Skipping invalid URL: ${href}`);
          return;
        }
      }

      // Skip duplicates if filtering is enabled
      if (filterDuplicates && links.has(processedUrl)) return;
      links.add(processedUrl);

      const linkData = { url: processedUrl };
      if (includeText) {
        linkData.text = $element.text().trim();
        linkData.title = $element.attr('title') || '';
      }

      results.push(linkData);
    });

    return results;
  } catch (error) {
    // Translate low-level network errors into descriptive messages
    if (error.code === 'ENOTFOUND') {
      throw new Error(`Domain not found: ${targetUrl}`);
    } else if (error.code === 'ECONNREFUSED') {
      throw new Error(`Connection refused: ${targetUrl}`);
    } else if (error.response) {
      throw new Error(`HTTP ${error.response.status}: ${error.response.statusText}`);
    } else {
      throw new Error(`Failed to scrape links: ${error.message}`);
    }
  }
};

// Usage examples
(async () => {
  try {
    // Basic usage
    const basicLinks = await scrapeLinks('https://example.com');
    console.log('Basic links:', basicLinks);

    // With options
    const detailedLinks = await scrapeLinks('https://example.com', {
      includeText: true,
      makeAbsolute: true,
      filterDuplicates: true
    });
    console.log('Detailed links:', detailedLinks);
  } catch (error) {
    console.error('Scraping failed:', error.message);
  }
})();
```
## Best Practices

- **Respect robots.txt**: Always check the website's robots.txt file before scraping
- **Rate limiting**: Add delays between requests to avoid overwhelming servers (see the sketch after this list)
- **User-Agent**: Set a proper User-Agent header to identify your scraper
- **Error handling**: Implement robust error handling for network issues and invalid URLs
- **URL validation**: Always validate URLs before processing them
- **Duplicate filtering**: Remove duplicate links to improve efficiency
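As a minimal sketch of the rate-limiting point, the loop below pauses between sequential requests. It reuses the `scrapeLinks` function defined above; the 1000 ms default is an arbitrary example value, not a universal recommendation:

```javascript
const sleep = (ms) => new Promise(resolve => setTimeout(resolve, ms));

const scrapeMany = async (urls, delayMs = 1000) => {
  const all = [];
  for (const url of urls) {
    all.push(await scrapeLinks(url)); // sequential, not parallel, by design
    await sleep(delayMs);             // pause before the next request
  }
  return all;
};
```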
## Limitations

- **JavaScript-rendered content**: Cheerio can't execute JavaScript, so it won't capture links added dynamically
- **Authentication**: Doesn't handle login-protected pages automatically
- **CORS restrictions**: May face CORS issues when running in browsers
- **Large pages**: Memory usage can be high for very large HTML documents
For JavaScript-heavy websites, consider using Puppeteer or Playwright instead of Cheerio.
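For reference, a minimal Puppeteer sketch of the same task (assuming `npm install puppeteer`); because it drives a headless browser, links injected by client-side JavaScript are included:

```javascript
const puppeteer = require('puppeteer');

const extractRenderedLinks = async (url) => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle0' }); // wait for JS-driven loading to settle
    // In the browser, a.href is already resolved to an absolute URL
    return await page.$$eval('a[href]', anchors => anchors.map(a => a.href));
  } finally {
    await browser.close();
  }
};
```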