Relative URLs are one of the most common challenges when scraping websites with Cheerio. They appear as partial paths such as `/products/123`, `../images/photo.jpg`, or `contact.html`, and they need to be converted to full (absolute) URLs for proper handling.
Understanding Relative URLs
Relative URLs come in several forms:
- Root-relative: `/products/123` (relative to the domain root)
- Document-relative: `contact.html` (relative to the current page)
- Parent-relative: `../images/photo.jpg` (relative to the parent directory)
- Protocol-relative: `//cdn.example.com/file.js` (inherits the current protocol)
Method 1: Using Node.js URL Module (Recommended)
The most reliable approach uses the `URL` class from Node.js's built-in `url` module:
npm install cheerio axios
const cheerio = require('cheerio');
const axios = require('axios');
const { URL } = require('url');
/**
 * Fetches a page and returns every anchor on it with its href resolved to an
 * absolute URL.
 *
 * @param {string} targetUrl - URL of the page to fetch; also used as the base
 *   against which relative hrefs are resolved.
 * @returns {Promise<Array<{text: string, url: string, isExternal: boolean}>>}
 *   One entry per anchor whose href could be resolved. Resolves to an empty
 *   array (after logging the error) when fetching or parsing fails.
 */
const scrapeWithAbsoluteUrls = async (targetUrl) => {
  try {
    const response = await axios.get(targetUrl);
    const $ = cheerio.load(response.data);
    const baseUrl = new URL(targetUrl);
    // Extract and convert all links
    const links = [];
    $('a[href]').each((index, element) => {
      const href = $(element).attr('href');
      try {
        // Create absolute URL using URL constructor
        const resolved = new URL(href, baseUrl);
        links.push({
          text: $(element).text().trim(),
          url: resolved.href,
          // Compare parsed hostnames. A substring test on the full URL would
          // misclassify e.g. https://evil.com/?ref=example.com as internal.
          isExternal: resolved.hostname !== baseUrl.hostname
        });
      } catch (error) {
        // A single unparsable href should not abort the whole scrape.
        console.warn(`Invalid URL: ${href}`);
      }
    });
    return links;
  } catch (error) {
    console.error('Scraping failed:', error.message);
    return [];
  }
};
// Usage: log every link found on the products page.
scrapeWithAbsoluteUrls('https://example.com/products').then((links) => {
  console.log('Found links:', links);
});
Method 2: Legacy url.resolve() Approach
For older Node.js versions or existing codebases:
const cheerio = require('cheerio');
const axios = require('axios');
const url = require('url');
/**
 * Rewrites the href of anchors and the src of images in an HTML string so
 * that values not already starting with http:// or https:// are resolved
 * against a base URL using the legacy url.resolve().
 *
 * @param {string} html - Markup to load with cheerio.
 * @param {string} baseUrl - Base URL passed to url.resolve().
 * @returns {string} The serialized document after rewriting.
 */
const convertRelativeUrls = (html, baseUrl) => {
  const $ = cheerio.load(html);
  // [selector, attribute] pairs, processed in order: links first, then images.
  const targets = [
    ['a[href]', 'href'],
    ['img[src]', 'src']
  ];
  for (const [selector, attribute] of targets) {
    $(selector).each((index, element) => {
      const node = $(element);
      const value = node.attr(attribute);
      if (!value || isAbsoluteUrl(value)) {
        return;
      }
      node.attr(attribute, url.resolve(baseUrl, value));
    });
  }
  return $.html();
};
/**
 * Reports whether a string starts with an http:// or https:// scheme.
 *
 * URL schemes are case-insensitive, so "HTTP://..." is accepted as well.
 * Other schemes (mailto:, ftp:) and protocol-relative URLs ("//host/path")
 * are reported as not absolute.
 *
 * @param {string} urlString - Candidate URL.
 * @returns {boolean} True when the string begins with http:// or https://.
 */
const isAbsoluteUrl = (urlString) => {
  return /^https?:\/\//i.test(urlString);
};
Method 3: Comprehensive Resource Handler
For handling multiple resource types (images, stylesheets, scripts):
/**
 * Fetches a page and collects its links, images, stylesheets and scripts,
 * each with its URL resolved against the page URL.
 *
 * @param {string} targetUrl - URL of the page to fetch; also the base for
 *   resolving relative resource URLs.
 * @returns {Promise<{links: Array, images: Array, stylesheets: Array, scripts: Array}>}
 *   Per-type arrays of `{ original, absolute, element }`, where `element` is
 *   the serialized markup of the matching element.
 *   Errors from the HTTP request are not caught here and reject the promise.
 */
const handleAllResources = async (targetUrl) => {
  const response = await axios.get(targetUrl);
  const $ = cheerio.load(response.data);
  const baseUrl = new URL(targetUrl);
  // Each resource type carries its own URL attribute: <a> and <link> use
  // href, while <img> and <script> use src.
  const resourceTypes = {
    links: { selector: 'a[href]', attribute: 'href' },
    images: { selector: 'img[src]', attribute: 'src' },
    stylesheets: { selector: 'link[rel="stylesheet"][href]', attribute: 'href' },
    scripts: { selector: 'script[src]', attribute: 'src' }
  };
  const resources = {
    links: [],
    images: [],
    stylesheets: [],
    scripts: []
  };
  // Process different resource types
  for (const [type, { selector, attribute }] of Object.entries(resourceTypes)) {
    $(selector).each((index, element) => {
      const original = $(element).attr(attribute);
      if (!original) {
        return;
      }
      try {
        const absolute = new URL(original, baseUrl).href;
        resources[type].push({
          original,
          absolute,
          element: $(element).toString()
        });
      } catch (error) {
        // Skip values the URL parser rejects; keep processing the rest.
        console.warn(`Invalid ${type} URL: ${original}`);
      }
    });
  }
  return resources;
};
Method 4: Utility Class for Complex Projects
For larger projects, create a reusable utility:
/**
 * Resolves and classifies URLs relative to a fixed base URL.
 */
class UrlHandler {
  /**
   * @param {string|URL} baseUrl - Absolute base URL.
   * @throws {TypeError} When baseUrl cannot be parsed by the URL constructor.
   */
  constructor(baseUrl) {
    this.baseUrl = new URL(baseUrl);
  }

  /**
   * Resolves a URL against the base.
   *
   * @param {string} relativeUrl - Relative or absolute URL.
   * @returns {string} The resolved absolute URL.
   * @throws {Error} When the URL cannot be parsed; the parser's error is
   *   attached as `cause`.
   */
  toAbsolute(relativeUrl) {
    try {
      return new URL(relativeUrl, this.baseUrl).href;
    } catch (error) {
      throw new Error(`Cannot convert URL: ${relativeUrl}`, { cause: error });
    }
  }

  /**
   * @param {string} url - Relative or absolute URL.
   * @returns {boolean} True when the resolved hostname differs from the base
   *   hostname; false when it matches or the URL cannot be parsed.
   */
  isExternal(url) {
    try {
      const urlObj = new URL(url, this.baseUrl);
      return urlObj.hostname !== this.baseUrl.hostname;
    } catch (error) {
      // Unparsable URLs are treated as not external.
      return false;
    }
  }

  /**
   * @param {string} url - Relative or absolute URL.
   * @returns {boolean} The negation of isExternal(url).
   */
  isSameDomain(url) {
    return !this.isExternal(url);
  }

  /**
   * Rewrites an element's URL attribute in place to its absolute form.
   *
   * @param {Function} $ - Cheerio instance used to wrap the element.
   * @param {*} element - Element to process.
   * @param {string} [attribute='href'] - Attribute holding the URL.
   * @returns {string|undefined} The absolute URL written to the element, or
   *   the untouched original value when it is empty, missing, or already
   *   http(s)-absolute.
   * @throws {Error} Propagated from toAbsolute for unparsable values.
   */
  processCheerioElement($, element, attribute = 'href') {
    const originalUrl = $(element).attr(attribute);
    if (originalUrl && !this.isAbsolute(originalUrl)) {
      const absoluteUrl = this.toAbsolute(originalUrl);
      $(element).attr(attribute, absoluteUrl);
      return absoluteUrl;
    }
    return originalUrl;
  }

  /**
   * @param {string} url - Candidate URL.
   * @returns {boolean} True when the string starts with http:// or https://
   *   (matched case-insensitively, since URL schemes are case-insensitive).
   */
  isAbsolute(url) {
    return /^https?:\/\//i.test(url);
  }
}
// Usage
// Wrapped in an async function: top-level `await` is only valid in ES
// modules, and the require() calls used throughout imply CommonJS.
const processCategoryLinks = async () => {
  const urlHandler = new UrlHandler('https://example.com/products/');
  const response = await axios.get('https://example.com/products/category-1');
  const $ = cheerio.load(response.data);
  $('a[href]').each((index, element) => {
    const absoluteUrl = urlHandler.processCheerioElement($, element, 'href');
    console.log('Processed URL:', absoluteUrl);
  });
};

// Handle rejections (failed request, unparsable href) instead of leaving the
// promise floating.
processCategoryLinks().catch((error) => {
  console.error('Processing failed:', error.message);
});
Common Edge Cases and Solutions
Handling Base Tags
/**
 * Determines the base URL for resolving a page's relative URLs, taking a
 * <base href> tag into account.
 *
 * @param {Function} $ - Cheerio instance for the loaded page.
 * @param {string} originalUrl - URL the page was fetched from.
 * @returns {string} The base tag's href resolved against originalUrl, or
 *   originalUrl itself when no non-empty base href is present.
 */
const getBaseUrl = ($, originalUrl) => {
  const declaredBase = $('base[href]').attr('href');
  return declaredBase ? new URL(declaredBase, originalUrl).href : originalUrl;
};
Protocol-Relative URLs
/**
 * Prefixes a protocol-relative URL ("//host/path") with the protocol of the
 * base URL.
 *
 * @param {string} url - URL to inspect.
 * @param {string} baseUrl - Absolute URL whose protocol is borrowed; only
 *   parsed when `url` starts with "//".
 * @returns {string} The URL with a protocol prepended, or `url` unchanged
 *   when it does not start with "//".
 */
const handleProtocolRelative = (url, baseUrl) => {
  if (!url.startsWith('//')) {
    return url;
  }
  const { protocol } = new URL(baseUrl);
  return protocol + url;
};
Hash and Query Parameters
/**
 * Carries the fragment ("#...") of the original URL over to the converted
 * URL.
 *
 * Any fragment already present on the converted URL is replaced rather than
 * duplicated, and the original fragment is kept whole even when it contains
 * further "#" characters.
 *
 * @param {string} originalUrl - URL as found in the document.
 * @param {string} convertedUrl - Absolute form of that URL.
 * @param {boolean} [preserveHash=true] - When false, convertedUrl is
 *   returned untouched.
 * @returns {string} convertedUrl carrying originalUrl's fragment, or
 *   convertedUrl unchanged when there is nothing to preserve.
 */
const preserveFragments = (originalUrl, convertedUrl, preserveHash = true) => {
  if (!preserveHash) {
    return convertedUrl;
  }
  const hashIndex = originalUrl.indexOf('#');
  if (hashIndex === -1) {
    return convertedUrl;
  }
  // Everything from the first "#" onward, including the "#" itself.
  const fragment = originalUrl.slice(hashIndex);
  // Drop a fragment the conversion may already have carried over, so the
  // result never ends up with "#top#top".
  const convertedHashIndex = convertedUrl.indexOf('#');
  const withoutFragment =
    convertedHashIndex === -1 ? convertedUrl : convertedUrl.slice(0, convertedHashIndex);
  return `${withoutFragment}${fragment}`;
};
Best Practices
- Always validate URLs before processing to avoid errors
- Use try-catch blocks when converting URLs
- Consider base tags in HTML that might change the base URL
- Handle different protocols (http, https, ftp, mailto)
- Preserve fragments and query parameters when needed
- Cache base URLs for performance in large scraping operations
Error Handling Example
/**
 * Converts a possibly-relative URL to an absolute one without throwing.
 *
 * Anchor-only ("#section") and protocol-relative ("//host/path") inputs are
 * resolved by the URL constructor like every other relative form. It
 * replaces a fragment already on the base instead of appending a second one,
 * and it validates the result.
 *
 * @param {string} relativeUrl - URL as found in the document.
 * @param {string|URL} baseUrl - Absolute URL to resolve against.
 * @returns {string|null} The absolute URL, or null when relativeUrl is
 *   empty/blank/missing or the conversion fails (a warning is logged in the
 *   failure case).
 */
const safeUrlConversion = (relativeUrl, baseUrl) => {
  try {
    // Handle empty or null URLs
    if (!relativeUrl || relativeUrl.trim() === '') {
      return null;
    }
    // Convert relative to absolute
    return new URL(relativeUrl, baseUrl).href;
  } catch (error) {
    console.warn(`URL conversion failed for "${relativeUrl}":`, error.message);
    return null;
  }
};
By using these approaches, you can reliably handle all types of relative URLs in your Cheerio-based web scraping projects while maintaining code clarity and error resilience.