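When web scraping with JavaScript, handling redirects correctly is essential: shortened links, moved pages, login flows, and HTTP-to-HTTPS upgrades all answer with a redirect instead of the content you asked for. An HTTP redirect occurs when a server responds with a 3xx status code (most commonly 301, 302, 303, 307, or 308) and a `Location` header that instructs the client to fetch the content from a different URL.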
Understanding HTTP Redirects
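Common redirect status codes:
- 301 - Moved Permanently (permanent redirect; clients may switch POST to GET)
- 302 - Found (temporary redirect; clients may switch POST to GET)
- 303 - See Other (temporary redirect; the follow-up request always uses GET)
- 307 - Temporary Redirect (preserves request method and body)
- 308 - Permanent Redirect (preserves request method and body)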
Most HTTP libraries handle redirects automatically, but you may need manual control for logging, custom logic, or security reasons.
1. Using Axios (Node.js)
Axios follows redirects automatically by default, making it ideal for most scraping scenarios.
Automatic Redirect Following
const axios = require('axios');
/**
 * Fetches a page with Axios, letting Axios follow up to five redirects, and
 * logs where the request ended up.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<*>} The body (`response.data`) of the final response.
 * @throws Rethrows whatever `axios.get` rejects with, after logging its message.
 */
async function scrapeWithRedirects(url) {
  try {
    const response = await axios.get(url, {
      maxRedirects: 5, // Limit redirect chain length
      timeout: 10000, // 10 second timeout
      validateStatus: (status) => status >= 200 && status < 400,
    });

    // `request.res.responseUrl` and `request._redirectable._redirectCount` are
    // undocumented internals, not part of Axios's public response schema, so
    // they may be missing (other adapters, mocks, future versions). Read them
    // defensively so that diagnostics can never fail a successful request.
    const { request } = response;
    const finalUrl = request?.res?.responseUrl ?? request?.responseURL ?? url;
    const redirectCount = request?._redirectable?._redirectCount ?? 'unknown';

    console.log('Final URL:', finalUrl);
    console.log('Status:', response.status);
    console.log('Redirect count:', redirectCount);
    return response.data;
  } catch (error) {
    console.error('Scraping failed:', error.message);
    throw error;
  }
}
// Example usage. scrapeWithRedirects logs and then rethrows, so a handler is
// attached here to keep a failed request from becoming an unhandled promise
// rejection; the non-zero exit code still signals the failure to the shell.
scrapeWithRedirects('https://bit.ly/example-link').catch(() => {
  process.exitCode = 1;
});
Manual Redirect Handling
const axios = require('axios');
/**
 * Follows HTTP redirects one hop at a time with Axios, recording every URL
 * visited along the way.
 *
 * Automatic redirects are disabled (`maxRedirects: 0`). Because
 * `validateStatus` accepts every status below 400, a 3xx response resolves
 * normally, so redirects are detected by inspecting `response.status` rather
 * than by waiting for a rejection.
 *
 * @param {string} url - Starting address.
 * @param {number} [maxRedirects=5] - Maximum number of redirects to follow.
 * @returns {Promise<{data: *, finalUrl: string, redirects: string[]}>}
 *   Body of the final response, the URL it came from, and the full chain of
 *   URLs requested (starting with `url`).
 * @throws {Error} If more than `maxRedirects` redirects are encountered, or a
 *   redirect response has no Location header. Any other Axios error (for
 *   example a 4xx/5xx status or a network failure) is rethrown unchanged.
 */
async function handleRedirectsManually(url, maxRedirects = 5) {
  // Statuses that point to another URL through a Location header. Other 3xx
  // codes (e.g. 300, 304) are returned to the caller as the final response.
  const redirectStatuses = new Set([301, 302, 303, 307, 308]);
  const redirectChain = [url];
  let currentUrl = url;
  let redirectCount = 0;

  for (;;) {
    let response;
    try {
      response = await axios.get(currentUrl, {
        maxRedirects: 0, // Disable automatic redirects
        validateStatus: (status) => status >= 200 && status < 400,
      });
    } catch (error) {
      // Tolerate setups that reject on 3xx anyway: treat the attached
      // response exactly like a resolved one. Everything else is a failure.
      const status = error.response ? error.response.status : undefined;
      if (!(status >= 300 && status < 400)) {
        throw error;
      }
      response = error.response;
    }

    if (!redirectStatuses.has(response.status)) {
      // Success - no redirect
      console.log('Redirect chain:', redirectChain);
      return { data: response.data, finalUrl: currentUrl, redirects: redirectChain };
    }

    // Only fail once a redirect beyond the limit actually shows up, so a chain
    // of exactly `maxRedirects` hops still gets its final URL requested.
    if (redirectCount >= maxRedirects) {
      throw new Error(`Too many redirects (>${maxRedirects})`);
    }

    const redirectUrl = response.headers?.location;
    if (!redirectUrl) {
      throw new Error('Redirect response missing Location header');
    }

    console.log(`Redirect ${redirectCount + 1}: ${currentUrl} -> ${redirectUrl}`);
    currentUrl = new URL(redirectUrl, currentUrl).href; // Handle relative URLs
    redirectChain.push(currentUrl);
    redirectCount++;
  }
}
2. Using Fetch API (Browser & Node.js)
The Fetch API is available in modern browsers and Node.js (18+), offering native redirect handling.
Automatic Redirect Following
/**
 * Downloads a page with the Fetch API in `redirect: 'follow'` mode and reports
 * whether the response was reached through a redirect.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<{html: string, finalUrl: string, wasRedirected: boolean}>}
 *   The response text, the URL reported by the response, and its
 *   `redirected` flag.
 * @throws Rethrows any error from `fetch` or from reading the body, after
 *   logging its message.
 */
async function scrapeWithFetch(url) {
  // Alternatives to 'follow' (the default): 'error' makes a redirect fail the
  // request, 'manual' hands redirect handling to the caller.
  const fetchOptions = { redirect: 'follow' };

  try {
    const page = await fetch(url, fetchOptions);
    const wasRedirected = page.redirected;

    if (wasRedirected) {
      console.log('Original URL:', url);
      console.log('Final URL:', page.url);
    }

    const html = await page.text();
    return { html, finalUrl: page.url, wasRedirected };
  } catch (fetchError) {
    console.error('Fetch failed:', fetchError.message);
    throw fetchError;
  }
}
// Usage
scrapeWithFetch('https://example.com/redirect-page')
  .then((result) => {
    console.log('Scraped successfully:', result.finalUrl);
  })
  .catch((error) => {
    // scrapeWithFetch rethrows after logging; without a handler here the
    // rejection would go unhandled.
    console.error('Scrape did not complete:', error.message);
  });
Manual Redirect Handling
/**
 * Follows HTTP redirects one hop at a time with the Fetch API
 * (`redirect: 'manual'`), recording every URL visited.
 *
 * @param {string} url - Starting address.
 * @param {number} [maxRedirects=5] - Maximum number of redirects to follow.
 * @returns {Promise<{html: string, finalUrl: string, redirects: string[], status: number}>}
 *   Text and status of the first non-3xx response (success or error), the URL
 *   it came from, and the chain of URLs requested (starting with `url`).
 * @throws {Error} If the response is an opaque redirect, if a 3xx response
 *   has no Location header, or if more than `maxRedirects` redirects are
 *   encountered. Errors raised by `fetch` itself propagate unchanged.
 */
async function handleFetchRedirectsManually(url, maxRedirects = 5) {
  const redirectChain = [url];
  let currentUrl = url;
  let redirectCount = 0;

  for (;;) {
    const response = await fetch(currentUrl, {
      redirect: 'manual', // Don't follow redirects automatically
    });

    if (response.type === 'opaqueredirect') {
      // The status and headers of an opaque redirect response are not
      // readable, so the redirect target cannot be followed from here.
      throw new Error('Cannot access redirect location due to CORS policy');
    }

    const isRedirect = response.status >= 300 && response.status < 400;
    if (!isRedirect) {
      // Success or error
      const html = await response.text();
      return {
        html,
        finalUrl: currentUrl,
        redirects: redirectChain,
        status: response.status,
      };
    }

    // Only fail once a redirect beyond the limit actually shows up, so a chain
    // of exactly `maxRedirects` hops still gets its final URL requested.
    if (redirectCount >= maxRedirects) {
      throw new Error(`Too many redirects (>${maxRedirects})`);
    }

    const redirectUrl = response.headers.get('location');
    if (!redirectUrl) {
      throw new Error('Redirect response missing Location header');
    }

    currentUrl = new URL(redirectUrl, currentUrl).href; // Resolve relative targets
    redirectChain.push(currentUrl);
    redirectCount++;
    console.log(`Redirect ${redirectCount}: -> ${currentUrl}`);
  }
}
3. Using Other HTTP Clients
Node-fetch (Legacy)
const fetch = require('node-fetch');
/**
 * Downloads a page through the `fetch` binding in scope (node-fetch in this
 * snippet) and logs the final URL and whether a redirect happened.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<string>} The response body as text.
 */
async function scrapeWithNodeFetch(url) {
  const requestOptions = {
    follow: 20, // Upper bound on the number of redirects followed
    compress: true, // Turn on gzip compression
    timeout: 10000, // Request timeout
  };

  const reply = await fetch(url, requestOptions);
  const { url: finalUrl, redirected } = reply;

  console.log('Final URL:', finalUrl);
  console.log('Redirected:', redirected);

  const bodyText = await reply.text();
  return bodyText;
}
Got Library
const got = require('got');
/**
 * Downloads a page with Got, following redirects, and logs the redirect
 * chain reported on the response.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<*>} The `body` of the Got response.
 * @throws Rethrows any error unchanged; when the error carries a `response`,
 *   its `statusCode` is logged first.
 */
async function scrapeWithGot(url) {
  const gotOptions = {
    followRedirect: true,
    maxRedirects: 10,
    timeout: { request: 10000 },
  };

  try {
    const { url: finalUrl, redirectUrls, body } = await got(url, gotOptions);

    console.log('Final URL:', finalUrl);
    console.log('Redirect count:', redirectUrls.length);
    console.log('Redirect chain:', redirectUrls);

    return body;
  } catch (err) {
    const failedResponse = err.response;
    if (failedResponse) {
      console.log('HTTP Error:', failedResponse.statusCode);
    }
    throw err;
  }
}
Best Practices
1. Handle Infinite Redirects
/**
 * Skeleton for a redirect follower that refuses to visit the same URL twice.
 *
 * As written it only records the starting URL and leaves the loop at once;
 * the redirect-following step is a placeholder to be filled in.
 *
 * @param {string} url - Starting address.
 * @param {number} [maxRedirects=10] - Upper bound on loop iterations.
 * @returns {Promise<undefined>} Nothing is returned by the skeleton.
 * @throws {Error} If the current URL has already been visited.
 */
async function safeRedirectHandler(url, maxRedirects = 10) {
  const seenUrls = new Set();
  let nextUrl = url;
  let hops = 0;

  while (hops < maxRedirects) {
    const alreadySeen = seenUrls.has(nextUrl);
    if (alreadySeen) {
      throw new Error('Infinite redirect loop detected');
    }
    seenUrls.add(nextUrl);

    // Placeholder: request `nextUrl`, then update `nextUrl` and `hops` from
    // the redirect response.
    break; // Swap this for the real redirect handling
  }
}
2. Preserve Request Context
/**
 * Issues an Axios GET with default scraper headers, merged with
 * caller-supplied headers and an optional Cookie header.
 *
 * @param {string} url - Address to request.
 * @param {object} [options]
 * @param {object} [options.headers={}] - Extra headers; these override the defaults.
 * @param {string} [options.cookies=''] - Cookie header value; when non-empty it
 *   replaces any `Cookie` entry given in `options.headers`.
 * @param {number} [options.maxRedirects=5] - Passed to Axios as `maxRedirects`.
 * @returns {Promise<*>} Whatever `axios.get` resolves with.
 */
async function scrapeWithContext(url, options = {}) {
  const {
    headers: extraHeaders = {},
    cookies: cookieHeader = '',
    maxRedirects: redirectLimit = 5,
  } = options;

  const defaultHeaders = {
    'User-Agent': 'Mozilla/5.0 (compatible; WebScraper/1.0)',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  };

  // Later spreads win: caller headers override defaults, cookies override both.
  const mergedHeaders = {
    ...defaultHeaders,
    ...extraHeaders,
    ...(cookieHeader ? { Cookie: cookieHeader } : {}),
  };

  const requestConfig = {
    headers: mergedHeaders,
    maxRedirects: redirectLimit,
    timeout: 10000,
  };

  const response = await axios.get(url, requestConfig);
  return response;
}
3. Log Redirect Chains
/**
 * Builds a small in-memory recorder for redirect hops.
 *
 * @returns {{redirects: Array<object>, logRedirect: Function, getRedirectChain: Function}}
 *   An object whose `logRedirect(from, to, status)` appends a timestamped
 *   entry to `redirects` and prints it, and whose `getRedirectChain()`
 *   returns that same `redirects` array.
 */
function createRedirectLogger() {
  const logger = {
    redirects: [],

    // Store the hop first, then echo it to the console.
    logRedirect(from, to, status) {
      const entry = { from, to, status, timestamp: new Date() };
      this.redirects.push(entry);
      console.log(`Redirect: ${from} -> ${to} (${status})`);
    },

    // Hands back the live array, not a copy.
    getRedirectChain() {
      const { redirects } = this;
      return redirects;
    },
  };

  return logger;
}
Security Considerations
- Validate redirect URLs to prevent server-side request forgery (SSRF)
- Set reasonable redirect limits to avoid infinite loops
- Be cautious with sensitive headers that might leak during redirects
- Check final URLs to ensure they match expected domains
Common Issues
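- In browsers, `fetch` with `redirect: 'manual'` returns an opaque redirect response (`type: 'opaqueredirect'`, status 0), so page scripts cannot read the redirect's `Location` header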
- Relative URLs in redirect headers need proper resolution
- Authentication tokens may not persist across redirects: many HTTP clients drop the `Authorization` header when a redirect leads to a different origin
- HTTPS to HTTP redirects may be blocked by browsers
Always respect robots.txt, rate limits, and website terms of service when implementing redirect handling in your web scraping projects.