How do I handle redirects while scraping with JavaScript?

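When scraping the web with JavaScript, handling redirects correctly is essential, because short links, moved pages, and login gates all answer with a redirect instead of the content itself. An HTTP redirect occurs when a server responds with a 3xx status code (most commonly 301, 302, 303, 307, or 308) and a Location header that instructs the client to fetch the content from a different URL.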

Understanding HTTP Redirects

Common redirect status codes:

- 301 - Moved Permanently (permanent redirect)
- 302 - Found (temporary redirect)
- 303 - See Other (temporary redirect; the follow-up request uses GET)
- 307 - Temporary Redirect (preserves request method)
- 308 - Permanent Redirect (preserves request method)

Most HTTP libraries handle redirects automatically, but you may need manual control for logging, custom logic, or security reasons.

1. Using Axios (Node.js)

Axios follows redirects automatically by default, making it ideal for most scraping scenarios.

Automatic Redirect Following

const axios = require('axios');

/**
 * Fetches a page with Axios and lets Axios follow redirects automatically.
 *
 * The final URL and the redirect count are diagnostic only. They are read from
 * fields of the underlying request object that are not part of Axios' public
 * response shape, so they are looked up defensively: when they are absent the
 * request still succeeds and the log falls back to the requested URL.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<*>} The response body (`response.data`).
 * @throws Rethrows whatever `axios.get` rejects with, after logging it.
 */
async function scrapeWithRedirects(url) {
  try {
    const response = await axios.get(url, {
      maxRedirects: 5, // Limit redirect chain length
      timeout: 10000,  // 10 second timeout
      validateStatus: (status) => status >= 200 && status < 400
    });

    // Optional chaining keeps a missing internal field from turning a
    // successful request into a "Scraping failed" TypeError.
    const { request } = response;
    const finalUrl = request?.res?.responseUrl ?? url;
    const redirectCount = request?._redirectable?._redirectCount ?? 'unknown';

    console.log('Final URL:', finalUrl);
    console.log('Status:', response.status);
    console.log('Redirect count:', redirectCount);

    return response.data;
  } catch (error) {
    console.error('Scraping failed:', error.message);
    throw error;
  }
}

// Usage: scrapeWithRedirects logs and rethrows on failure, so terminate the
// promise here; otherwise a failed request becomes an unhandled rejection.
scrapeWithRedirects('https://bit.ly/example-link').catch(() => {
  process.exitCode = 1;
});

Manual Redirect Handling

const axios = require('axios');

/**
 * Follows HTTP redirects one hop at a time with Axios, recording every URL
 * visited along the way.
 *
 * Automatic redirect following is disabled and 3xx statuses are accepted by
 * `validateStatus`, so a redirect arrives as a normal resolved response and is
 * inspected directly instead of being fished out of a thrown error.
 *
 * @param {string} url - Address to start from.
 * @param {number} [maxRedirects=5] - Maximum number of redirects to follow.
 * @returns {Promise<{data: *, finalUrl: string, redirects: string[]}>} Body of
 *   the final response, the URL it came from, and the full chain of URLs
 *   (starting with `url`).
 * @throws {Error} If a redirect response has no Location header, or if more
 *   than `maxRedirects` redirects are encountered. Errors from `axios.get`
 *   (network failures, statuses outside 200-399) propagate unchanged.
 */
async function handleRedirectsManually(url, maxRedirects = 5) {
  // Statuses whose Location header names the next hop. Other 3xx codes
  // (e.g. 304 Not Modified) are returned to the caller as-is.
  const redirectStatuses = new Set([301, 302, 303, 307, 308]);

  let currentUrl = url;
  let redirectCount = 0;
  const redirectChain = [url];

  // `<=` so the URL reached after the last permitted redirect is still
  // requested; the limit only trips on redirect number maxRedirects + 1.
  while (redirectCount <= maxRedirects) {
    const response = await axios.get(currentUrl, {
      maxRedirects: 0, // Disable automatic redirects
      validateStatus: (status) => status >= 200 && status < 400
    });

    if (!redirectStatuses.has(response.status)) {
      // Success - no redirect
      console.log('Redirect chain:', redirectChain);
      return { data: response.data, finalUrl: currentUrl, redirects: redirectChain };
    }

    const redirectUrl = response.headers?.location;
    if (!redirectUrl) {
      // Without this guard `new URL(undefined, base)` would resolve to ".../undefined".
      throw new Error('Redirect response missing Location header');
    }
    console.log(`Redirect ${redirectCount + 1}: ${currentUrl} -> ${redirectUrl}`);

    currentUrl = new URL(redirectUrl, currentUrl).href; // Handle relative URLs
    redirectChain.push(currentUrl);
    redirectCount++;
  }

  throw new Error(`Too many redirects (>${maxRedirects})`);
}

2. Using Fetch API (Browser & Node.js)

The Fetch API is available in modern browsers and Node.js (18+), offering native redirect handling.

Automatic Redirect Following

/**
 * Downloads a page with the Fetch API, following redirects automatically.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<{html: string, finalUrl: string, wasRedirected: boolean}>}
 *   The response text, the URL reported by the response, and whether the
 *   response reports having been redirected.
 * @throws Rethrows any error from `fetch` or from reading the body, after
 *   logging its message.
 */
async function scrapeWithFetch(url) {
  // 'follow' is the default. The other accepted values are:
  //   'error'  - Throw error on redirect
  //   'manual' - Handle redirects manually
  const requestInit = { redirect: 'follow' };

  try {
    const page = await fetch(url, requestInit);

    if (page.redirected) {
      console.log('Original URL:', url);
      console.log('Final URL:', page.url);
    }

    const html = await page.text();
    const { url: finalUrl, redirected: wasRedirected } = page;
    return { html, finalUrl, wasRedirected };
  } catch (failure) {
    console.error('Fetch failed:', failure.message);
    throw failure;
  }
}

// Usage
scrapeWithFetch('https://example.com/redirect-page')
  .then((result) => {
    console.log('Scraped successfully:', result.finalUrl);
  })
  .catch((error) => {
    // Terminate the chain so a failed scrape is reported here instead of
    // surfacing as an unhandled promise rejection.
    console.error('Scrape did not complete:', error.message);
  });

Manual Redirect Handling

/**
 * Follows HTTP redirects one hop at a time with the Fetch API, recording every
 * URL visited along the way.
 *
 * @param {string} url - Address to start from.
 * @param {number} [maxRedirects=5] - Maximum number of redirects to follow.
 * @returns {Promise<{html: string, finalUrl: string, redirects: string[], status: number}>}
 *   Text and status of the first non-3xx response (success or error), the URL
 *   it came from, and the full chain of URLs (starting with `url`).
 * @throws {Error} If the response is an opaque redirect, if a 3xx response has
 *   no Location header, or if more than `maxRedirects` redirects are
 *   encountered. Errors from `fetch` propagate unchanged.
 */
async function handleFetchRedirectsManually(url, maxRedirects = 5) {
  let currentUrl = url;
  let redirectCount = 0;
  const redirectChain = [url];

  // `<=` so the URL reached after the last permitted redirect is still
  // requested; the limit only trips on redirect number maxRedirects + 1.
  while (redirectCount <= maxRedirects) {
    const response = await fetch(currentUrl, {
      redirect: 'manual' // Don't follow redirects automatically
    });

    if (response.type === 'opaqueredirect') {
      // Browsers answer a manual redirect with an opaque response whose status
      // and headers are hidden from scripts, so the target cannot be read.
      throw new Error('Cannot access redirect location due to CORS policy');
    }

    const isRedirect = response.status >= 300 && response.status < 400;
    if (!isRedirect) {
      // Success or error
      const html = await response.text();
      return {
        html,
        finalUrl: currentUrl,
        redirects: redirectChain,
        status: response.status
      };
    }

    const redirectUrl = response.headers.get('location');
    // The redirect body is never read; cancel it so the connection is released.
    await response.body?.cancel();
    if (!redirectUrl) {
      throw new Error('Redirect response missing Location header');
    }

    currentUrl = new URL(redirectUrl, currentUrl).href;
    redirectChain.push(currentUrl);
    redirectCount++;
    console.log(`Redirect ${redirectCount}: -> ${currentUrl}`);
  }

  throw new Error(`Too many redirects (>${maxRedirects})`);
}

3. Using Other HTTP Clients

Node-fetch (Legacy)

const fetch = require('node-fetch');

/**
 * Retrieves a page with node-fetch, following redirects, and logs where the
 * request ended up.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<string>} The response body as text.
 */
async function scrapeWithNodeFetch(url) {
  const requestOptions = {
    follow: 20,     // Maximum redirects to follow
    compress: true, // Enable gzip compression
    timeout: 10000  // Request timeout
  };

  const page = await fetch(url, requestOptions);
  const { url: finalUrl, redirected } = page;

  console.log('Final URL:', finalUrl);
  console.log('Redirected:', redirected);

  const body = await page.text();
  return body;
}

Got Library

const got = require('got');

/**
 * Retrieves a page with Got, following redirects, and logs the redirect chain
 * reported on the response.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<*>} The response body (`response.body`).
 * @throws Rethrows any error; when the error carries a `response`, its
 *   `statusCode` is logged first.
 */
async function scrapeWithGot(url) {
  const gotOptions = {
    followRedirect: true,
    maxRedirects: 10,
    timeout: { request: 10000 }
  };

  try {
    const page = await got(url, gotOptions);
    const hops = page.redirectUrls;

    console.log('Final URL:', page.url);
    console.log('Redirect count:', hops.length);
    console.log('Redirect chain:', hops);

    return page.body;
  } catch (failure) {
    const failedResponse = failure.response;
    if (failedResponse) {
      console.log('HTTP Error:', failedResponse.statusCode);
    }
    throw failure;
  }
}

Best Practices

1. Handle Infinite Redirects

/**
 * Follows redirects manually with the Fetch API while guarding against both
 * redirect loops and overly long redirect chains.
 *
 * Every URL is recorded in a Set before it is requested; being redirected to a
 * URL that is already in the Set means the chain has looped.
 *
 * @param {string} url - Absolute URL to start from.
 * @param {number} [maxRedirects=10] - Maximum number of redirects to follow.
 * @returns {Promise<{html: string, finalUrl: string, redirects: string[], status: number}>}
 *   Text and status of the first response that is not a followable redirect,
 *   the URL it came from, and every URL visited in order.
 * @throws {Error} On a redirect loop, on an opaque redirect response, or when
 *   more than `maxRedirects` redirects are encountered. Errors from `fetch`
 *   and from parsing `url` propagate unchanged.
 */
async function safeRedirectHandler(url, maxRedirects = 10) {
  const visited = new Set();
  // Normalise up front so the start URL compares equal to a redirect that
  // points back at it (redirect targets are stored as normalised hrefs too).
  let currentUrl = new URL(url).href;
  let redirectCount = 0;

  // `<=` so the URL reached after the last permitted redirect is still requested.
  while (redirectCount <= maxRedirects) {
    if (visited.has(currentUrl)) {
      throw new Error('Infinite redirect loop detected');
    }
    visited.add(currentUrl);

    const response = await fetch(currentUrl, { redirect: 'manual' });

    if (response.type === 'opaqueredirect') {
      // Browsers hide the status and headers of a manual redirect.
      throw new Error('Redirect target is hidden by the browser (opaque redirect response)');
    }

    const nextLocation = response.headers.get('location');
    const isRedirect = response.status >= 300 && response.status < 400;
    if (!isRedirect || !nextLocation) {
      const html = await response.text();
      return {
        html,
        finalUrl: currentUrl,
        redirects: [...visited],
        status: response.status
      };
    }

    // The redirect body is never read; cancel it so the connection is released.
    await response.body?.cancel();
    currentUrl = new URL(nextLocation, currentUrl).href; // Handle relative URLs
    redirectCount++;
  }

  throw new Error(`Too many redirects (>${maxRedirects})`);
}

2. Preserve Request Context

/**
 * Requests a page with Axios using a fixed set of browser-like headers that
 * the caller can extend or override.
 *
 * @param {string} url - Address to request.
 * @param {Object} [options]
 * @param {Object} [options.headers={}] - Extra headers; these win over the
 *   built-in User-Agent and Accept values.
 * @param {string} [options.cookies=''] - Value for the Cookie header. When
 *   non-empty it replaces any Cookie entry supplied in `headers`.
 * @param {number} [options.maxRedirects=5] - Passed through to Axios.
 * @returns {Promise<*>} Whatever `axios.get` resolves with.
 */
async function scrapeWithContext(url, options = {}) {
  const { headers = {}, cookies = '', maxRedirects = 5 } = options;

  const defaultHeaders = {
    'User-Agent': 'Mozilla/5.0 (compatible; WebScraper/1.0)',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
  };

  // Spread order matters: caller headers override the defaults, and an
  // explicit cookie string overrides both.
  const requestHeaders = {
    ...defaultHeaders,
    ...headers,
    ...(cookies ? { 'Cookie': cookies } : {})
  };

  const response = await axios.get(url, {
    headers: requestHeaders,
    maxRedirects,
    timeout: 10000
  });
  return response;
}

3. Log Redirect Chains

/**
 * Creates an in-memory recorder for redirect hops.
 *
 * The methods refer to the logger object by name rather than through `this`,
 * so they keep working when detached from it, e.g. when `logger.logRedirect`
 * is passed as a callback or pulled out with destructuring.
 *
 * @returns {{
 *   redirects: Array<{from: string, to: string, status: number, timestamp: Date}>,
 *   logRedirect: function(string, string, number): void,
 *   getRedirectChain: function(): Array
 * }} A logger whose `redirects` array holds the recorded hops in order.
 */
function createRedirectLogger() {
  const logger = {
    redirects: [],
    // Records one hop and echoes it to the console.
    logRedirect(from, to, status) {
      logger.redirects.push({ from, to, status, timestamp: new Date() });
      console.log(`Redirect: ${from} -> ${to} (${status})`);
    },
    // Returns the live array of recorded hops (not a copy).
    getRedirectChain() {
      return logger.redirects;
    }
  };
  return logger;
}

Security Considerations

  1. Validate redirect URLs to prevent server-side request forgery (SSRF)
  2. Set reasonable redirect limits to avoid infinite loops
  3. Be cautious with sensitive headers that might leak during redirects
  4. Check final URLs to ensure they match expected domains

Common Issues

  • Browsers hide redirect locations from scripts: with redirect: 'manual', fetch returns an opaque redirect response whose status and headers cannot be read
  • Relative URLs in redirect headers need proper resolution
  • Authentication tokens may not persist across redirects
  • HTTPS to HTTP redirects may be blocked by browsers

Always respect robots.txt, rate limits, and website terms of service when implementing redirect handling in your web scraping projects.

Get Started Now

WebScraping.AI provides rotating proxies, Chromium rendering, and a built-in HTML parser for web scraping.
Icon