How do you extract all links from a webpage using Cheerio?

To extract all links from a webpage using Cheerio, you need to fetch the HTML content first, then use Cheerio's jQuery-like selectors to find anchor tags and extract their href attributes.

Installation

Install the required packages using npm:

npm install axios cheerio
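
If your project uses ES modules rather than CommonJS, the equivalent imports are shown below (Cheerio's documented ESM entry point is a namespace import):

import axios from 'axios';
import * as cheerio from 'cheerio';

The examples in this article use require(), but both module styles behave identically once the packages are loaded.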

Basic Link Extraction

Here's a simple function to extract all links from a webpage:

const axios = require('axios');
const cheerio = require('cheerio');

const extractLinks = async (url) => {
  try {
    const response = await axios.get(url);
    const $ = cheerio.load(response.data);

    const links = [];
    $('a[href]').each((index, element) => {
      const href = $(element).attr('href');
      if (href && href.trim()) {
        links.push(href);
      }
    });

    return links;
  } catch (error) {
    console.error('Error extracting links:', error.message);
    return [];
  }
};

// Usage
extractLinks('https://example.com')
  .then(links => console.log('Found links:', links));
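
This prints a flat array of raw href values, relative paths included. A hypothetical run might look like:

// Hypothetical output (values are illustrative):
// Found links: [ '/', '/about', 'https://github.com/cheeriojs/cheerio', '#top' ]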

Enhanced Link Extraction with Additional Data

Extract links along with their text content and other attributes:

const extractDetailedLinks = async (url) => {
  try {
    const response = await axios.get(url);
    const $ = cheerio.load(response.data);

    const links = [];
    $('a[href]').each((index, element) => {
      const $link = $(element);
      const href = $link.attr('href');

      if (href && href.trim()) {
        links.push({
          url: href,
          text: $link.text().trim(),
          title: $link.attr('title') || '',
          target: $link.attr('target') || ''
        });
      }
    });

    return links;
  } catch (error) {
    console.error('Error extracting detailed links:', error.message);
    return [];
  }
};
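
Each entry in the returned array is a plain object. A hypothetical entry for a single link:

// Hypothetical example of one entry returned by extractDetailedLinks:
// {
//   url: '/contact',
//   text: 'Contact us',
//   title: 'Go to the contact page',
//   target: '_blank'
// }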

Handling Relative URLs

Convert relative URLs to absolute URLs using the URL constructor:

const { URL } = require('url'); // WHATWG URL class; also available as a global in Node.js 10+

const extractAbsoluteLinks = async (baseUrl) => {
  try {
    const response = await axios.get(baseUrl);
    const $ = cheerio.load(response.data);

    const links = [];
    $('a[href]').each((index, element) => {
      const href = $(element).attr('href');

      if (href && href.trim()) {
        try {
          // Convert relative URLs to absolute
          const absoluteUrl = new URL(href, baseUrl).toString();
          links.push(absoluteUrl);
        } catch (urlError) {
          // Skip invalid URLs
          console.warn(`Invalid URL: ${href}`);
        }
      }
    });

    return links;
  } catch (error) {
    console.error('Error extracting absolute links:', error.message);
    return [];
  }
};
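
The URL constructor implements standard WHATWG resolution, so it handles every relative form you are likely to encounter:

// How new URL(href, base) resolves common href forms:
new URL('/about', 'https://example.com/blog/post').toString();
// => 'https://example.com/about' (root-relative)

new URL('next', 'https://example.com/blog/post').toString();
// => 'https://example.com/blog/next' (path-relative)

new URL('//cdn.example.com/app.js', 'https://example.com/').toString();
// => 'https://cdn.example.com/app.js' (protocol-relative)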

Filtering Links by Type

Classify links by scheme and hostname, separating internal, external, email, and phone links:

const extractFilteredLinks = async (url) => {
  try {
    const response = await axios.get(url);
    const $ = cheerio.load(response.data);
    const baseHostname = new URL(url).hostname; // parse the page URL once, outside the loop

    const result = {
      internal: [],
      external: [],
      email: [],
      phone: []
    };

    $('a[href]').each((index, element) => {
      const href = $(element).attr('href');

      if (!href || !href.trim()) return;

      if (href.startsWith('mailto:')) {
        result.email.push(href);
      } else if (href.startsWith('tel:')) {
        result.phone.push(href);
      } else {
        try {
          // Resolve the href against the page URL and compare hostnames;
          // this classifies relative and protocol-relative links correctly,
          // unlike a substring check on the raw href
          const hostname = new URL(href, url).hostname;
          if (!hostname || hostname === baseHostname) {
            result.internal.push(href);
          } else {
            result.external.push(href);
          }
        } catch (urlError) {
          result.internal.push(href); // unparseable hrefs stay in the internal bucket
        }
      }
    });

    return result;
  } catch (error) {
    console.error('Error filtering links:', error.message);
    return { internal: [], external: [], email: [], phone: [] };
  }
};
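
A hypothetical result for a page containing a mix of link types:

// Hypothetical output shape (values are illustrative):
// {
//   internal: ['/about', '#pricing'],
//   external: ['https://github.com/cheeriojs/cheerio'],
//   email: ['mailto:hello@example.com'],
//   phone: ['tel:+15555550123']
// }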

Complete Example with Error Handling

A production-ready function with comprehensive error handling:

const axios = require('axios');
const cheerio = require('cheerio');
const { URL } = require('url');

const scrapeLinks = async (targetUrl, options = {}) => {
  const {
    includeText = false,
    makeAbsolute = true,
    filterDuplicates = true,
    timeout = 10000
  } = options;

  try {
    const response = await axios.get(targetUrl, {
      timeout,
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; LinkExtractor/1.0)'
      }
    });

    const $ = cheerio.load(response.data);
    const links = new Set(); // Use Set to avoid duplicates
    const results = [];

    $('a[href]').each((index, element) => {
      const $element = $(element);
      const href = $element.attr('href');

      if (!href || !href.trim()) return;

      let processedUrl = href;

      // Convert to absolute URL if requested
      if (makeAbsolute) {
        try {
          processedUrl = new URL(href, targetUrl).toString();
        } catch (urlError) {
          console.warn(`Skipping invalid URL: ${href}`);
          return;
        }
      }

      // Skip duplicates if filtering is enabled
      if (filterDuplicates && links.has(processedUrl)) return;

      links.add(processedUrl);

      const linkData = { url: processedUrl };

      if (includeText) {
        linkData.text = $element.text().trim();
        linkData.title = $element.attr('title') || '';
      }

      results.push(linkData);
    });

    return results;
  } catch (error) {
    if (error.code === 'ENOTFOUND') {
      throw new Error(`Domain not found: ${targetUrl}`);
    } else if (error.code === 'ECONNREFUSED') {
      throw new Error(`Connection refused: ${targetUrl}`);
    } else if (error.response) {
      throw new Error(`HTTP ${error.response.status}: ${error.response.statusText}`);
    } else {
      throw new Error(`Failed to scrape links: ${error.message}`);
    }
  }
};

// Usage examples
(async () => {
  try {
    // Basic usage
    const basicLinks = await scrapeLinks('https://example.com');
    console.log('Basic links:', basicLinks);

    // With options
    const detailedLinks = await scrapeLinks('https://example.com', {
      includeText: true,
      makeAbsolute: true,
      filterDuplicates: true
    });
    console.log('Detailed links:', detailedLinks);

  } catch (error) {
    console.error('Scraping failed:', error.message);
  }
})();

Best Practices

  • Respect robots.txt: Always check the website's robots.txt file before scraping
  • Rate limiting: Add delays between requests to avoid overwhelming servers (see the sketch after this list)
  • User-Agent: Set a proper User-Agent header to identify your scraper
  • Error handling: Implement robust error handling for network issues and invalid URLs
  • URL validation: Always validate URLs before processing them
  • Duplicate filtering: Remove duplicate links to improve efficiency
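
To illustrate the rate-limiting point, here is a minimal sketch that visits several pages sequentially with a fixed delay. The sleep and scrapeMany helpers are hypothetical; scrapeLinks is the function from the complete example above:

const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

const scrapeMany = async (urls, delayMs = 1000) => {
  const results = {};
  for (const url of urls) {
    results[url] = await scrapeLinks(url); // one request at a time
    await sleep(delayMs);                  // pause before the next request
  }
  return results;
};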

Limitations

  • JavaScript-rendered content: Cheerio can't execute JavaScript, so it won't capture links added dynamically
  • Authentication: Doesn't handle login-protected pages automatically
  • CORS restrictions: May face CORS issues when running in browsers
  • Large pages: Memory usage can be high for very large HTML documents

For JavaScript-heavy websites, consider using Puppeteer or Playwright instead of Cheerio.
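
As a rough sketch of that alternative (it assumes npm install puppeteer), Puppeteer loads the page in a real browser, so links inserted by client-side JavaScript are present in the DOM before extraction:

const puppeteer = require('puppeteer');

const extractRenderedLinks = async (url) => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto(url, { waitUntil: 'networkidle0' }); // wait until network activity settles
  // In the browser, a.href is already an absolute URL, so no manual resolution is needed
  const links = await page.$$eval('a[href]', (anchors) => anchors.map((a) => a.href));
  await browser.close();
  return links;
};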
