How do you extract all links from a webpage using Cheerio?

To extract all links from a webpage using Cheerio, you need to fetch the HTML content first, then use Cheerio's jQuery-like selectors to find anchor tags and extract their href attributes.

Installation

Install the required packages using npm:

npm install axios cheerio
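
If your project uses ES modules rather than CommonJS, the equivalent imports are shown below (Cheerio's documented ESM entry point is a namespace import):

import axios from 'axios';
import * as cheerio from 'cheerio';

The examples in this article use require(), but both module styles behave identically once the packages are loaded.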

Basic Link Extraction

Here's a simple function to extract all links from a webpage:

const axios = require('axios');
const cheerio = require('cheerio');

const extractLinks = async (url) => {
  try {
    const response = await axios.get(url);
    const $ = cheerio.load(response.data);

    const links = [];
    $('a[href]').each((index, element) => {
      const href = $(element).attr('href');
      if (href && href.trim()) {
        links.push(href);
      }
    });

    return links;
  } catch (error) {
    console.error('Error extracting links:', error.message);
    return [];
  }
};

// Usage
extractLinks('https://example.com')
  .then(links => console.log('Found links:', links));
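
This prints a flat array of raw href values, relative paths included. A hypothetical run might look like:

// Hypothetical output (values are illustrative):
// Found links: [ '/', '/about', 'https://github.com/cheeriojs/cheerio', '#top' ]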

Enhanced Link Extraction with Additional Data

Extract links along with their text content and other attributes:

const extractDetailedLinks = async (url) => {
  try {
    const response = await axios.get(url);
    const $ = cheerio.load(response.data);

    const links = [];
    $('a[href]').each((index, element) => {
      const $link = $(element);
      const href = $link.attr('href');

      if (href && href.trim()) {
        links.push({
          url: href,
          text: $link.text().trim(),
          title: $link.attr('title') || '',
          target: $link.attr('target') || ''
        });
      }
    });

    return links;
  } catch (error) {
    console.error('Error extracting detailed links:', error.message);
    return [];
  }
};
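
Each entry in the returned array is a plain object. A hypothetical entry for a single link:

// Hypothetical example of one entry returned by extractDetailedLinks:
// {
//   url: '/contact',
//   text: 'Contact us',
//   title: 'Go to the contact page',
//   target: '_blank'
// }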

Handling Relative URLs

Convert relative URLs to absolute URLs using the URL constructor:

const { URL } = require('url'); // WHATWG URL class; also available as a global in Node.js 10+

const extractAbsoluteLinks = async (baseUrl) => {
  try {
    const response = await axios.get(baseUrl);
    const $ = cheerio.load(response.data);

    const links = [];
    $('a[href]').each((index, element) => {
      const href = $(element).attr('href');

      if (href && href.trim()) {
        try {
          // Convert relative URLs to absolute
          const absoluteUrl = new URL(href, baseUrl).toString();
          links.push(absoluteUrl);
        } catch (urlError) {
          // Skip invalid URLs
          console.warn(`Invalid URL: ${href}`);
        }
      }
    });

    return links;
  } catch (error) {
    console.error('Error extracting absolute links:', error.message);
    return [];
  }
};
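
The URL constructor implements standard WHATWG resolution, so it handles every relative form you are likely to encounter:

// How new URL(href, base) resolves common href forms:
new URL('/about', 'https://example.com/blog/post').toString();
// => 'https://example.com/about' (root-relative)

new URL('next', 'https://example.com/blog/post').toString();
// => 'https://example.com/blog/next' (path-relative)

new URL('//cdn.example.com/app.js', 'https://example.com/').toString();
// => 'https://cdn.example.com/app.js' (protocol-relative)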

Filtering Links by Type

Classify links by scheme and hostname, separating internal, external, email, and phone links:

const extractFilteredLinks = async (url) => {
  try {
    const response = await axios.get(url);
    const $ = cheerio.load(response.data);
    const baseHostname = new URL(url).hostname; // parse the page URL once, outside the loop

    const result = {
      internal: [],
      external: [],
      email: [],
      phone: []
    };

    $('a[href]').each((index, element) => {
      const href = $(element).attr('href');

      if (!href || !href.trim()) return;

      if (href.startsWith('mailto:')) {
        result.email.push(href);
      } else if (href.startsWith('tel:')) {
        result.phone.push(href);
      } else {
        try {
          // Resolve the href against the page URL and compare hostnames;
          // this classifies relative and protocol-relative links correctly,
          // unlike a substring check on the raw href
          const hostname = new URL(href, url).hostname;
          if (!hostname || hostname === baseHostname) {
            result.internal.push(href);
          } else {
            result.external.push(href);
          }
        } catch (urlError) {
          result.internal.push(href); // unparseable hrefs stay in the internal bucket
        }
      }
    });

    return result;
  } catch (error) {
    console.error('Error filtering links:', error.message);
    return { internal: [], external: [], email: [], phone: [] };
  }
};
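
A hypothetical result for a page containing a mix of link types:

// Hypothetical output shape (values are illustrative):
// {
//   internal: ['/about', '#pricing'],
//   external: ['https://github.com/cheeriojs/cheerio'],
//   email: ['mailto:hello@example.com'],
//   phone: ['tel:+15555550123']
// }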

Complete Example with Error Handling

A production-ready function with comprehensive error handling:

const axios = require('axios');
const cheerio = require('cheerio');
const { URL } = require('url');

const scrapeLinks = async (targetUrl, options = {}) => {
  const {
    includeText = false,
    makeAbsolute = true,
    filterDuplicates = true,
    timeout = 10000
  } = options;

  try {
    const response = await axios.get(targetUrl, {
      timeout,
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; LinkExtractor/1.0)'
      }
    });

    const $ = cheerio.load(response.data);
    const links = new Set(); // Use Set to avoid duplicates
    const results = [];

    $('a[href]').each((index, element) => {
      const $element = $(element);
      const href = $element.attr('href');

      if (!href || !href.trim()) return;

      let processedUrl = href;

      // Convert to absolute URL if requested
      if (makeAbsolute) {
        try {
          processedUrl = new URL(href, targetUrl).toString();
        } catch (urlError) {
          console.warn(`Skipping invalid URL: ${href}`);
          return;
        }
      }

      // Skip duplicates if filtering is enabled
      if (filterDuplicates && links.has(processedUrl)) return;

      links.add(processedUrl);

      const linkData = { url: processedUrl };

      if (includeText) {
        linkData.text = $element.text().trim();
        linkData.title = $element.attr('title') || '';
      }

      results.push(linkData);
    });

    return results;
  } catch (error) {
    if (error.code === 'ENOTFOUND') {
      throw new Error(`Domain not found: ${targetUrl}`);
    } else if (error.code === 'ECONNREFUSED') {
      throw new Error(`Connection refused: ${targetUrl}`);
    } else if (error.response) {
      throw new Error(`HTTP ${error.response.status}: ${error.response.statusText}`);
    } else {
      throw new Error(`Failed to scrape links: ${error.message}`);
    }
  }
};

// Usage examples
(async () => {
  try {
    // Basic usage
    const basicLinks = await scrapeLinks('https://example.com');
    console.log('Basic links:', basicLinks);

    // With options
    const detailedLinks = await scrapeLinks('https://example.com', {
      includeText: true,
      makeAbsolute: true,
      filterDuplicates: true
    });
    console.log('Detailed links:', detailedLinks);

  } catch (error) {
    console.error('Scraping failed:', error.message);
  }
})();

Best Practices

  • Respect robots.txt: Always check the website's robots.txt file before scraping
  • Rate limiting: Add delays between requests to avoid overwhelming servers (see the sketch after this list)
  • User-Agent: Set a proper User-Agent header to identify your scraper
  • Error handling: Implement robust error handling for network issues and invalid URLs
  • URL validation: Always validate URLs before processing them
  • Duplicate filtering: Remove duplicate links to improve efficiency
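
To illustrate the rate-limiting point, here is a minimal sketch that visits several pages sequentially with a fixed delay. The sleep and scrapeMany helpers are hypothetical; scrapeLinks is the function from the complete example above:

const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

const scrapeMany = async (urls, delayMs = 1000) => {
  const results = {};
  for (const url of urls) {
    results[url] = await scrapeLinks(url); // one request at a time
    await sleep(delayMs);                  // pause before the next request
  }
  return results;
};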

Limitations

  • JavaScript-rendered content: Cheerio can't execute JavaScript, so it won't capture links added dynamically
  • Authentication: Doesn't handle login-protected pages automatically
  • CORS restrictions: May face CORS issues when running in browsers
  • Large pages: Memory usage can be high for very large HTML documents

For JavaScript-heavy websites, consider using Puppeteer or Playwright instead of Cheerio.
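
As a rough sketch of that alternative (it assumes npm install puppeteer), Puppeteer loads the page in a real browser, so links inserted by client-side JavaScript are present in the DOM before extraction:

const puppeteer = require('puppeteer');

const extractRenderedLinks = async (url) => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto(url, { waitUntil: 'networkidle0' }); // wait until network activity settles
  // In the browser, a.href is already an absolute URL, so no manual resolution is needed
  const links = await page.$$eval('a[href]', (anchors) => anchors.map((a) => a.href));
  await browser.close();
  return links;
};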
