Is there a way to extract all links from a webpage using jsoup?

Yes. jsoup, a Java HTML parser, makes this straightforward: select all anchor elements with a CSS selector (or via DOM traversal) and read their href attributes.

Basic Link Extraction

Here's a simple example to extract all links from a webpage:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class LinkExtractor {
    public static void main(String[] args) {
        try {
            String url = "https://example.com";

            // Fetch and parse the webpage
            Document document = Jsoup.connect(url).get();

            // Select all anchor tags with href attribute
            Elements links = document.select("a[href]");

            // Extract and print all links
            for (Element link : links) {
                String href = link.attr("abs:href");
                String text = link.text();
                System.out.println("URL: " + href + " | Text: " + text);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
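
Note the a[href] attribute filter in the selector: it skips anchor elements that have no href at all (for example, anchors used only as in-page named targets), so every matched element is guaranteed to carry a link.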

Advanced Link Extraction with Configuration

For production use, configure the connection (user agent, timeout, redirects) and collect additional metadata about each link:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class AdvancedLinkExtractor {

    public static List<LinkInfo> extractLinks(String url) throws IOException {
        List<LinkInfo> linksList = new ArrayList<>();

        // Configure connection with proper settings
        Document document = Jsoup.connect(url)
                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
                .timeout(10000)
                .followRedirects(true)
                .get();

        Elements links = document.select("a[href]");

        for (Element link : links) {
            String href = link.attr("abs:href");
            String text = link.text().trim();
            String title = link.attr("title");

            // Skip empty links and self-references back to the page itself
            if (!href.isEmpty() && !href.equals(url)) {
                linksList.add(new LinkInfo(href, text, title));
            }
        }

        return linksList;
    }

    // Helper class to store link information
    static class LinkInfo {
        String url;
        String text;
        String title;

        LinkInfo(String url, String text, String title) {
            this.url = url;
            this.text = text;
            this.title = title;
        }

        @Override
        public String toString() {
            return String.format("URL: %s | Text: %s | Title: %s", url, text, title);
        }
    }
}
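
Here is a minimal way to drive the extractor, assuming a main method is added to AdvancedLinkExtractor (the URL is a placeholder):

public static void main(String[] args) throws IOException {
    // Placeholder URL; substitute the page you want to scan
    for (LinkInfo link : extractLinks("https://example.com")) {
        System.out.println(link);
    }
}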

Filtering Links by Type

You can extract specific types of links using more targeted CSS selectors:

// Extract only external links; note that document.location() returns a
// String in jsoup, so parse it to get the host
String host = java.net.URI.create(document.location()).getHost();
Elements externalLinks = document.select("a[href^=http]:not([href*=" + host + "])");

// Extract only internal links
Elements internalLinks = document.select("a[href^=/], a[href^=" + 
    document.baseUri() + "]");

// Extract email links
Elements emailLinks = document.select("a[href^=mailto:]");

// Extract download links (common file types)
Elements downloadLinks = document.select("a[href$=.pdf], a[href$=.doc], " +
    "a[href$=.docx], a[href$=.zip], a[href$=.exe]");

// Extract links with specific classes
Elements navLinks = document.select("a.nav-link[href]");
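
Pages often repeat the same URL across multiple anchors. If you only care about distinct targets, one option (a sketch, continuing with the document variable from the snippets above) is to collect absolute URLs into an insertion-ordered set:

import java.util.LinkedHashSet;
import java.util.Set;

// Collect distinct absolute URLs, preserving document order
Set<String> uniqueUrls = new LinkedHashSet<>();
for (Element link : document.select("a[href]")) {
    String href = link.attr("abs:href");
    if (!href.isEmpty()) {
        uniqueUrls.add(href);
    }
}
uniqueUrls.forEach(System.out::println);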

Working with Local HTML Files

You can also extract links from local HTML files:

import java.io.File;

public void extractFromFile(String filePath) throws IOException {
    File htmlFile = new File(filePath);
    Document document = Jsoup.parse(htmlFile, "UTF-8");

    Elements links = document.select("a[href]");
    for (Element link : links) {
        // Without a base URI, attr("abs:href") resolves to an empty string,
        // so read the raw (possibly relative) href instead
        String href = link.attr("href");
        System.out.println("Link: " + href);
    }
}
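
If you need absolute URLs from a local file, pass a base URI as the third argument to Jsoup.parse; the base URI below is an assumed placeholder:

// Supplying a base URI lets jsoup resolve relative links in the file
Document document = Jsoup.parse(htmlFile, "UTF-8", "https://example.com/");
for (Element link : document.select("a[href]")) {
    System.out.println("Absolute link: " + link.attr("abs:href"));
}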

Error Handling and Best Practices

import org.jsoup.HttpStatusException;
import org.jsoup.UnsupportedMimeTypeException;

public class RobustLinkExtractor {

    public static void extractLinksWithErrorHandling(String url) {
        try {
            Document document = Jsoup.connect(url)
                    .userAgent("Mozilla/5.0 (compatible; LinkExtractor/1.0)")
                    .timeout(15000)
                    .maxBodySize(1024 * 1024) // 1MB limit
                    .get();

            Elements links = document.select("a[href]");
            System.out.println("Found " + links.size() + " links");

            for (Element link : links) {
                String href = link.attr("abs:href");
                if (isValidUrl(href)) {
                    System.out.println(href);
                }
            }

        } catch (HttpStatusException e) {
            System.err.println("HTTP error: " + e.getStatusCode() + " - " + e.getMessage());
        } catch (UnsupportedMimeTypeException e) {
            System.err.println("Unsupported content type: " + e.getMimeType());
        } catch (IOException e) {
            System.err.println("Connection error: " + e.getMessage());
        }
    }

    private static boolean isValidUrl(String url) {
        return url != null && !url.isEmpty() && 
               (url.startsWith("http://") || url.startsWith("https://"));
    }
}
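
Note that HttpStatusException and UnsupportedMimeTypeException both extend IOException, so they must be caught before the generic IOException handler; reversing the order would make the specific catch blocks unreachable and fail to compile.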

Key Points to Remember

  • Use abs:href to get absolute URLs instead of relative ones
  • Configure timeouts to prevent hanging connections
  • Set a user agent to avoid being blocked by websites
  • Handle exceptions properly for robust applications
  • Respect robots.txt and website terms of service
  • Add delays between requests when scraping multiple pages (see the sketch after this list)
  • Filter invalid links to avoid processing empty or malformed URLs
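
For the delay point, here is a minimal politeness sketch, assuming the extractLinks() helper from the advanced example above; the 2-second delay and the list of seed URLs are arbitrary placeholders:

import java.io.IOException;
import java.util.List;

// Crawl several pages with a pause between requests (sketch only)
public static void crawlPolitely(List<String> urls) throws InterruptedException {
    for (String url : urls) {
        try {
            extractLinks(url).forEach(System.out::println);
        } catch (IOException e) {
            System.err.println("Skipping " + url + ": " + e.getMessage());
        }
        Thread.sleep(2000); // arbitrary 2-second pause between requests
    }
}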

The attr("abs:href") method is particularly useful as it automatically resolves relative URLs to absolute ones based on the document's base URI, ensuring you get complete, usable links.
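
You can see the difference in a small self-contained snippet; the HTML fragment and base URI below are made up for illustration:

// Compare the raw href with the resolved abs:href
String html = "<a href='/about'>About</a>";
Document doc = Jsoup.parse(html, "https://example.com/");
Element link = doc.selectFirst("a[href]");
System.out.println(link.attr("href"));     // /about
System.out.println(link.attr("abs:href")); // https://example.com/about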
