How can I select elements using CSS selectors in jsoup?

jsoup is a Java library for working with real-world HTML. It provides a convenient API for extracting and manipulating data using DOM, CSS, and jQuery-like methods. One of its most useful features is CSS selector support through the select() method.

Getting Started with jsoup

1. Add jsoup to Your Project

Maven (pom.xml):

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.17.2</version>
</dependency>

Gradle (build.gradle):

implementation 'org.jsoup:jsoup:1.17.2'

2. Parse HTML Content

import java.io.File;
import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

// Parse HTML string
String html = "<html><body><div class='content'><p>Hello World</p></div></body></html>";
Document doc = Jsoup.parse(html);

// Load from URL (get() throws IOException on network or HTTP errors)
// Each source gets its own variable name so the three examples can coexist in one scope.
Document remoteDoc = Jsoup.connect("https://example.com")
    .userAgent("Mozilla/5.0")
    .timeout(5000) // milliseconds
    .get();

// Parse from file (needs java.io.File; throws IOException if the file cannot be read)
Document fileDoc = Jsoup.parse(new File("example.html"), "UTF-8");

Basic CSS Selectors

Tag Selectors

// Select all paragraphs
Elements paragraphs = doc.select("p");

// Select all images
Elements images = doc.select("img");

// Select all form inputs
Elements inputs = doc.select("input");

Class Selectors

// Select elements with specific class
Elements content = doc.select(".content");
Elements highlights = doc.select(".highlight");

// Multiple classes (elements with both classes)
Elements items = doc.select(".item.active");

ID Selectors

// Select element by ID
Element header = doc.select("#header").first();
Element mainContent = doc.select("#main-content").first();

// Safe ID selection with null check
Element nav = doc.select("#navigation").first();
if (nav != null) {
    System.out.println("Navigation found: " + nav.text());
}

Attribute Selectors

// Elements with specific attribute
Elements links = doc.select("a[href]");
Elements required = doc.select("input[required]");

// Attribute with specific value
Elements externalLinks = doc.select("a[target=_blank]");
Elements emailInputs = doc.select("input[type=email]");

// Attribute contains value
Elements images = doc.select("img[src*='.jpg']");
Elements socialLinks = doc.select("a[href*='facebook.com']");

// Attribute starts with value
Elements httpsLinks = doc.select("a[href^='https://']");

// Attribute ends with value
Elements pdfLinks = doc.select("a[href$='.pdf']");

Advanced CSS Selectors

Hierarchical Selectors

// Descendant selector (any level)
Elements articleParagraphs = doc.select("article p");

// Direct child selector
Elements directChildren = doc.select("ul > li");

// Adjacent sibling selector
Elements nextElements = doc.select("h2 + p");

// General sibling selector
Elements siblings = doc.select("h2 ~ p");

Pseudo-selectors

// First and last elements
Element firstItem = doc.select("li:first-child").first();
Element lastItem = doc.select("li:last-child").first();

// Nth elements
Elements evenRows = doc.select("tr:nth-child(even)");
Elements oddRows = doc.select("tr:nth-child(odd)");
Element thirdItem = doc.select("li:nth-child(3)").first();

// Elements containing text
Elements newsItems = doc.select("div:contains(news)");
Elements matchingText = doc.select("p:containsOwn(exact text)");

// Empty elements
Elements emptyDivs = doc.select("div:empty");

Practical Examples

Web Scraping Example

/**
 * Fetches https://news.ycombinator.com and prints the text and link of every
 * element matching {@code a.storylink}, followed by the text of every
 * {@code span.score} element.
 */
public class WebScrapingExample {
    /**
     * @param args unused
     * @throws IOException if the page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        Document doc = Jsoup.connect("https://news.ycombinator.com")
            .userAgent("Mozilla/5.0")
            .get();

        // Extract news titles
        // NOTE(review): this selector depends on the site's current markup — confirm it still matches.
        Elements titles = doc.select("a.storylink");
        for (Element title : titles) {
            System.out.println("Title: " + title.text());
            // absUrl resolves relative hrefs against the document's base URI,
            // so the printed value is always a full URL rather than e.g. "item?id=1".
            System.out.println("URL: " + title.absUrl("href"));
        }

        // Extract user scores
        Elements scores = doc.select("span.score");
        for (Element score : scores) {
            System.out.println("Score: " + score.text());
        }
    }
}

Form Data Extraction

/**
 * Prints a summary of every {@code <form>} in the document: its {@code action}
 * attribute, the type/name/value of each nested {@code <input>}, and the text
 * and value of each {@code <option>} inside every nested {@code <select>}.
 *
 * @param doc parsed document to inspect
 */
public void extractFormData(Document doc) {
    for (Element formElement : doc.select("form")) {
        System.out.println("Form action: " + formElement.attr("action"));
        printInputs(formElement);
        printSelectOptions(formElement);
    }
}

/** Prints one line per {@code <input>} found inside the given form. */
private void printInputs(Element formElement) {
    for (Element field : formElement.select("input")) {
        String fieldType = field.attr("type");
        String fieldName = field.attr("name");
        String fieldValue = field.attr("value");
        System.out.printf("Input - Type: %s, Name: %s, Value: %s%n", fieldType, fieldName, fieldValue);
    }
}

/** Prints the options of every {@code <select>} found inside the given form. */
private void printSelectOptions(Element formElement) {
    for (Element dropdown : formElement.select("select")) {
        Elements choices = dropdown.select("option");
        System.out.println("Select options:");
        for (Element choice : choices) {
            System.out.println("  " + choice.text() + " = " + choice.attr("value"));
        }
    }
}

Table Data Extraction

/**
 * Prints every {@code <table>} in the document as tab-separated text: a
 * "Headers:" line, the {@code thead tr th} cells on one line, then one line
 * per {@code tbody tr} row containing its {@code td} cells.
 *
 * @param doc parsed document to inspect
 */
public void extractTableData(Document doc) {
    for (Element tableElement : doc.select("table")) {
        Elements headerCells = tableElement.select("thead tr th");
        System.out.println("Headers:");
        printTabSeparated(headerCells);

        for (Element bodyRow : tableElement.select("tbody tr")) {
            printTabSeparated(bodyRow.select("td"));
        }
    }
}

/** Prints each cell's text followed by a tab, then ends the line. */
private void printTabSeparated(Elements cells) {
    for (Element cell : cells) {
        System.out.print(cell.text() + "\t");
    }
    System.out.println();
}

Best Practices and Error Handling

Safe Element Access

// Always check for null when expecting single elements
Element element = doc.select("#myid").first();
if (element != null) {
    String text = element.text();
    String href = element.attr("href");
}

// Use isEmpty() for collections
Elements elements = doc.select(".myclass");
if (!elements.isEmpty()) {
    for (Element el : elements) {
        // Process elements
    }
}

// Get text with fallback
String title = doc.select("title").text();
if (title.isEmpty()) {
    title = "No title found";
}

Combining Selectors

// Complex selector combinations
Elements items = doc.select("div.article:has(img):contains(breaking news)");
Elements links = doc.select("a[href]:not([href^='mailto:'])");
Elements validInputs = doc.select("input[type=text]:not([disabled])");

Performance Tips

// Cache frequently used selections
Elements navigationLinks = doc.select("nav a");

// Use specific selectors to improve performance
Elements specificElements = doc.select("div.content > p.highlight");
// Better than: doc.select("p.highlight");

// Limit search scope when possible
Element contentDiv = doc.select("#content").first();
if (contentDiv != null) {
    Elements internalLinks = contentDiv.select("a");
}

CSS Selector Reference

| Selector | Description | Example |
|----------|-------------|---------|
| tag | Tag name | div, p, a |
| .class | Class name | .content, .highlight |
| #id | Element ID | #header, #main |
| [attr] | Has attribute | [href], [required] |
| [attr=value] | Attribute equals | [type=text] |
| [attr*=value] | Attribute contains | [href*=github] |
| [attr^=value] | Attribute starts with | [href^=https] |
| [attr$=value] | Attribute ends with | [href$=.pdf] |
| parent child | Descendant | div p |
| parent > child | Direct child | ul > li |
| prev + next | Adjacent sibling | h2 + p |
| prev ~ sibling | General sibling | h2 ~ p |
| :first-child | First child | li:first-child |
| :last-child | Last child | li:last-child |
| :nth-child(n) | Nth child | :nth-child(2) |
| :contains(text) | Contains text | :contains(news) |
| :empty | Empty element | div:empty |

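The select() method returns an Elements collection that you can iterate over directly. To access specific elements, use first(), last(), or get(index), and use isEmpty() to check whether anything matched. Note that first() and last() return null when the collection is empty, and get(index) throws if the index is out of range, so check before dereferencing.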

Table of contents