How can I combine jsoup with other Java libraries for advanced scraping tasks?
While jsoup excels at HTML parsing and manipulation, pairing it with other Java libraries yields robust, production-grade scraping applications. This guide walks through proven integration patterns for HTTP clients, persistence, JSON processing, concurrency, testing, resilience, and rate limiting.
HTTP Client Libraries Integration
Apache HttpClient with jsoup
Apache HttpClient provides advanced HTTP features that complement jsoup's parsing capabilities:
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
public class HttpClientJsoupIntegration {
public Document fetchAndParse(String url) throws Exception {
try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
HttpGet request = new HttpGet(url);
request.addHeader("User-Agent", "Mozilla/5.0 (compatible; Bot/1.0)");
request.addHeader("Accept", "text/html,application/xhtml+xml");
try (CloseableHttpResponse response = httpClient.execute(request)) {
String html = EntityUtils.toString(response.getEntity());
return Jsoup.parse(html, url);
}
}
}
// Advanced connection pooling
public CloseableHttpClient createPooledClient() {
return HttpClients.custom()
.setMaxConnTotal(100)
.setMaxConnPerRoute(20)
.build();
}
}
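The default-client example above opens a fresh connection pool on every call to fetchAndParse; for larger crawls, the pooled client from createPooledClient can be reused across requests. A minimal usage sketch, assuming the class above is on the classpath (the URLs are placeholders):
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.util.List;

public class PooledScrapeExample {
    public static void main(String[] args) throws Exception {
        HttpClientJsoupIntegration integration = new HttpClientJsoupIntegration();
        // One pooled client shared across all requests, closed when the crawl finishes
        try (CloseableHttpClient client = integration.createPooledClient()) {
            for (String url : List.of("https://example.com/page1", "https://example.com/page2")) {
                HttpGet request = new HttpGet(url);
                try (CloseableHttpResponse response = client.execute(request)) {
                    String html = EntityUtils.toString(response.getEntity());
                    // Passing the URL as base URI enables abs: attribute resolution later
                    Document doc = Jsoup.parse(html, url);
                    System.out.println(url + " -> " + doc.title());
                }
            }
        }
    }
}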
OkHttp Integration
OkHttp offers modern HTTP/2 support and efficient connection management:
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
public class OkHttpJsoupScraper {
private final OkHttpClient client;
public OkHttpJsoupScraper() {
this.client = new OkHttpClient.Builder()
.connectTimeout(10, TimeUnit.SECONDS)
.readTimeout(30, TimeUnit.SECONDS)
.build();
}
public Document scrapeWithRetry(String url, int maxRetries) throws IOException, InterruptedException {
for (int attempt = 0; attempt < maxRetries; attempt++) {
try {
Request request = new Request.Builder()
.url(url)
.header("User-Agent", "ScrapingBot/1.0")
.build();
try (Response response = client.newCall(request).execute()) {
if (response.isSuccessful()) {
String html = response.body().string();
return Jsoup.parse(html, url);
}
}
} catch (IOException e) {
if (attempt == maxRetries - 1) throw e;
Thread.sleep(1000L * (attempt + 1)); // Back off longer after each failed attempt
}
}
throw new IOException("Failed after " + maxRetries + " attempts");
}
}
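A short usage sketch of the retrying scraper; the URL is a placeholder:
import org.jsoup.nodes.Document;

public class OkHttpScrapeExample {
    public static void main(String[] args) throws Exception {
        OkHttpJsoupScraper scraper = new OkHttpJsoupScraper();
        // Up to 3 attempts, sleeping between failures before giving up
        Document doc = scraper.scrapeWithRetry("https://example.com/", 3);
        System.out.println(doc.title());
    }
}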
Database Integration Patterns
JPA/Hibernate Integration
Store scraped data efficiently using JPA entities:
import javax.persistence.*;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.math.BigDecimal;
import java.time.LocalDateTime;
import java.util.List;
import java.util.stream.Collectors;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
@Entity
@Table(name = "scraped_products")
public class Product {
@Id
@GeneratedValue(strategy = GenerationType.IDENTITY)
private Long id;
@Column(length = 500)
private String title;
@Column(precision = 10, scale = 2)
private BigDecimal price;
private String description;
private String imageUrl;
private LocalDateTime scrapedAt;
// Constructors, getters, setters
}
@Service
public class ProductScrapingService {
@Autowired
private ProductRepository productRepository;
public List<Product> scrapeAndSaveProducts(String url) throws IOException {
Document doc = Jsoup.connect(url).get();
Elements productElements = doc.select(".product-item");
List<Product> products = productElements.stream()
.map(this::parseProduct)
.collect(Collectors.toList());
return productRepository.saveAll(products);
}
private Product parseProduct(Element element) {
Product product = new Product();
product.setTitle(element.select(".product-title").text());
product.setPrice(parsePrice(element.select(".price").text()));
product.setDescription(element.select(".description").text());
product.setImageUrl(element.select("img").attr("abs:src"));
product.setScrapedAt(LocalDateTime.now());
return product;
}
private BigDecimal parsePrice(String priceText) {
// Extract numeric value from price text
String cleanPrice = priceText.replaceAll("[^\\d.]", "");
return cleanPrice.isEmpty() ? BigDecimal.ZERO : new BigDecimal(cleanPrice); // guard against listings without a parsable price
}
}
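The service assumes a Spring Data repository for persistence. A minimal sketch of what ProductRepository might look like; the derived query method is an illustrative addition, not something the service above requires:
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.stereotype.Repository;
import java.util.List;

@Repository
public interface ProductRepository extends JpaRepository<Product, Long> {
    // Derived query: handy for checking whether a listing was already scraped (illustrative)
    List<Product> findByTitleContainingIgnoreCase(String titleFragment);
}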
MongoDB Integration with Spring Data
For flexible document storage:
import org.springframework.data.mongodb.core.mapping.Document;
import org.springframework.data.mongodb.repository.MongoRepository;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.util.HashMap;
import java.util.Map;
import java.time.LocalDateTime;
import org.springframework.data.annotation.Id;
@Document(collection = "web_pages")
public class ScrapedPage {
@Id
private String id;
private String url;
private String title;
private Map<String, Object> extractedData;
private LocalDateTime timestamp;
// Constructor to create from jsoup Document
public ScrapedPage(org.jsoup.nodes.Document doc, String sourceUrl) {
this.url = sourceUrl;
this.title = doc.title();
this.extractedData = extractStructuredData(doc);
this.timestamp = LocalDateTime.now();
}
private Map<String, Object> extractStructuredData(org.jsoup.nodes.Document doc) {
Map<String, Object> data = new HashMap<>();
ObjectMapper objectMapper = new ObjectMapper();
// Extract meta tags
doc.select("meta[property^=og:]").forEach(meta -> {
String property = meta.attr("property").substring(3); // Remove "og:"
data.put(property, meta.attr("content"));
});
// Extract JSON-LD structured data
doc.select("script[type=application/ld+json]").forEach(script -> {
try {
JsonNode jsonNode = objectMapper.readTree(script.html());
data.put("structuredData", jsonNode);
} catch (Exception e) {
// Log parsing error
}
});
return data;
}
}
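To persist these documents, the class would typically be paired with a Spring Data MongoDB repository. A minimal sketch; ScrapedPageRepository and PageArchiveService are illustrative names:
import org.springframework.data.mongodb.repository.MongoRepository;
import org.springframework.stereotype.Service;
import org.jsoup.Jsoup;
import java.io.IOException;

interface ScrapedPageRepository extends MongoRepository<ScrapedPage, String> {
}

@Service
class PageArchiveService {
    private final ScrapedPageRepository repository;

    PageArchiveService(ScrapedPageRepository repository) {
        this.repository = repository;
    }

    // Fetch a page with jsoup, convert it to the Mongo document above, and store it
    public ScrapedPage archive(String url) throws IOException {
        org.jsoup.nodes.Document doc = Jsoup.connect(url).get();
        return repository.save(new ScrapedPage(doc, url));
    }
}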
JSON Processing Integration
Jackson Integration for API Responses
Combine jsoup with Jackson for comprehensive data extraction:
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.util.List;
import java.util.stream.Collectors;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
public class HybridScraper {
private final ObjectMapper objectMapper = new ObjectMapper();
public CombinedData scrapePageWithApi(String pageUrl, String apiUrl) throws Exception {
// Scrape HTML content with jsoup
Document doc = Jsoup.connect(pageUrl).get();
PageData pageData = extractPageData(doc);
// Fetch related API data
String apiResponse = fetchApiData(apiUrl);
ApiData apiData = objectMapper.readValue(apiResponse, ApiData.class);
return new CombinedData(pageData, apiData);
}
private PageData extractPageData(Document doc) {
return PageData.builder()
.title(doc.title())
.description(doc.select("meta[name=description]").attr("content"))
.links(doc.select("a[href]").stream()
.map(link -> link.attr("abs:href"))
.collect(Collectors.toList()))
.build();
}
// Extract JSON data embedded in HTML
public JsonNode extractEmbeddedJson(Document doc, String selector) throws Exception {
Element jsonElement = doc.selectFirst(selector);
if (jsonElement != null) {
return objectMapper.readTree(jsonElement.html());
}
return null;
}
private String fetchApiData(String apiUrl) throws IOException {
// Implementation to fetch API data
return Jsoup.connect(apiUrl)
.header("Accept", "application/json")
.ignoreContentType(true)
.execute()
.body();
}
}
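Many modern sites embed their page state as JSON inside a script element, which is where extractEmbeddedJson earns its keep. A usage sketch; the URL and the script#page-state selector are placeholders you would adapt to the target site:
import com.fasterxml.jackson.databind.JsonNode;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class EmbeddedJsonExample {
    public static void main(String[] args) throws Exception {
        HybridScraper scraper = new HybridScraper();
        Document doc = Jsoup.connect("https://example.com/product/123").get();
        // "script#page-state" is a placeholder selector; inspect the target site for the real element
        JsonNode state = scraper.extractEmbeddedJson(doc, "script#page-state");
        if (state != null) {
            System.out.println(state.toPrettyString());
        }
    }
}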
Concurrent Processing Libraries
CompletableFuture for Parallel Scraping
Scale your scraping operations with concurrent processing:
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.List;
import java.util.stream.Collectors;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
public class ConcurrentScraper {
private final ExecutorService executor = Executors.newFixedThreadPool(10);
public CompletableFuture<List<ScrapedData>> scrapeUrlsConcurrently(List<String> urls) {
List<CompletableFuture<ScrapedData>> futures = urls.stream()
.map(url -> CompletableFuture.supplyAsync(() -> scrapeUrl(url), executor))
.collect(Collectors.toList());
return CompletableFuture.allOf(futures.toArray(new CompletableFuture[0]))
.thenApply(v -> futures.stream()
.map(CompletableFuture::join)
.collect(Collectors.toList()));
}
private ScrapedData scrapeUrl(String url) {
try {
Document doc = Jsoup.connect(url)
.timeout(10000)
.userAgent("Mozilla/5.0 (compatible; Bot/1.0)")
.get();
return new ScrapedData(
url,
doc.title(),
doc.select("p").text(),
System.currentTimeMillis()
);
} catch (Exception e) {
return new ScrapedData(url, null, "Error: " + e.getMessage(), System.currentTimeMillis());
}
}
public void shutdown() {
executor.shutdown();
}
}
class ScrapedData {
private final String url;
private final String title;
private final String content;
private final long timestamp;
public ScrapedData(String url, String title, String content, long timestamp) {
this.url = url;
this.title = title;
this.content = content;
this.timestamp = timestamp;
}
// Getters
}
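A usage sketch that fans a small batch of placeholder URLs across the pool and shuts it down afterwards:
import java.util.List;

public class ConcurrentScrapeExample {
    public static void main(String[] args) {
        ConcurrentScraper scraper = new ConcurrentScraper();
        try {
            List<ScrapedData> results = scraper
                    .scrapeUrlsConcurrently(List.of(
                            "https://example.com/a",
                            "https://example.com/b",
                            "https://example.com/c"))
                    .join(); // block until every page has been fetched
            System.out.println("Scraped " + results.size() + " pages");
        } finally {
            scraper.shutdown(); // release the thread pool
        }
    }
}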
Testing Framework Integration
JUnit 5 with WireMock
Create robust tests for your scraping applications:
import com.github.tomakehurst.wiremock.WireMockServer;
import com.github.tomakehurst.wiremock.client.WireMock;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import static org.assertj.core.api.Assertions.assertThat;
import java.math.BigDecimal;
class JsoupScrapingTest {
private WireMockServer wireMockServer;
@BeforeEach
void setUp() {
wireMockServer = new WireMockServer(8089);
wireMockServer.start();
WireMock.configureFor("localhost", 8089);
}
@Test
void testProductScraping() throws Exception {
// Mock HTML response
String mockHtml = """
<html>
<body>
<div class="product">
<h2 class="title">Test Product</h2>
<span class="price">$29.99</span>
<p class="description">Product description</p>
</div>
</body>
</html>
""";
WireMock.stubFor(WireMock.get(WireMock.urlEqualTo("/products"))
.willReturn(WireMock.aResponse()
.withStatus(200)
.withHeader("Content-Type", "text/html")
.withBody(mockHtml)));
// Test scraping
ProductScraper scraper = new ProductScraper();
Product product = scraper.scrapeProduct("http://localhost:8089/products");
assertThat(product.getTitle()).isEqualTo("Test Product");
assertThat(product.getPrice()).isEqualTo(new BigDecimal("29.99"));
}
@AfterEach
void tearDown() {
wireMockServer.stop();
}
}
Configuration and Dependency Management
Maven Dependencies
Include these dependencies in your pom.xml:
<dependencies>
<!-- Core jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.17.2</version>
</dependency>
<!-- HTTP Clients -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.14</version>
</dependency>
<dependency>
<groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp</artifactId>
<version>4.12.0</version>
</dependency>
<!-- JSON Processing -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.16.1</version>
</dependency>
<!-- Database -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-jpa</artifactId>
</dependency>
<!-- Testing -->
<dependency>
<groupId>com.github.tomakehurst</groupId>
<artifactId>wiremock-jre8</artifactId>
<version>2.35.0</version>
<scope>test</scope>
</dependency>
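<!-- Additional libraries used later in this guide: spring-retry (ResilientScraper),
     MongoDB storage (ScrapedPage), and Bucket4j (RateLimitedScraper).
     Versions are illustrative; Bucket4j's group id differs between major releases, so verify
     the coordinates for the release you use. -->
<dependency>
<groupId>org.springframework.retry</groupId>
<artifactId>spring-retry</artifactId>
<!-- version managed by the Spring Boot parent; declare one explicitly otherwise -->
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-mongodb</artifactId>
</dependency>
<dependency>
<groupId>com.bucket4j</groupId>
<artifactId>bucket4j-core</artifactId>
<version>8.7.0</version>
</dependency>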
</dependencies>
Best Practices for Library Integration
Error Handling and Resilience
Implement comprehensive error handling across library boundaries:
import org.springframework.retry.annotation.Retryable;
import org.springframework.retry.annotation.Backoff;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
import java.util.Optional;
public class ResilientScraper {
// Retry on ScrapingException: the IOException is wrapped before it can propagate,
// so retrying on IOException here would never trigger.
@Retryable(value = {ScrapingException.class}, maxAttempts = 3, backoff = @Backoff(delay = 1000))
public Optional<Document> scrapeWithResilience(String url) {
try {
return Optional.of(Jsoup.connect(url)
.timeout(5000)
.get());
} catch (IOException e) {
throw new ScrapingException("Failed to scrape: " + url, e);
}
}
}
class ScrapingException extends RuntimeException {
public ScrapingException(String message, Throwable cause) {
super(message, cause);
}
}
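Note that Spring Retry only intercepts @Retryable methods on Spring-managed beans with retry support enabled; a minimal configuration sketch:
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.retry.annotation.EnableRetry;

@Configuration
@EnableRetry // activates the proxying that makes @Retryable work
public class RetryConfig {

    @Bean
    public ResilientScraper resilientScraper() {
        // Must be obtained from the Spring context; calling "new ResilientScraper()" directly bypasses retries
        return new ResilientScraper();
    }
}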
Resource Management
Properly manage resources across different libraries:
public class ResourceManagedScraper implements AutoCloseable {
private final CloseableHttpClient httpClient;
private final ExecutorService executorService;
public ResourceManagedScraper() {
this.httpClient = HttpClients.createDefault();
this.executorService = Executors.newFixedThreadPool(5);
}
public List<Document> scrapeMultipleUrls(List<String> urls) throws Exception {
// Submit each URL to the managed thread pool so concurrency stays bounded by this class
List<Future<Optional<Document>>> futures = urls.stream()
.map(url -> executorService.submit(() -> safeConnect(url)))
.collect(Collectors.toList());
List<Document> documents = new ArrayList<>();
for (Future<Optional<Document>> future : futures) {
future.get().ifPresent(documents::add);
}
return documents;
}
private Optional<Document> safeConnect(String url) {
try {
return Optional.of(Jsoup.connect(url).get());
} catch (IOException e) {
return Optional.empty();
}
}
@Override
public void close() throws Exception {
httpClient.close();
executorService.shutdown();
}
}
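Because the scraper implements AutoCloseable, try-with-resources releases the HTTP client and thread pool even when scraping fails. A usage sketch with placeholder URLs:
import org.jsoup.nodes.Document;
import java.util.List;

public class ResourceManagedExample {
    public static void main(String[] args) throws Exception {
        // Both the pooled HTTP client and the executor are closed automatically here
        try (ResourceManagedScraper scraper = new ResourceManagedScraper()) {
            List<Document> docs = scraper.scrapeMultipleUrls(
                    List.of("https://example.com/a", "https://example.com/b"));
            docs.forEach(doc -> System.out.println(doc.title()));
        }
    }
}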
Advanced Integration Patterns
Factory Pattern for Multi-Library Coordination
Create flexible scraping strategies:
public interface ScrapingStrategy {
ScrapedData scrape(String url) throws ScrapingException;
}
public class ScrapingStrategyFactory {
public ScrapingStrategy createStrategy(ScrapingConfig config) {
return switch (config.getType()) {
case SIMPLE -> new SimpleJsoupStrategy();
case HTTP_CLIENT -> new HttpClientJsoupStrategy(config);
case CONCURRENT -> new ConcurrentScrapingStrategy(config);
case DATABASE_BACKED -> new DatabaseBackedStrategy(config);
default -> throw new IllegalArgumentException("Unknown strategy type");
};
}
}
class SimpleJsoupStrategy implements ScrapingStrategy {
@Override
public ScrapedData scrape(String url) throws ScrapingException {
try {
Document doc = Jsoup.connect(url).get();
return new ScrapedData(url, doc.title(), doc.text(), System.currentTimeMillis());
} catch (IOException e) {
throw new ScrapingException("Failed to scrape " + url, e);
}
}
}
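A usage sketch of the factory; ScrapingConfig and its simple() helper are hypothetical and assumed to exist elsewhere in your codebase:
public class StrategyExample {
    public static void main(String[] args) {
        ScrapingStrategyFactory factory = new ScrapingStrategyFactory();
        // ScrapingConfig.simple() is a hypothetical helper producing a SIMPLE-type config
        ScrapingStrategy strategy = factory.createStrategy(ScrapingConfig.simple());
        ScrapedData data = strategy.scrape("https://example.com/");
        System.out.println(data);
    }
}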
Rate Limiting Integration
Combine with rate limiting libraries for responsible scraping:
import io.github.bucket4j.Bucket;
import io.github.bucket4j.Bandwidth;
import java.time.Duration;
public class RateLimitedScraper {
private final Bucket bucket;
public RateLimitedScraper(int requestsPerMinute) {
Bandwidth limit = Bandwidth.simple(requestsPerMinute, Duration.ofMinutes(1)); // simple() pairs a capacity with a refill period
this.bucket = Bucket.builder()
.addLimit(limit)
.build();
}
public Document scrapeWithRateLimit(String url) throws Exception {
bucket.asBlocking().consume(1); // Wait if necessary
return Jsoup.connect(url).get();
}
}
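A usage sketch showing the limiter throttling a simple crawl loop; the rate and URLs are placeholders:
import org.jsoup.nodes.Document;
import java.util.List;

public class RateLimitedExample {
    public static void main(String[] args) throws Exception {
        // At most 30 requests per minute; extra calls block until the bucket refills
        RateLimitedScraper scraper = new RateLimitedScraper(30);
        for (String url : List.of("https://example.com/1", "https://example.com/2")) {
            Document doc = scraper.scrapeWithRateLimit(url);
            System.out.println(doc.title());
        }
    }
}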
By combining jsoup with complementary Java libraries, you can build sophisticated web scraping solutions that handle complex scenarios, scale efficiently, and maintain data integrity. The key is understanding each library's strengths and designing clean integration points that leverage their combined capabilities.
For modern web applications that rely heavily on JavaScript, consider pairing these Java-based approaches with browser automation tools that can render dynamic content, so you cover both static and JavaScript-driven pages.