Handling timeouts and retries with jsoup is crucial for building robust web scraping applications. This guide covers how to configure timeouts, implement retry mechanisms, and handle connection failures gracefully.
Understanding Timeouts in jsoup
Jsoup exposes a single timeout setting that controls connection behavior:
- timeout(int millis): caps the combined time spent establishing the connection and reading the response (the default is 30 seconds in current releases)
- A value of 0 disables the timeout entirely
- When the limit is exceeded, jsoup throws a java.net.SocketTimeoutException
Basic Timeout Configuration
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.net.SocketTimeoutException;
import java.io.IOException;

public class JsoupTimeoutExample {
    public static void main(String[] args) {
        try {
            Document doc = Jsoup.connect("https://example.com")
                    .timeout(10000) // Total timeout: 10 seconds
                    .userAgent("Mozilla/5.0") // Always set a user agent
                    .get();
            System.out.println("Title: " + doc.title());
        } catch (SocketTimeoutException e) {
            System.err.println("Request timed out: " + e.getMessage());
        } catch (IOException e) {
            System.err.println("Connection failed: " + e.getMessage());
        }
    }
}
Advanced Timeout Settings
Document doc = Jsoup.connect("https://example.com")
        .timeout(0) // No timeout (infinite)
        .maxBodySize(1024 * 1024) // 1MB max response size
        .followRedirects(true) // Follow redirects
        .ignoreHttpErrors(true) // Don't throw on HTTP errors
        .get();
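Note that with ignoreHttpErrors(true), get() will parse whatever the server returns, even a 404 or 500 error page. If you need to branch on the status yourself, here is a minimal sketch using execute(); the class name and URL are placeholders:

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;

public class StatusCheckExample {
    public static void main(String[] args) throws IOException {
        // execute() returns the raw response instead of parsing immediately
        Connection.Response response = Jsoup.connect("https://example.com")
                .timeout(10000)
                .userAgent("Mozilla/5.0")
                .ignoreHttpErrors(true) // keep 4xx/5xx responses from throwing
                .execute();

        if (response.statusCode() == 200) {
            Document doc = response.parse();
            System.out.println("Title: " + doc.title());
        } else {
            System.err.println("Server returned HTTP " + response.statusCode());
        }
    }
}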
Implementing Retry Logic
Simple Retry with a Fixed Delay
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;

public class SimpleRetryExample {
    private static final int MAX_RETRIES = 3;
    private static final int RETRY_DELAY = 2000; // 2 seconds

    public static Document fetchWithRetry(String url) throws IOException {
        IOException lastException = null;
        for (int attempt = 1; attempt <= MAX_RETRIES; attempt++) {
            try {
                return Jsoup.connect(url)
                        .timeout(10000)
                        .userAgent("Mozilla/5.0 (compatible; JavaBot/1.0)")
                        .get();
            } catch (IOException e) {
                lastException = e;
                System.out.printf("Attempt %d failed: %s%n", attempt, e.getMessage());
                if (attempt < MAX_RETRIES) {
                    try {
                        Thread.sleep(RETRY_DELAY);
                    } catch (InterruptedException ie) {
                        Thread.currentThread().interrupt();
                        throw new IOException("Retry interrupted", ie);
                    }
                }
            }
        }
        throw new IOException("Failed after " + MAX_RETRIES + " attempts", lastException);
    }
}
Advanced Retry with Exponential Backoff
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
import java.net.SocketTimeoutException;
import java.net.ConnectException;
import java.util.Random;

public class ExponentialBackoffRetry {
    private static final int MAX_RETRIES = 5;
    private static final long BASE_DELAY = 1000; // 1 second
    private static final Random random = new Random();

    public static Document fetchWithExponentialBackoff(String url) throws IOException {
        IOException lastException = null;
        for (int attempt = 1; attempt <= MAX_RETRIES; attempt++) {
            try {
                return Jsoup.connect(url)
                        .timeout(15000)
                        .userAgent("Mozilla/5.0 (compatible; JavaBot/1.0)")
                        .header("Accept", "text/html,application/xhtml+xml")
                        .get();
            } catch (SocketTimeoutException | ConnectException e) {
                lastException = e;
                System.out.printf("Attempt %d failed (%s): %s%n",
                        attempt, e.getClass().getSimpleName(), e.getMessage());
                if (attempt < MAX_RETRIES) {
                    long delay = calculateBackoffDelay(attempt);
                    System.out.printf("Retrying in %d ms...%n", delay);
                    try {
                        Thread.sleep(delay);
                    } catch (InterruptedException ie) {
                        Thread.currentThread().interrupt();
                        throw new IOException("Retry interrupted", ie);
                    }
                }
            } catch (IOException e) {
                // Don't retry other IO errors (e.g. HttpStatusException for 404/403 responses)
                throw e;
            }
        }
        throw new IOException("Failed after " + MAX_RETRIES + " attempts", lastException);
    }

    private static long calculateBackoffDelay(int attempt) {
        // Exponential backoff with jitter: BASE_DELAY * 2^(attempt - 1) + random(0-999 ms)
        long exponentialDelay = (long) (BASE_DELAY * Math.pow(2, attempt - 1));
        long jitter = random.nextInt(1000);
        return exponentialDelay + jitter;
    }
}
Production-Ready Retry Service
import org.jsoup.Connection;
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
import java.net.SocketTimeoutException;
import java.net.ConnectException;
import java.net.UnknownHostException;
import java.util.Set;
import java.util.concurrent.TimeUnit;

public class RobustJsoupClient {
    private static final int MAX_RETRIES = 3;
    private static final long BASE_DELAY_MS = 1000;
    private static final int TIMEOUT_MS = 15000;

    // HTTP status codes that should trigger retries
    private static final Set<Integer> RETRYABLE_STATUS_CODES = Set.of(
            408, 429, 500, 502, 503, 504
    );

    // Exception types that should trigger retries
    private static final Set<Class<? extends Exception>> RETRYABLE_EXCEPTIONS = Set.of(
            SocketTimeoutException.class,
            ConnectException.class,
            UnknownHostException.class
    );

    public Document fetch(String url) throws IOException {
        return fetch(url, MAX_RETRIES);
    }

    public Document fetch(String url, int maxRetries) throws IOException {
        IOException lastException = null;
        for (int attempt = 1; attempt <= maxRetries; attempt++) {
            try {
                // execute() throws HttpStatusException (an IOException) for non-2xx
                // responses; shouldRetryException() below decides whether to retry it
                Connection.Response response = Jsoup.connect(url)
                        .timeout(TIMEOUT_MS)
                        .userAgent("Mozilla/5.0 (compatible; JavaBot/1.0)")
                        .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
                        .header("Accept-Language", "en-US,en;q=0.5")
                        .header("Accept-Encoding", "gzip, deflate")
                        .execute();
                return response.parse();
            } catch (IOException e) {
                lastException = e;
                boolean shouldRetry = shouldRetryException(e) && attempt < maxRetries;
                System.out.printf("Attempt %d failed: %s%s%n",
                        attempt, e.getMessage(),
                        shouldRetry ? " - retrying..." : " - giving up");
                if (shouldRetry) {
                    waitBeforeRetry(attempt);
                } else {
                    break;
                }
            }
        }
        throw new IOException(
                String.format("Failed to fetch %s after %d attempts", url, maxRetries),
                lastException
        );
    }

    private boolean shouldRetryException(IOException e) {
        if (e instanceof HttpStatusException) {
            // Retry transient HTTP errors (408, 429, 5xx); give up on 403, 404, etc.
            return RETRYABLE_STATUS_CODES.contains(((HttpStatusException) e).getStatusCode());
        }
        return RETRYABLE_EXCEPTIONS.stream()
                .anyMatch(retryableClass -> retryableClass.isInstance(e));
    }

    private void waitBeforeRetry(int attempt) {
        try {
            long delay = BASE_DELAY_MS * (1L << (attempt - 1)); // 2^(attempt-1)
            TimeUnit.MILLISECONDS.sleep(delay);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new RuntimeException("Retry interrupted", e);
        }
    }
}
Usage Examples
import org.jsoup.nodes.Document;
import java.io.IOException;

public class Main {
    public static void main(String[] args) {
        RobustJsoupClient client = new RobustJsoupClient();
        try {
            Document doc = client.fetch("https://example.com");
            System.out.println("Successfully fetched: " + doc.title());

            // Extract data
            doc.select("h1").forEach(h1 ->
                    System.out.println("Heading: " + h1.text())
            );
        } catch (IOException e) {
            System.err.println("Failed to fetch page: " + e.getMessage());
        }
    }
}
Best Practices
- Set appropriate timeouts: 10-30 seconds for most use cases
- Use exponential backoff: Prevents overwhelming servers
- Add jitter: Reduces thundering herd problems
- Limit retry attempts: 3-5 retries maximum
- Set User-Agent: Many sites block requests without proper headers
- Handle specific exceptions: Only retry transient failures
- Respect rate limits: Add delays between requests (see the sketch after this list)
- Monitor retry rates: High retry rates indicate systemic issues
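For the rate-limit point above, here is a minimal sketch of a fixed crawl delay between sequential requests; the one-second delay and the URL list are arbitrary placeholders, so tune them to the site you are scraping:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
import java.util.List;

public class PoliteCrawler {
    private static final long CRAWL_DELAY_MS = 1000; // placeholder; match the site's published limits

    public static void main(String[] args) throws InterruptedException {
        List<String> urls = List.of("https://example.com/a", "https://example.com/b");
        for (String url : urls) {
            try {
                Document doc = Jsoup.connect(url)
                        .timeout(10000)
                        .userAgent("Mozilla/5.0 (compatible; JavaBot/1.0)")
                        .get();
                System.out.println(url + " -> " + doc.title());
            } catch (IOException e) {
                System.err.println("Failed: " + url + " (" + e.getMessage() + ")");
            }
            Thread.sleep(CRAWL_DELAY_MS); // pause between requests to respect rate limits
        }
    }
}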
Common Timeout Values
- Fast APIs: 5-10 seconds
- Regular websites: 15-30 seconds
- Slow/heavy pages: 30-60 seconds
- File downloads: No timeout or very high (300+ seconds); see the sketch below
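As a rough illustration of the last row, here is a sketch of a download-style request with a generous timeout and no body-size cap; the URL and output file name are placeholders:

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;

public class LargeDownloadExample {
    public static void main(String[] args) throws IOException {
        Connection.Response response = Jsoup.connect("https://example.com/large-file.pdf") // placeholder URL
                .timeout(300_000)        // 5 minutes; 0 would disable the timeout entirely
                .maxBodySize(0)          // 0 = no limit on response size
                .ignoreContentType(true) // allow non-HTML responses
                .execute();

        byte[] body = response.bodyAsBytes();
        Files.write(Paths.get("large-file.pdf"), body); // placeholder output file
        System.out.println("Downloaded " + body.length + " bytes");
    }
}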
Remember to adjust timeout and retry settings based on your specific use case and the characteristics of the websites you're scraping.