Character encoding is crucial for web scraping to ensure text is extracted correctly without garbled characters. jsoup automatically detects encoding from HTTP headers and HTML meta tags, but sometimes manual intervention is needed for accurate data extraction.
How jsoup Detects Character Encoding
jsoup uses this priority order to determine encoding:
- HTTP Content-Type header (highest priority)
- HTML meta charset tag (`<meta charset="UTF-8">`)
- HTML http-equiv meta tag (`<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">`)
- UTF-8 fallback (if none specified)
Specifying Character Encoding When Parsing
Parsing HTML Strings with Specific Encoding
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * Parses an in-memory HTML string containing non-ASCII characters and prints
 * its title and paragraph text.
 *
 * <p>A Java {@code String} is already decoded text, so no charset is involved
 * when parsing it. Encoding only matters when jsoup has to decode raw bytes
 * (files, streams, HTTP responses).
 */
public class EncodingExample {
    public static void main(String[] args) {
        // HTML with special characters
        String html = "<html><head><title>Café Müller</title></head>"
                + "<body><p>Price: 15€</p></body></html>";
        // The second argument of Jsoup.parse(String, String) is the base URI used
        // to resolve relative links, not a charset name, so "UTF-8" must not be
        // passed there. The single-argument overload is the right call here.
        Document doc = Jsoup.parse(html);
        System.out.println("Title: " + doc.title());
        System.out.println("Content: " + doc.select("p").text());
    }
}
Parsing Files with Encoding
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.File;
import java.io.IOException;
/** Parses a local HTML file with an explicitly named charset and prints its title. */
public class FileEncodingExample {
    public static void main(String[] args) {
        File source = new File("document.html");
        Document parsed;
        try {
            // Arguments: the file, the charset name used to decode it, and the base URI
            // (left empty here).
            parsed = Jsoup.parse(source, "ISO-8859-1", "");
        } catch (IOException ex) {
            ex.printStackTrace();
            return;
        }
        System.out.println(parsed.title());
    }
}
Handling Web Requests with Encoding
Setting Charset for URL Connections
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
/**
 * Fetches a page and forces the response body to be decoded as UTF-8,
 * regardless of the charset jsoup would otherwise pick.
 */
public class WebEncodingExample {
    public static void main(String[] args) {
        try {
            String url = "https://example.com";
            // Connection has no charset(...) method. The override lives on
            // Connection.Response and must be applied after execute() but
            // before parse(), because parse() decodes with the response charset.
            Document doc = Jsoup.connect(url)
                    .userAgent("Mozilla/5.0 (compatible; jsoup)")
                    .execute()
                    .charset("UTF-8")
                    .parse();
            System.out.println("Title: " + doc.title());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
Detecting and Handling Wrong Encoding
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.Connection;
import java.io.IOException;
import java.nio.charset.Charset;
/**
 * Fetches a page once, checks the auto-detected decoding for U+FFFD replacement
 * characters, and if any are found re-decodes the same buffered bytes with a
 * list of common charsets until one produces no replacement characters.
 */
public class EncodingDetection {
    public static void main(String[] args) {
        try {
            String url = "https://example.com";
            // First, get response with auto-detection
            Connection.Response response = Jsoup.connect(url)
                    .execute();
            // Buffer the body so parse() can be called again with another charset
            // without downloading the page a second time.
            response.bufferUp();
            Document doc = response.parse();
            // Only U+FFFD signals a decoding failure. A literal '?' is ordinary
            // punctuation and must not be treated as garbling. The escape is used
            // so the check does not depend on this source file's own encoding.
            if (doc.text().indexOf('\uFFFD') >= 0) {
                System.out.println("Encoding issue detected, trying different charset...");
                // Try common encodings.
                // NOTE: ISO-8859-1 maps every byte to some character, so it never
                // yields U+FFFD; "success" with it does not prove the text is right.
                String[] encodings = {"UTF-8", "ISO-8859-1", "Windows-1252", "UTF-16"};
                for (String encoding : encodings) {
                    try {
                        if (!Charset.isSupported(encoding)) {
                            System.out.println("Failed with encoding: " + encoding);
                            continue;
                        }
                        // Override the response charset, then re-decode the buffered bytes.
                        response.charset(encoding);
                        Document retryDoc = response.parse();
                        if (retryDoc.text().indexOf('\uFFFD') < 0) {
                            System.out.println("Success with encoding: " + encoding);
                            doc = retryDoc;
                            break;
                        }
                    } catch (IOException | IllegalArgumentException e) {
                        // IllegalArgumentException covers illegal charset names.
                        System.out.println("Failed with encoding: " + encoding);
                    }
                }
            }
            System.out.println("Final content: " + doc.title());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
Advanced Encoding Handling
Reading Bytes and Re-parsing
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.Connection;
import java.io.IOException;
import java.io.ByteArrayInputStream;
/**
 * Downloads a response body as raw bytes and parses those bytes with an
 * explicitly chosen charset instead of letting the response pick one.
 */
public class AdvancedEncodingExample {
    public static void main(String[] args) {
        String url = "https://example.com";
        try {
            // Fetch the body as bytes; the content-type check is disabled for this request.
            Connection request = Jsoup.connect(url).ignoreContentType(true);
            byte[] rawBody = request.execute().bodyAsBytes();

            // Decode the bytes as UTF-8, using the page URL as the base URI.
            ByteArrayInputStream rawStream = new ByteArrayInputStream(rawBody);
            Document parsed = Jsoup.parse(rawStream, "UTF-8", url);

            System.out.println("Parsed with custom encoding: " + parsed.title());
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }
}
Setting Output Encoding
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Entities;
public class OutputEncodingExample {
public static void main(String[] args) {
// Create document with special characters
Document doc = Jsoup.parse("<div>Héllo Wörld! 你好</div>");
// Configure output settings
doc.outputSettings()
.charset("UTF-8")
.escapeMode(Entities.EscapeMode.extended)
.prettyPrint(true);
System.out.println("HTML output:");
System.out.println(doc.html());
// Alternative: minimal escaping for readability
doc.outputSettings().escapeMode(Entities.EscapeMode.minimal);
System.out.println("\nMinimal escaping:");
System.out.println(doc.body().html());
}
}
Practical Encoding Solutions
Universal Encoding Handler
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.Connection;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
public class UniversalEncodingHandler {
private static final List<String> COMMON_ENCODINGS = Arrays.asList(
"UTF-8", "ISO-8859-1", "Windows-1252", "UTF-16", "GB2312", "Shift_JIS"
);
public static Document parseWithBestEncoding(String url) throws IOException {
// Try auto-detection first
try {
Document doc = Jsoup.connect(url).get();
if (isValidEncoding(doc)) {
return doc;
}
} catch (Exception e) {
System.out.println("Auto-detection failed: " + e.getMessage());
}
// Try common encodings
for (String encoding : COMMON_ENCODINGS) {
try {
Document doc = Jsoup.connect(url)
.charset(encoding)
.timeout(10000)
.get();
if (isValidEncoding(doc)) {
System.out.println("Successfully parsed with: " + encoding);
return doc;
}
} catch (Exception e) {
System.out.println("Failed with " + encoding + ": " + e.getMessage());
}
}
throw new IOException("Could not parse document with any supported encoding");
}
private static boolean isValidEncoding(Document doc) {
String text = doc.text();
// Check for common signs of encoding issues
return !text.contains("�") &&
!text.contains("Ã") &&
text.length() > 0;
}
public static void main(String[] args) {
try {
Document doc = parseWithBestEncoding("https://example.com");
System.out.println("Title: " + doc.title());
} catch (IOException e) {
e.printStackTrace();
}
}
}
Common Encoding Issues and Solutions
Problem: Garbled Characters
Symptoms: Text shows as `Ã¡`, `Ã©`, `â€™`, or `�` instead of á, é, ’
Solution: The source is UTF-8 but being read as Latin-1
// Force UTF-8 decoding: Connection has no charset(...) method, so the override
// is set on the response after execute() and before parse().
Document doc = Jsoup.connect(url).execute().charset("UTF-8").parse();
Problem: Missing Characters
Symptoms: Some characters disappear completely
Solution: Try a different encoding or enable extended character escaping
doc.outputSettings()
.charset("UTF-8")
.escapeMode(Entities.EscapeMode.extended);
Problem: Server Lies About Encoding
Symptoms: HTTP header says UTF-8 but content is Latin-1
Solution: Override the charset detection
// Ignore the server-declared charset and force a specific encoding.
// The override is set on the response before parse(); Connection itself has no
// charset(...) method. ignoreContentType(true) only relaxes the MIME-type
// check and has no effect on charset handling, so it is not needed here.
Document doc = Jsoup.connect(url)
        .execute()
        .charset("ISO-8859-1")
        .parse();
Best Practices
- Always specify UTF-8 for new applications unless you have specific requirements
- Test with international content to verify encoding handles special characters
- Log encoding detection to help debug issues in production
- Validate extracted text for replacement characters (`�`) as a sign of encoding problems
- Handle encoding gracefully with fallback mechanisms for robust applications
Character encoding issues are common in web scraping, but with proper detection and fallback strategies, jsoup can handle most encoding scenarios reliably.