How do I Parse HTML Content in Swift Applications?
Parsing HTML content in Swift applications is essential for many iOS and macOS projects, whether you're building a web scraper, content aggregator, or need to extract specific data from web pages. Swift provides several powerful libraries and approaches to handle HTML parsing efficiently and safely.
Popular HTML Parsing Libraries for Swift
1. SwiftSoup - The Recommended Solution
SwiftSoup is the most popular and feature-rich HTML parsing library for Swift, a port of Java's jsoup library. It provides a clean API for extracting and manipulating HTML data.
Installation
Add SwiftSoup to your project using Swift Package Manager:
dependencies: [
.package(url: "https://github.com/scinfu/SwiftSoup.git", from: "2.6.0")
]
Or add it via Xcode: File → Add Package Dependencies → Enter the URL above.
Basic HTML Parsing with SwiftSoup
import SwiftSoup
/// Demonstrates the core SwiftSoup API on a fixed sample document:
/// title lookup, ID/class selectors, and link enumeration.
func parseHTMLWithSwiftSoup() {
    let markup = """
    <html>
    <head><title>Sample Page</title></head>
    <body>
    <div class="content">
    <h1 id="main-title">Welcome to Swift</h1>
    <p class="description">This is a sample paragraph.</p>
    <ul>
    <li><a href="https://swift.org">Swift Website</a></li>
    <li><a href="https://developer.apple.com">Apple Developer</a></li>
    </ul>
    </div>
    </body>
    </html>
    """
    do {
        let document = try SwiftSoup.parse(markup)

        // <title> text from the document head.
        let pageTitle = try document.title()
        print("Title: \(pageTitle)")

        // Single elements looked up by ID and by class selector;
        // an empty string stands in when nothing matches.
        let mainTitle = try document.select("#main-title").first()?.text() ?? ""
        print("Main Title: \(mainTitle)")
        let description = try document.select(".description").first()?.text() ?? ""
        print("Description: \(description)")

        // Every anchor, with its visible text and href attribute.
        for anchor in try document.select("a") {
            let text = try anchor.text()
            let href = try anchor.attr("href")
            print("Link: \(text) -> \(href)")
        }
    } catch {
        print("Error parsing HTML: \(error)")
    }
}
2. Kanna - XML/HTML Parser
Kanna is another excellent choice for HTML parsing, particularly when you need XPath support.
Installation
Add Kanna via Swift Package Manager:
dependencies: [
.package(url: "https://github.com/tid-kijyun/Kanna.git", from: "5.2.7")
]
Parsing with Kanna
import Kanna
/// Demonstrates Kanna's two query styles — CSS selectors and XPath —
/// on a fixed sample document. Does nothing if parsing fails.
func parseHTMLWithKanna() {
    let markup = """
    <html>
    <body>
    <div class="article">
    <h2>Article Title</h2>
    <p>Article content goes here.</p>
    <span class="author">John Doe</span>
    </div>
    </body>
    </html>
    """
    guard let document = try? HTML(html: markup, encoding: .utf8) else { return }

    // CSS Selectors
    for node in document.css("div.article") {
        print("Title: \(node.css("h2").first?.text ?? "")")
        print("Content: \(node.css("p").first?.text ?? "")")
        print("Author: \(node.css(".author").first?.text ?? "")")
    }

    // XPath (Kanna's advantage)
    if let heading = document.at_xpath("//h2") {
        print("Title via XPath: \(heading.text ?? "")")
    }
}
Fetching and Parsing Remote HTML Content
Using URLSession with SwiftSoup
import Foundation
import SwiftSoup
/// Fetches a web page over HTTP and extracts its heading text.
class HTMLParser {
    /// Downloads `urlString`, parses the response body as UTF-8 HTML, and
    /// returns the text of every `<h1>`, `<h2>`, and `<h3>` element.
    ///
    /// - Parameters:
    ///   - urlString: Absolute URL of the page to fetch.
    ///   - completion: Called with the heading texts on success or an error
    ///     on failure. NOTE: invoked on a URLSession background queue, not
    ///     the main thread.
    func fetchAndParseHTML(from urlString: String, completion: @escaping (Result<[String], Error>) -> Void) {
        guard let url = URL(string: urlString) else {
            // Use the file's shared error enum instead of an ad-hoc NSError,
            // so callers can switch over HTMLParsingError consistently.
            completion(.failure(HTMLParsingError.invalidURL))
            return
        }
        let task = URLSession.shared.dataTask(with: url) { data, _, error in
            if let error = error {
                completion(.failure(error))
                return
            }
            guard let data = data,
                  let htmlString = String(data: data, encoding: .utf8) else {
                completion(.failure(HTMLParsingError.noData))
                return
            }
            do {
                let doc = try SwiftSoup.parse(htmlString)
                let titles = try doc.select("h1, h2, h3").map { try $0.text() }
                completion(.success(titles))
            } catch {
                completion(.failure(error))
            }
        }
        task.resume()
    }
}
// Usage: Result.get() collapses the success/failure switch into try/catch.
let parser = HTMLParser()
parser.fetchAndParseHTML(from: "https://example.com") { result in
    do {
        let titles = try result.get()
        print("Found titles: \(titles)")
    } catch {
        print("Error: \(error)")
    }
}
Advanced HTML Parsing Techniques
1. Extracting Data from Tables
/// Prints the cell text of the first `<table>` found in `html`.
///
/// Each row becomes one `[String]` of cell text, taken from both `<td>`
/// and `<th>` elements (so header rows are included). Does nothing when
/// the document contains no table.
///
/// - Parameter html: Raw HTML markup to parse.
func parseTableData(html: String) {
    do {
        let doc = try SwiftSoup.parse(html)
        // guard-let the first table directly rather than binding the
        // optional and unwrapping it in a second step.
        guard let table = try doc.select("table").first() else { return }
        var tableData: [[String]] = []
        for row in try table.select("tr") {
            let cells = try row.select("td, th")
            tableData.append(try cells.map { try $0.text() })
        }
        // Print table data
        for (index, row) in tableData.enumerated() {
            print("Row \(index): \(row)")
        }
    } catch {
        print("Error parsing table: \(error)")
    }
}
2. Handling Forms and Input Elements
/// Logs every `<form>` in `html` — its action and method — followed by
/// the name, type, and value of each of its `<input>` elements.
///
/// - Parameter html: Raw HTML markup to parse.
func extractFormData(html: String) {
    do {
        let document = try SwiftSoup.parse(html)
        for form in try document.select("form") {
            let action = try form.attr("action")
            let method = try form.attr("method")
            print("Form Action: \(action), Method: \(method)")
            for field in try form.select("input") {
                let name = try field.attr("name")
                let type = try field.attr("type")
                let value = try field.attr("value")
                print("Input - Name: \(name), Type: \(type), Value: \(value)")
            }
        }
    } catch {
        print("Error extracting form data: \(error)")
    }
}
3. Custom Data Extraction with CSS Selectors
/// A single article scraped from an HTML page (see `extractArticles(from:)`).
/// String fields default to "" when the source element is absent.
struct Article {
    // Headline text taken from the first <h1>/<h2> in the <article> element.
    let title: String
    // All paragraph text of the article body, joined with single spaces.
    let content: String
    // Raw text of the ".publish-date" element; format depends on the site.
    let publishDate: String
    // Raw text of the ".author" element.
    let author: String
}
/// Builds one `Article` per `<article>` element found in `html`.
///
/// - Parameter html: Raw HTML markup to parse.
/// - Returns: The extracted articles; empty when parsing fails or the
///   document contains no `<article>` elements.
func extractArticles(from html: String) -> [Article] {
    var results: [Article] = []
    do {
        let document = try SwiftSoup.parse(html)
        for node in try document.select("article") {
            // Body text is every <p> joined into a single string.
            let paragraphs = try node.select("p").map { try $0.text() }
            results.append(Article(
                title: try node.select("h1, h2").first()?.text() ?? "",
                content: paragraphs.joined(separator: " "),
                publishDate: try node.select(".publish-date").first()?.text() ?? "",
                author: try node.select(".author").first()?.text() ?? ""
            ))
        }
    } catch {
        print("Error extracting articles: \(error)")
    }
    return results
}
Error Handling and Best Practices
1. Robust Error Handling
/// Errors thrown by the HTML-fetching and parsing helpers in this file.
enum HTMLParsingError: Error {
    // The supplied string could not be turned into a valid URL.
    case invalidURL
    // The request completed but returned no decodable body.
    case noData
    // Parsing failed; the payload carries the underlying error description.
    case parsingFailed(String)
    // A required element was missing; the payload names the selector/tag.
    case elementNotFound(String)
}
/// Returns the text of the document's `<title>` element.
///
/// - Parameter html: Raw HTML markup to parse.
/// - Throws: `HTMLParsingError.elementNotFound("title")` when the document
///   has no `<title>`, or `HTMLParsingError.parsingFailed` wrapping any
///   underlying SwiftSoup error.
func safeHTMLParsing(html: String) throws -> String {
    do {
        let doc = try SwiftSoup.parse(html)
        guard let titleElement = try doc.select("title").first() else {
            throw HTMLParsingError.elementNotFound("title")
        }
        return try titleElement.text()
    } catch let error as HTMLParsingError {
        // Re-throw our own error untouched; the blanket catch below used to
        // swallow .elementNotFound and re-wrap it as .parsingFailed.
        throw error
    } catch {
        throw HTMLParsingError.parsingFailed(error.localizedDescription)
    }
}
2. Performance Optimization
/// Parses HTML and memoizes per-selector lookups with `NSCache`.
class OptimizedHTMLParser {
    // Maps a "selector + document" key to the extracted text.
    private let cache = NSCache<NSString, NSString>()

    /// Returns the text of the first element matching `selector`, caching
    /// the result so repeated queries on the same document skip re-parsing.
    ///
    /// - Parameters:
    ///   - html: Raw HTML markup to parse.
    ///   - selector: CSS selector to evaluate against the document.
    /// - Returns: The matched element's text ("" when nothing matches),
    ///   or `nil` when the HTML cannot be parsed.
    func parseWithCaching(html: String, selector: String) -> String? {
        // Length-prefix the selector so the key is unambiguous. The previous
        // hashValue-based key could collide, silently returning text cached
        // for a *different* document/selector pair.
        let cacheKey = "\(selector.count):\(selector)\(html)" as NSString
        if let cached = cache.object(forKey: cacheKey) {
            return cached as String
        }
        do {
            let doc = try SwiftSoup.parse(html)
            let result = try doc.select(selector).first()?.text() ?? ""
            cache.setObject(result as NSString, forKey: cacheKey)
            return result
        } catch {
            return nil
        }
    }
}
Integration with Web Scraping APIs
For production applications that require large-scale HTML parsing and web scraping, consider integrating with specialized APIs. SwiftSoup and Kanna parse static markup and do not execute JavaScript, so for JavaScript-heavy websites — the kind usually handled with headless-browser tools like Puppeteer — Swift applications can benefit from API-based solutions that render the page server-side and return the final HTML for consistent and reliable data extraction.
/// Minimal client for the WebScraping.AI HTML endpoint.
struct WebScrapingAPIClient {
    private let baseURL = "https://api.webscraping.ai/html"
    private let apiKey: String

    /// - Parameter apiKey: The account's WebScraping.AI API key; sent as a
    ///   query parameter on every request.
    init(apiKey: String) {
        self.apiKey = apiKey
    }

    /// Fetches the HTML of `url` through the scraping API.
    ///
    /// - Parameters:
    ///   - url: Address of the page to scrape.
    ///   - completion: Receives the HTML string, or an error. Invoked on a
    ///     URLSession background queue.
    func scrapeHTML(url: String, completion: @escaping (Result<String, Error>) -> Void) {
        // No force-unwrap: surface a bad base URL as an error instead of
        // crashing (the original used `URLComponents(string: baseURL)!`).
        guard var components = URLComponents(string: baseURL) else {
            completion(.failure(HTMLParsingError.invalidURL))
            return
        }
        components.queryItems = [
            URLQueryItem(name: "url", value: url),
            URLQueryItem(name: "api_key", value: apiKey)
        ]
        guard let requestURL = components.url else {
            completion(.failure(HTMLParsingError.invalidURL))
            return
        }
        URLSession.shared.dataTask(with: requestURL) { data, _, error in
            if let error = error {
                completion(.failure(error))
                return
            }
            guard let data = data,
                  let html = String(data: data, encoding: .utf8) else {
                completion(.failure(HTMLParsingError.noData))
                return
            }
            completion(.success(html))
        }.resume()
    }
}
Testing HTML Parsing Logic
import XCTest
@testable import YourApp
/// Unit tests for the SwiftSoup-based parsing helpers.
class HTMLParsingTests: XCTestCase {
    /// Title and heading extraction from a minimal document.
    /// Marked `throws` so XCTest reports parse failures directly,
    /// replacing the do/catch + XCTFail boilerplate.
    func testBasicHTMLParsing() throws {
        let html = "<html><head><title>Test</title></head><body><h1>Hello</h1></body></html>"
        let doc = try SwiftSoup.parse(html)
        XCTAssertEqual(try doc.title(), "Test")
        // XCTUnwrap fails the test with a clear message when <h1> is absent,
        // instead of comparing against a nil optional.
        let heading = try XCTUnwrap(doc.select("h1").first()).text()
        XCTAssertEqual(heading, "Hello")
    }

    /// Class-based CSS selectors pick the correct element.
    func testCSSSelectorParsing() throws {
        let html = """
        <div class="container">
        <p class="highlight">Important text</p>
        <p>Regular text</p>
        </div>
        """
        let doc = try SwiftSoup.parse(html)
        let highlighted = try XCTUnwrap(doc.select(".highlight").first())
        XCTAssertEqual(try highlighted.text(), "Important text")
    }
}
Conclusion
HTML parsing in Swift applications is straightforward with the right tools and approaches. SwiftSoup provides the most comprehensive solution for most use cases, while Kanna offers excellent XPath support. For production applications requiring robust web scraping capabilities, consider combining local HTML parsing with API-based solutions for optimal performance and reliability.
Remember to always handle errors gracefully, implement proper caching strategies for performance, and test your parsing logic thoroughly. Whether you're building a simple content reader or a complex web scraping application, these techniques will help you efficiently extract and process HTML content in your Swift projects.