What are the best practices for error handling in Swift web scraping?
Error handling is crucial for building robust web scraping applications in Swift. Proper error management ensures your scraping operations can gracefully handle network failures, unexpected responses, and various edge cases that commonly occur during web scraping.
Understanding Common Swift Web Scraping Errors
Swift web scraping applications typically encounter several types of errors:
Network Connection Errors
- Connection timeouts
- DNS resolution failures
- SSL/TLS certificate errors
- Network unavailability
HTTP Response Errors
- 4xx client errors (404, 403, 429)
- 5xx server errors (500, 502, 503)
- Malformed responses
- Unexpected content types
Data Processing Errors
- JSON/XML parsing failures
- HTML structure changes
- Character encoding issues
- Memory constraints
Implementing URLSession Error Handling
URLSession is the foundation of Swift web scraping. Here's how to implement comprehensive error handling:
import Foundation
/// Failure modes for the web-scraping pipeline, from URL construction
/// through transport to response parsing.
enum ScrapingError: Error {
    case invalidURL          // the string could not be turned into a URL
    case noData              // a response arrived with an empty body
    case networkError(Error) // transport-level failure (wraps the underlying URLError)
    case httpError(Int)      // non-success HTTP status code (4xx/5xx)
    case parsingError(Error) // HTML decoding/selection failed (wraps the parser's error)
    case timeout             // the request exceeded its configured deadline
    case rateLimited         // HTTP 429 — the server asked us to slow down
    case unknownError        // anything not covered above
}

// MARK: - LocalizedError

extension ScrapingError: LocalizedError {
    /// Human-readable description so `error.localizedDescription` is useful in logs.
    var errorDescription: String? {
        switch self {
        case .invalidURL:
            return "The URL was malformed or empty."
        case .noData:
            return "The server returned no data."
        case .networkError(let underlying):
            return "Network error: \(underlying.localizedDescription)"
        case .httpError(let statusCode):
            return "The server responded with HTTP status \(statusCode)."
        case .parsingError(let underlying):
            return "Failed to parse the response: \(underlying.localizedDescription)"
        case .timeout:
            return "The request timed out."
        case .rateLimited:
            return "The server is rate limiting requests (HTTP 429)."
        case .unknownError:
            return "An unknown scraping error occurred."
        }
    }
}
/// Fetches raw page bytes over HTTP, mapping transport and status failures
/// to `ScrapingError`.
class WebScraper {
    private let session: URLSession

    init() {
        let config = URLSessionConfiguration.default
        config.timeoutIntervalForRequest = 30   // per-request deadline, seconds
        config.timeoutIntervalForResource = 60  // whole-transfer deadline, seconds
        self.session = URLSession(configuration: config)
    }

    /// Downloads the resource at `urlString`.
    ///
    /// - Returns: the response body on a 2xx status (or for non-HTTP responses,
    ///   which carry no status code).
    /// - Throws: `ScrapingError.invalidURL`, `.rateLimited` (429),
    ///   `.httpError` (other 4xx/5xx), `.timeout`, `.networkError`,
    ///   or `.unknownError`.
    func scrapeURL(_ urlString: String) async throws -> Data {
        guard let url = URL(string: urlString) else {
            throw ScrapingError.invalidURL
        }
        do {
            let (data, response) = try await session.data(from: url)
            // Non-HTTP responses (e.g. file: URLs) have no status code to check.
            guard let httpResponse = response as? HTTPURLResponse else {
                return data
            }
            switch httpResponse.statusCode {
            case 200...299:
                return data
            case 429:
                // Rate limiting gets its own case so callers can back off harder.
                throw ScrapingError.rateLimited
            case 400...599:
                throw ScrapingError.httpError(httpResponse.statusCode)
            default:
                // 1xx/3xx should not normally surface (URLSession follows redirects).
                throw ScrapingError.unknownError
            }
        } catch let error as ScrapingError {
            // Bug fix: status-code errors thrown above must pass through untouched.
            // Without this clause the generic catch below re-wrapped them all as
            // .unknownError, discarding the status information.
            throw error
        } catch let error as URLError {
            // Timeouts get a dedicated case; every other transport failure
            // (no connectivity, DNS, TLS, connection lost, ...) wraps the URLError.
            if error.code == .timedOut {
                throw ScrapingError.timeout
            }
            throw ScrapingError.networkError(error)
        } catch {
            throw ScrapingError.unknownError
        }
    }
}
Implementing Retry Logic with Exponential Backoff
Retry mechanisms are essential for handling temporary failures:
/// Wraps `WebScraper` with retry semantics using exponential backoff
/// (delay = baseDelay * 2^attempt; doubled again when rate limited).
class RetryableWebScraper {
    private let scraper = WebScraper()
    private let maxRetries: Int
    private let baseDelay: TimeInterval

    /// - Parameters:
    ///   - maxRetries: additional attempts after the first (total = maxRetries + 1).
    ///   - baseDelay: backoff unit, in seconds, for the first retry.
    init(maxRetries: Int = 3, baseDelay: TimeInterval = 1.0) {
        self.maxRetries = maxRetries
        self.baseDelay = baseDelay
    }

    /// Fetches `url`, retrying on rate limiting, timeouts, and network errors.
    ///
    /// After the final attempt the *actual* last error is rethrown (a network
    /// error is no longer misreported as `.timeout`), and no backoff sleep is
    /// performed once no attempts remain.
    func scrapeWithRetry(_ url: String) async throws -> Data {
        var lastError: Error?
        for attempt in 0...maxRetries {
            do {
                return try await scraper.scrapeURL(url)
            } catch ScrapingError.rateLimited {
                lastError = ScrapingError.rateLimited
                guard attempt < maxRetries else { break }
                // Rate limiting gets a doubled backoff to give the server room.
                let delay = baseDelay * pow(2.0, Double(attempt)) * 2
                try await Task.sleep(nanoseconds: UInt64(delay * 1_000_000_000))
            } catch let error as ScrapingError where Self.isTransient(error) {
                lastError = error
                guard attempt < maxRetries else { break }
                let delay = baseDelay * pow(2.0, Double(attempt))
                try await Task.sleep(nanoseconds: UInt64(delay * 1_000_000_000))
            } catch {
                // Non-transient errors (bad URL, HTTP 4xx, parsing) never retry.
                throw error
            }
        }
        throw lastError ?? ScrapingError.unknownError
    }

    /// Transient failures worth retrying: timeouts and transport-level errors.
    private static func isTransient(_ error: ScrapingError) -> Bool {
        switch error {
        case .timeout, .networkError:
            return true
        default:
            return false
        }
    }
}
Handling HTML Parsing Errors
When parsing HTML content, implement robust error handling:
import SwiftSoup
// MARK: - HTML parsing

extension WebScraper {
    /// Extracts the text of every element matching `selector` from `data`.
    ///
    /// - Throws: `ScrapingError.parsingError` when the bytes are not valid
    ///   UTF-8 or SwiftSoup fails to parse/select.
    func parseHTML(_ data: Data, selector: String) throws -> [String] {
        guard let html = String(data: data, encoding: .utf8) else {
            // NOTE(review): assumes UTF-8 pages; other encodings surface as
            // parsingError — confirm whether charset sniffing is needed.
            throw ScrapingError.parsingError(NSError(
                domain: "EncodingError",
                code: 1,
                userInfo: [NSLocalizedDescriptionKey: "Unable to decode HTML"]
            ))
        }
        do {
            let document = try SwiftSoup.parse(html)
            let elements = try document.select(selector)
            return try elements.array().map { try $0.text() }
        } catch {
            throw ScrapingError.parsingError(error)
        }
    }

    /// Best-effort extraction: each field is attempted independently so that
    /// one failure does not abort the rest. Failures are recorded in
    /// `ScrapingResult.errors` so callers can inspect them (fixes the field
    /// being declared but never populated).
    func extractDataSafely(_ data: Data) -> ScrapingResult {
        var result = ScrapingResult()
        do {
            result.title = try parseHTML(data, selector: "title").first
        } catch {
            print("Warning: Could not extract title - \(error)")
            result.errors.append("title: \(error)")
        }
        do {
            result.links = try parseHTML(data, selector: "a[href]")
        } catch {
            print("Warning: Could not extract links - \(error)")
            result.links = []
            result.errors.append("links: \(error)")
        }
        return result
    }
}
/// Outcome of one best-effort extraction pass over a page.
struct ScrapingResult {
/// Text of the page's first `title` element, when extraction succeeded.
var title: String?
/// Texts of elements matching `a[href]`; empty when link extraction failed.
var links: [String] = []
/// Human-readable notes about extraction failures. NOTE(review): declared but
/// not populated by `extractDataSafely` as written — confirm intended use.
var errors: [String] = []
}
Implementing Circuit Breaker Pattern
For production applications, implement a circuit breaker to prevent cascading failures:
/// Circuit breaker: after `failureThreshold` consecutive failures, rejects
/// calls outright until `recoveryTimeout` seconds have passed, then lets a
/// single probe through (half-open) before fully closing again.
class CircuitBreaker {
    /// Breaker lifecycle: closed (normal), open (rejecting), halfOpen (probing).
    enum State {
        case closed
        case open
        case halfOpen
    }

    private var state: State = .closed
    private var consecutiveFailures = 0
    private var mostRecentFailure: Date?
    private let failureThreshold: Int
    private let recoveryTimeout: TimeInterval

    init(failureThreshold: Int = 5, recoveryTimeout: TimeInterval = 60) {
        self.failureThreshold = failureThreshold
        self.recoveryTimeout = recoveryTimeout
    }

    /// Runs `operation` through the breaker, updating state from the outcome.
    /// - Throws: `ScrapingError.networkError` immediately while the breaker is
    ///   open, or whatever `operation` throws.
    func execute<T>(_ operation: () async throws -> T) async throws -> T {
        if state == .open {
            guard readyToProbe() else {
                throw ScrapingError.networkError(NSError(
                    domain: "CircuitBreakerError",
                    code: 1,
                    userInfo: [NSLocalizedDescriptionKey: "Circuit breaker is open"]
                ))
            }
            // Recovery window elapsed: allow one probe request through.
            state = .halfOpen
        }

        do {
            let value = try await operation()
            recordSuccess()
            return value
        } catch {
            recordFailure()
            throw error
        }
    }

    /// Any success fully closes the breaker and clears the failure streak.
    private func recordSuccess() {
        consecutiveFailures = 0
        state = .closed
    }

    /// Counts a failure; trips the breaker open once the threshold is reached.
    private func recordFailure() {
        consecutiveFailures += 1
        mostRecentFailure = Date()
        if consecutiveFailures >= failureThreshold {
            state = .open
        }
    }

    /// True once `recoveryTimeout` seconds have passed since the last failure.
    private func readyToProbe() -> Bool {
        guard let failedAt = mostRecentFailure else { return false }
        return Date().timeIntervalSince(failedAt) >= recoveryTimeout
    }
}
Comprehensive Error Logging and Monitoring
Implement detailed logging for debugging and monitoring:
import os.log
/// Structured logging for scraping outcomes via the unified logging system.
class ScrapingLogger {
    private let logger = Logger(subsystem: "com.yourapp.webscraper", category: "scraping")

    /// Logs `error` for `url` at a severity matching the failure class
    /// (warnings for expected transient conditions, errors otherwise).
    ///
    /// The switch is exhaustive over `ScrapingError` so newly added cases
    /// become a compile-time reminder instead of silently falling into a
    /// catch-all (a `default:` here logged `invalidURL`/`noData` as
    /// "Unknown error").
    func logError(_ error: ScrapingError, url: String, context: [String: Any] = [:]) {
        let contextString = context.map { "\($0.key): \($0.value)" }.joined(separator: ", ")
        switch error {
        case .invalidURL:
            logger.error("Invalid URL \(url). Context: \(contextString)")
        case .noData:
            logger.error("No data received for \(url). Context: \(contextString)")
        case .networkError(let underlyingError):
            logger.error("Network error for \(url): \(underlyingError.localizedDescription). Context: \(contextString)")
        case .httpError(let statusCode):
            logger.error("HTTP error \(statusCode) for \(url). Context: \(contextString)")
        case .timeout:
            logger.warning("Timeout for \(url). Context: \(contextString)")
        case .rateLimited:
            logger.warning("Rate limited for \(url). Context: \(contextString)")
        case .parsingError(let parseError):
            logger.error("Parsing error for \(url): \(parseError.localizedDescription). Context: \(contextString)")
        case .unknownError:
            logger.error("Unknown error for \(url). Context: \(contextString)")
        }
    }

    /// Logs a successful fetch with timing and payload size.
    func logSuccess(url: String, duration: TimeInterval, dataSize: Int) {
        logger.info("Successfully scraped \(url) in \(duration)s, \(dataSize) bytes")
    }
}
Handling Rate Limiting and Throttling
Implement smart rate limiting to avoid being blocked:
/// Spaces out requests so they never exceed a configured sustained rate.
/// As an actor, concurrent callers are serialized and spaced automatically.
actor RateLimiter {
    private var previousRequest: Date?
    private let minInterval: TimeInterval

    /// - Parameter requestsPerSecond: maximum sustained request rate.
    init(requestsPerSecond: Double) {
        self.minInterval = 1.0 / requestsPerSecond
    }

    /// Suspends until at least `minInterval` seconds have passed since the
    /// previous call, then records "now" as the new reference point.
    func waitIfNeeded() async {
        // The timestamp is updated on every exit path, sleep or no sleep.
        defer { previousRequest = Date() }

        guard let previous = previousRequest else { return }
        let remaining = minInterval - Date().timeIntervalSince(previous)
        guard remaining > 0 else { return }
        try? await Task.sleep(nanoseconds: UInt64(remaining * 1_000_000_000))
    }
}
/// Combines rate limiting, retries, and logging into one scrape entry point.
class ThrottledScraper {
    private let scraper = RetryableWebScraper()
    private let rateLimiter = RateLimiter(requestsPerSecond: 2.0)
    private let logger = ScrapingLogger()

    /// Waits out the rate limiter, scrapes `url` with retries, and logs the
    /// outcome. Failures that are not `ScrapingError` propagate to the caller
    /// without being logged here.
    func scrapeWithThrottling(_ url: String) async throws -> Data {
        await rateLimiter.waitIfNeeded()

        let requestStart = Date()
        func elapsed() -> TimeInterval { Date().timeIntervalSince(requestStart) }

        do {
            let payload = try await scraper.scrapeWithRetry(url)
            logger.logSuccess(url: url, duration: elapsed(), dataSize: payload.count)
            return payload
        } catch let failure as ScrapingError {
            logger.logError(failure, url: url, context: ["duration": elapsed()])
            throw failure
        }
    }
}
Best Practices Summary
- Use Custom Error Types: Define specific error types for different failure scenarios
- Implement Retry Logic: Use exponential backoff for transient failures
- Handle HTTP Status Codes: Check and handle different HTTP response codes appropriately
- Add Comprehensive Logging: Log errors with context for debugging
- Implement Rate Limiting: Respect server resources and avoid being blocked
- Use Circuit Breakers: Prevent cascading failures in production systems
- Graceful Degradation: Continue processing even when some data extraction fails
- Monitor Performance: Track success rates, error frequencies, and response times
Much like error handling in Puppeteer for JavaScript applications, Swift web scraping requires a multi-layered approach to error handling. And just as timeout handling is crucial in Puppeteer, proper timeout management in Swift ensures your scraping operations don't hang indefinitely.
By implementing these error handling practices, your Swift web scraping applications will be more reliable, maintainable, and production-ready. Remember to always test your error handling code paths and monitor your applications in production to identify and address new error scenarios as they arise.