How do I implement retry logic for failed requests in Swift scraping?
Implementing robust retry logic is essential for building reliable Swift web scraping applications. Network requests can fail due to various reasons including temporary server issues, rate limiting, network connectivity problems, or transient errors. A well-designed retry mechanism ensures your scraping operations are resilient and can recover from temporary failures automatically.
Understanding Retry Strategies
Before implementing retry logic, it's important to understand different retry strategies:
1. Simple Retry
The most basic approach that retries a fixed number of times with no delay between attempts.
2. Fixed Delay Retry
Waits for a fixed amount of time between retry attempts.
3. Exponential Backoff
Increases the delay exponentially with each retry attempt, often with jitter to prevent thundering herd problems.
4. Linear Backoff
Increases the delay linearly with each retry attempt.
Basic Retry Implementation
Here's a fundamental retry mechanism using Swift's async/await pattern:
import Foundation
/// Failures produced by the retry machinery itself (as opposed to the
/// errors thrown by the retried operation).
enum RetryError: Error {
/// All configured attempts were exhausted without a success.
case maxRetriesExceeded
/// The wrapped error was classified as not worth retrying.
case nonRetriableError(Error)
}
/// Generic helper that re-runs an async operation, waiting between
/// attempts with exponential backoff plus random jitter.
class RetryHandler {
    /// Number of retries performed after the initial attempt.
    private let maxRetries: Int
    /// Delay before the first retry; doubles on each subsequent one.
    private let baseDelay: TimeInterval

    init(maxRetries: Int = 3, baseDelay: TimeInterval = 1.0) {
        self.maxRetries = maxRetries
        self.baseDelay = baseDelay
    }

    /// Executes `operation`, retrying failures.
    ///
    /// - Parameters:
    ///   - operation: The async work to perform.
    ///   - shouldRetry: Predicate deciding whether a given error is worth
    ///     another attempt. Defaults to retrying every error.
    /// - Returns: The first successful result of `operation`.
    /// - Throws: The most recent error once attempts are exhausted, or
    ///   immediately when `shouldRetry` rejects the error.
    func retry<T>(
        operation: @escaping () async throws -> T,
        shouldRetry: @escaping (Error) -> Bool = { _ in true }
    ) async throws -> T {
        var mostRecentError: Error?
        for attempt in 0...maxRetries {
            do {
                return try await operation()
            } catch {
                mostRecentError = error
                let isFinalAttempt = attempt == maxRetries
                // Give up on the final attempt, or when the caller's
                // predicate says this error is not retriable.
                guard !isFinalAttempt, shouldRetry(error) else {
                    throw error
                }
                let pause = calculateDelay(attempt: attempt)
                try await Task.sleep(nanoseconds: UInt64(pause * 1_000_000_000))
            }
        }
        // Unreachable in practice: the loop either returns or rethrows.
        throw mostRecentError ?? RetryError.maxRetriesExceeded
    }

    /// Exponential backoff: baseDelay * 2^attempt, inflated by up to 10%
    /// jitter so concurrent clients do not retry in lock-step.
    private func calculateDelay(attempt: Int) -> TimeInterval {
        let backoff = baseDelay * pow(2.0, Double(attempt))
        return backoff + backoff * Double.random(in: 0...0.1)
    }
}
Advanced Retry Implementation with URLSession
Here's a more sophisticated implementation specifically designed for web scraping with URLSession:
import Foundation
/// Retry-aware HTTP client tailored for web scraping with URLSession.
/// Distinguishes retriable failures (timeouts, 5xx, 429, ...) from
/// permanent ones, and applies capped exponential backoff between tries.
class WebScrapingRetryHandler {
    /// Tunable knobs controlling retry behavior.
    struct RetryConfiguration {
        let maxRetries: Int
        let baseDelay: TimeInterval
        let maxDelay: TimeInterval
        /// HTTP status codes that justify another attempt.
        let retryableStatusCodes: Set<Int>
        /// Transport-level failures that justify another attempt.
        let retryableErrors: Set<URLError.Code>

        /// Sensible defaults: 3 retries, 1s base delay capped at 30s.
        static let `default` = RetryConfiguration(
            maxRetries: 3,
            baseDelay: 1.0,
            maxDelay: 30.0,
            retryableStatusCodes: [408, 429, 500, 502, 503, 504],
            retryableErrors: [
                .timedOut,
                .cannotConnectToHost,
                .networkConnectionLost,
                .dnsLookupFailed
            ]
        )
    }

    private let configuration: RetryConfiguration
    private let session: URLSession

    init(configuration: RetryConfiguration = .default, session: URLSession = .shared) {
        self.configuration = configuration
        self.session = session
    }

    /// Sends `request`, retrying retriable failures, and returns the body
    /// together with the successful (2xx) response.
    /// - Throws: `HTTPError` for non-retriable status codes, or the last
    ///   error seen once retries are exhausted.
    func performRequest(_ request: URLRequest) async throws -> (Data, HTTPURLResponse) {
        try await retry {
            let (payload, rawResponse) = try await self.session.data(for: request)
            guard let http = rawResponse as? HTTPURLResponse else {
                throw URLError(.badServerResponse)
            }
            if self.configuration.retryableStatusCodes.contains(http.statusCode) {
                // Transient server-side condition: surface as retriable.
                throw HTTPError.retriableStatusCode(http.statusCode)
            }
            guard (200...299).contains(http.statusCode) else {
                // Anything else (e.g. 4xx) will not improve on retry.
                throw HTTPError.nonRetriableStatusCode(http.statusCode)
            }
            return (payload, http)
        }
    }

    /// Core retry loop shared by all requests.
    private func retry<T>(operation: @escaping () async throws -> T) async throws -> T {
        var mostRecent: Error?
        for attempt in 0...configuration.maxRetries {
            do {
                return try await operation()
            } catch {
                mostRecent = error
                let outOfAttempts = attempt == configuration.maxRetries
                if outOfAttempts || !isRetriable(error: error) {
                    throw error
                }
                let pause = calculateDelay(attempt: attempt)
                print("Request failed (attempt \(attempt + 1)/\(configuration.maxRetries + 1)). Retrying in \(pause)s...")
                try await Task.sleep(nanoseconds: UInt64(pause * 1_000_000_000))
            }
        }
        // Unreachable in practice: the loop either returns or rethrows.
        throw mostRecent ?? RetryError.maxRetriesExceeded
    }

    /// Whether `error` merits another attempt under this configuration.
    private func isRetriable(error: Error) -> Bool {
        if let transportError = error as? URLError {
            return configuration.retryableErrors.contains(transportError.code)
        }
        if let httpError = error as? HTTPError {
            if case .retriableStatusCode = httpError { return true }
            return false
        }
        return false
    }

    /// Capped exponential backoff with up to 10% jitter.
    private func calculateDelay(attempt: Int) -> TimeInterval {
        let backoff = configuration.baseDelay * pow(2.0, Double(attempt))
        let jittered = backoff + backoff * Double.random(in: 0...0.1)
        return min(jittered, configuration.maxDelay)
    }
}
/// HTTP-status failures surfaced by `WebScrapingRetryHandler`.
enum HTTPError: Error {
case retriableStatusCode(Int) // transient condition (408, 429, 5xx): retry
case nonRetriableStatusCode(Int) // permanent failure (other non-2xx): do not retry
}
Practical Usage Examples
Basic Web Scraping with Retry Logic
import Foundation
import SwiftSoup
/// High-level scraper that fetches pages through the retrying HTTP client
/// and extracts heading text with SwiftSoup.
class WebScraper {
    private let retryHandler: WebScrapingRetryHandler

    init() {
        self.retryHandler = WebScrapingRetryHandler()
    }

    /// Downloads `url` and returns the text of every h1/h2/h3 element.
    /// - Throws: `ScrapingError.invalidEncoding` when the body is not
    ///   valid UTF-8, or any network/HTTP error from the retry handler.
    func scrapeWebpage(url: URL) async throws -> [String] {
        let (body, _) = try await retryHandler.performRequest(URLRequest(url: url))
        guard let markup = String(data: body, encoding: .utf8) else {
            throw ScrapingError.invalidEncoding
        }
        let headings = try SwiftSoup.parse(markup).select("h1, h2, h3")
        return try headings.map { try $0.text() }
    }

    /// Scrapes every URL concurrently; pages that fail map to an empty list.
    /// - Returns: Dictionary keyed by absolute URL string, with the
    ///   extracted headings as values.
    func scrapeMultiplePages(urls: [URL]) async throws -> [String: [String]] {
        var collected: [String: [String]] = [:]
        await withTaskGroup(of: (String, Result<[String], Error>).self) { group in
            // One child task per URL; each reports its own success/failure
            // so one bad page never sinks the whole batch.
            for target in urls {
                group.addTask {
                    do {
                        let headings = try await self.scrapeWebpage(url: target)
                        return (target.absoluteString, .success(headings))
                    } catch {
                        return (target.absoluteString, .failure(error))
                    }
                }
            }
            for await (key, outcome) in group {
                switch outcome {
                case .success(let headings):
                    collected[key] = headings
                case .failure(let reason):
                    print("Failed to scrape \(key): \(reason)")
                    collected[key] = []
                }
            }
        }
        return collected
    }
}
/// Failures specific to HTML extraction (as opposed to transport errors).
enum ScrapingError: Error {
/// The response body was not valid UTF-8 text.
case invalidEncoding
/// Document parsing failed. (Not thrown by the code shown here —
/// presumably reserved for callers that extend the scraper.)
case parsingFailed
}
Custom Retry Configuration for Different Scenarios
// Configuration for rate-limited APIs: fewer, slower retries with a high
// delay cap, giving the server time to cool down.
let rateLimitConfig = WebScrapingRetryHandler.RetryConfiguration(
maxRetries: 5,
baseDelay: 2.0,
maxDelay: 60.0,
retryableStatusCodes: [429, 503], // Focus on rate limiting and service unavailable
retryableErrors: [.timedOut, .cannotConnectToHost]
)
// Configuration for unreliable networks: many quick retries covering most
// transport-level failures, including total loss of connectivity.
let unreliableNetworkConfig = WebScrapingRetryHandler.RetryConfiguration(
maxRetries: 10,
baseDelay: 0.5,
maxDelay: 30.0,
retryableStatusCodes: [408, 500, 502, 503, 504],
retryableErrors: [
.timedOut,
.cannotConnectToHost,
.networkConnectionLost,
.dnsLookupFailed,
.notConnectedToInternet
]
)
// Handlers built from the custom configurations above.
let rateLimitHandler = WebScrapingRetryHandler(configuration: rateLimitConfig)
let unreliableNetworkHandler = WebScrapingRetryHandler(configuration: unreliableNetworkConfig)
Monitoring and Logging
Implementing proper monitoring and logging is crucial for understanding retry patterns and optimizing your scraping operations:
/// A `WebScrapingRetryHandler` that records per-URL failure counts and
/// logs the wall-clock duration of every request.
///
/// Fixes applied: the success log's string literal was split across a
/// physical newline (invalid Swift — string literals cannot contain raw
/// line breaks) and the emoji in all three log strings were mis-encoded;
/// both defects are repaired below.
class MonitoredRetryHandler: WebScrapingRetryHandler {
    /// URL string -> number of failed `performRequest` calls.
    private var retryMetrics: [String: Int] = [:]

    override func performRequest(_ request: URLRequest) async throws -> (Data, HTTPURLResponse) {
        let startTime = Date()
        let url = request.url?.absoluteString ?? "unknown"
        do {
            let result = try await super.performRequest(request)
            let duration = Date().timeIntervalSince(startTime)
            logSuccess(url: url, duration: duration)
            return result
        } catch {
            let duration = Date().timeIntervalSince(startTime)
            logFailure(url: url, error: error, duration: duration)
            // Re-throw so callers still see the underlying failure.
            throw error
        }
    }

    private func logSuccess(url: String, duration: TimeInterval) {
        print("✅ Successfully scraped \(url) in \(String(format: "%.2f", duration))s")
    }

    private func logFailure(url: String, error: Error, duration: TimeInterval) {
        retryMetrics[url, default: 0] += 1
        print("❌ Failed to scrape \(url) after \(String(format: "%.2f", duration))s: \(error)")
    }

    /// Prints failure counts, most-failing URLs first.
    func printMetrics() {
        print("\n📊 Retry Metrics:")
        for (url, count) in retryMetrics.sorted(by: { $0.value > $1.value }) {
            print("  \(url): \(count) failures")
        }
    }
}
Handling Rate Limiting with Retry-After Headers
Many APIs and websites provide a `Retry-After` header when rate limiting occurs. Here's how to respect these headers:
extension WebScrapingRetryHandler {
    /// Parses the `Retry-After` response header, which per RFC 7231 may be
    /// either a delay in seconds or an HTTP-date.
    /// - Returns: The suggested wait in seconds, or `nil` when the header
    ///   is absent or unparseable.
    private func extractRetryAfter(from response: HTTPURLResponse) -> TimeInterval? {
        guard let retryAfterHeader = response.value(forHTTPHeaderField: "Retry-After") else {
            return nil
        }
        // Numeric form: a plain number of seconds.
        if let seconds = TimeInterval(retryAfterHeader) {
            return seconds
        }
        // HTTP-date form, e.g. "Wed, 21 Oct 2015 07:28:00 GMT".
        // Fixed-format parsing must pin the POSIX locale and GMT zone;
        // without them, devices with non-English locales or 12-hour
        // settings fail to parse the date (Apple QA1480).
        let formatter = DateFormatter()
        formatter.locale = Locale(identifier: "en_US_POSIX")
        formatter.timeZone = TimeZone(secondsFromGMT: 0)
        formatter.dateFormat = "EEE, dd MMM yyyy HH:mm:ss zzz"
        if let date = formatter.date(from: retryAfterHeader) {
            return date.timeIntervalSinceNow
        }
        return nil
    }

    /// Delay that honors a 429 response's `Retry-After` header when
    /// present, falling back to the standard exponential backoff.
    private func calculateDelayWithRetryAfter(attempt: Int, response: HTTPURLResponse?) -> TimeInterval {
        if let response = response,
           response.statusCode == 429,
           let retryAfter = extractRetryAfter(from: response) {
            return max(retryAfter, 1.0) // Never wait less than one second.
        }
        return calculateDelay(attempt: attempt)
    }
}
Best Practices and Considerations
1. Error Classification
Always distinguish between retriable and non-retriable errors. Don't retry on authentication failures (401), forbidden access (403), or not found errors (404).
2. Respect Rate Limits
When encountering 429 (Too Many Requests) responses, implement exponential backoff and consider reading the `Retry-After` header if provided.
3. Timeout Configuration
Set appropriate timeouts for your requests to avoid hanging indefinitely:
// Fail fast instead of hanging: cap each request's wait on the server.
var request = URLRequest(url: url)
request.timeoutInterval = 30.0 // 30 seconds timeout
4. Circuit Breaker Pattern
For production applications, consider implementing a circuit breaker pattern to temporarily stop making requests to failing services.
5. Concurrent Request Management
When scraping multiple pages concurrently, be mindful of the total number of simultaneous requests to avoid overwhelming the target server.
Error Recovery Strategies
Different types of errors require different recovery strategies:
/// How a caller should proceed after a scraping failure.
enum RecoveryStrategy {
case retry // transient failure: attempt the same request again
case skipAndContinue // permanent failure for this URL: move on
case failImmediately // deliberate stop (e.g. cancellation): abort
case useAlternativeEndpoint // service down: try a mirror/backup host
}
// Maps a failure to a recovery strategy so callers can decide whether to
// retry, skip the URL, abort, or switch endpoints.
extension WebScrapingRetryHandler {
    func determineRecoveryStrategy(for error: Error) -> RecoveryStrategy {
        if let transportError = error as? URLError {
            return strategy(forTransportCode: transportError.code)
        }
        if let httpError = error as? HTTPError {
            return strategy(forHTTPError: httpError)
        }
        // Unknown error types are treated as transient.
        return .retry
    }

    /// Strategy for URLSession transport-level failures.
    private func strategy(forTransportCode code: URLError.Code) -> RecoveryStrategy {
        switch code {
        case .timedOut, .cannotConnectToHost, .networkConnectionLost:
            return .retry
        case .cancelled:
            return .failImmediately
        case .badURL, .unsupportedURL:
            return .skipAndContinue
        default:
            return .retry
        }
    }

    /// Strategy for HTTP status-code failures.
    private func strategy(forHTTPError error: HTTPError) -> RecoveryStrategy {
        switch error {
        case .retriableStatusCode(let code):
            // 503 often means the whole host is unavailable; prefer a mirror.
            return code == 503 ? .useAlternativeEndpoint : .retry
        case .nonRetriableStatusCode:
            return .skipAndContinue
        }
    }
}
Testing Retry Logic
Testing retry mechanisms is crucial for ensuring reliability:
import XCTest
/// Unit tests for `RetryHandler`: retry-until-success with backoff, and
/// immediate failure for non-retriable errors.
///
/// Fixes applied: the first test discarded `retry`'s result (warning, and
/// the returned value was never verified); the second test's operation
/// closure never returns, so the generic parameter `T` could not be
/// inferred — an explicit `() async throws -> Void` signature fixes the
/// compile error.
class RetryHandlerTests: XCTestCase {
    /// Operation fails twice then succeeds: the handler should retry
    /// until success and propagate the final value.
    func testExponentialBackoff() async throws {
        let handler = RetryHandler(maxRetries: 3, baseDelay: 0.1)
        var attempts = 0
        do {
            let value = try await handler.retry {
                attempts += 1
                if attempts < 3 {
                    throw URLError(.timedOut)
                }
                return "Success"
            }
            // The successful result must be returned unchanged.
            XCTAssertEqual(value, "Success")
        } catch {
            XCTFail("Should have succeeded after retries")
        }
        XCTAssertEqual(attempts, 3)
    }

    /// A `shouldRetry` predicate rejecting the error must stop the
    /// handler after a single attempt.
    func testNonRetriableError() async {
        let handler = RetryHandler(maxRetries: 3)
        var attempts = 0
        do {
            try await handler.retry(
                operation: { () async throws -> Void in
                    attempts += 1
                    throw URLError(.badURL)
                },
                shouldRetry: { error in
                    guard let urlError = error as? URLError else { return true }
                    return urlError.code != .badURL
                }
            )
            XCTFail("Should have failed immediately")
        } catch {
            XCTAssertEqual(attempts, 1)
        }
    }
}
Conclusion
Implementing robust retry logic is essential for reliable Swift web scraping applications. The examples provided show how to create flexible, configurable retry mechanisms that can handle various failure scenarios. Remember to always respect the target website's terms of service and implement appropriate delays and rate limiting to be a good web citizen.
For more advanced error handling patterns, you might also want to explore how to handle timeouts in Puppeteer for additional insights into timeout management strategies, or learn about handling errors in Puppeteer for comprehensive error handling approaches that can be adapted to Swift implementations.
By following these patterns and best practices, you'll build more resilient web scraping applications that can gracefully handle network failures and temporary service disruptions.