How do I handle timeouts and network errors in Swift web scraping?
Handling timeouts and network errors is crucial for building robust Swift web scraping applications. Swift's URLSession provides comprehensive tools for managing network requests, but proper error handling and timeout configuration require careful implementation. This guide covers best practices for handling various network scenarios, implementing retry mechanisms, and creating resilient scraping applications.
Understanding URLSession Timeout Configuration
URLSession offers multiple timeout settings that control different aspects of network requests:
import Foundation
/// Central owner of the `URLSession` used for all scraping requests.
/// Configures timeouts, connection limits, and response caching once,
/// so every request made through `session` shares the same policy.
class WebScrapingManager {
    private let session: URLSession

    init() {
        let config = URLSessionConfiguration.default

        // Inactivity timeout: the request fails if no data arrives for this
        // long. Note this is NOT a pure connection-establishment timeout —
        // the timer resets every time data is received.
        config.timeoutIntervalForRequest = 30.0

        // Hard ceiling on the entire transfer, including the full body
        // download and any system-level retries.
        config.timeoutIntervalForResource = 120.0

        // Cap simultaneous connections to any single host (polite scraping).
        config.httpMaximumConnectionsPerHost = 5

        // In-memory + on-disk cache for responses the server marks cacheable.
        config.urlCache = URLCache(
            memoryCapacity: 50 * 1024 * 1024, // 50 MB
            diskCapacity: 200 * 1024 * 1024, // 200 MB
            diskPath: nil
        )

        self.session = URLSession(configuration: config)
    }
}
Comprehensive Error Handling
Implement robust error handling to manage various network failure scenarios:
/// Error domain for the scraping layer.
///
/// Conforms to `LocalizedError` so `localizedDescription` yields the
/// human-readable message returned by `errorDescription`.
enum ScrapingError: Error, LocalizedError {
    case invalidURL
    case noData
    case networkTimeout
    case serverError(Int)
    case decodingError
    case rateLimited
    case connectionFailed

    /// `LocalizedError` requirement: one fixed message per case,
    /// with the HTTP status code interpolated for `serverError`.
    var errorDescription: String? {
        switch self {
        case .invalidURL:            return "Invalid URL provided"
        case .noData:                return "No data received from server"
        case .networkTimeout:        return "Request timed out"
        case .serverError(let code): return "Server error with code: \(code)"
        case .decodingError:         return "Failed to decode response data"
        case .rateLimited:           return "Rate limit exceeded"
        case .connectionFailed:      return "Network connection failed"
        }
    }
}
extension WebScrapingManager {
    /// Performs a GET request against `urlString` and returns the raw body.
    ///
    /// - Throws: `ScrapingError.invalidURL` for a malformed URL string,
    ///   `.rateLimited` on HTTP 429, `.serverError` on any other non-2xx
    ///   status, `.networkTimeout` / `.connectionFailed` for transport-level
    ///   failures surfaced as `URLError`.
    func scrapeURL(_ urlString: String) async throws -> Data {
        guard let url = URL(string: urlString) else {
            throw ScrapingError.invalidURL
        }

        var request = URLRequest(url: url)
        request.httpMethod = "GET"
        // Browser-like User-Agent; some sites reject default client strings.
        request.setValue("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
                         forHTTPHeaderField: "User-Agent")

        do {
            let (body, response) = try await session.data(for: request)

            guard let http = response as? HTTPURLResponse else {
                throw ScrapingError.connectionFailed
            }

            guard (200...299).contains(http.statusCode) else {
                // 429 gets a dedicated case so callers can back off longer.
                if http.statusCode == 429 {
                    throw ScrapingError.rateLimited
                }
                // All other non-success statuses map to serverError(code).
                throw ScrapingError.serverError(http.statusCode)
            }

            return body
        } catch let transportError as URLError {
            // Translate transport-level failures into the scraping domain;
            // ScrapingErrors thrown above do not match this clause and
            // propagate unchanged.
            switch transportError.code {
            case .timedOut:
                throw ScrapingError.networkTimeout
            case .notConnectedToInternet, .networkConnectionLost:
                throw ScrapingError.connectionFailed
            default:
                throw ScrapingError.connectionFailed
            }
        }
    }
}
Implementing Retry Mechanisms
Create intelligent retry logic with exponential backoff for handling temporary failures:
extension WebScrapingManager {
    /// Fetches `urlString`, retrying transient failures with backoff.
    ///
    /// Retries only `.networkTimeout`, `.connectionFailed` (exponential
    /// backoff: 1s, 2s, 4s, ...) and `.rateLimited` (linear: 5s, 10s, ...).
    /// All other errors are rethrown immediately.
    ///
    /// NOTE: the original used pattern-matching catch clauses
    /// (`catch ScrapingError.networkTimeout, ...`) and then referenced
    /// `error` — but Swift only binds the implicit `error` constant in a
    /// pattern-less `catch`, so that code did not compile. This version
    /// binds the error explicitly and switches on it.
    ///
    /// - Parameters:
    ///   - urlString: URL to fetch.
    ///   - maxRetries: Total number of attempts (not additional retries).
    /// - Returns: The response body from the first successful attempt.
    func scrapeWithRetry(_ urlString: String, maxRetries: Int = 3) async throws -> Data {
        var lastError: Error?

        for attempt in 0..<maxRetries {
            do {
                return try await scrapeURL(urlString)
            } catch let error as ScrapingError {
                switch error {
                case .networkTimeout, .connectionFailed:
                    lastError = error
                    // Skip the backoff sleep after the final attempt —
                    // no retry follows it (the original slept pointlessly).
                    if attempt < maxRetries - 1 {
                        // Exponential backoff: 1s, 2s, 4s, ...
                        let delay = pow(2.0, Double(attempt))
                        try await Task.sleep(nanoseconds: UInt64(delay * 1_000_000_000))
                        print("Retry attempt \(attempt + 1) for \(urlString)")
                    }
                case .rateLimited:
                    lastError = error
                    if attempt < maxRetries - 1 {
                        // Longer, linearly growing delay for rate limiting.
                        let delay = Double(attempt + 1) * 5.0
                        try await Task.sleep(nanoseconds: UInt64(delay * 1_000_000_000))
                    }
                case .invalidURL, .noData, .serverError, .decodingError:
                    // Non-transient failures are not worth retrying.
                    throw error
                }
            }
            // Non-ScrapingError errors (e.g. CancellationError) propagate
            // uncaught, matching the original's immediate rethrow.
        }

        throw lastError ?? ScrapingError.connectionFailed
    }
}
Advanced Timeout Handling with Custom Delegates
For more granular control over network behavior, implement custom URLSessionDelegate:
/// Session delegate that logs task-level network failures and follows
/// (while logging) every HTTP redirect.
class CustomSessionDelegate: NSObject, URLSessionDelegate, URLSessionTaskDelegate {

    /// Called when a task finishes; logs the specific failure, if any.
    func urlSession(_ session: URLSession, task: URLSessionTask,
                    didCompleteWithError error: Error?) {
        // Only URLError failures are logged; success (nil) and other error
        // types pass through silently, as in the original.
        guard let urlError = error as? URLError else { return }
        switch urlError.code {
        case .timedOut:
            print("Request timed out: \(task.originalRequest?.url?.absoluteString ?? "")")
        case .networkConnectionLost:
            print("Network connection lost during request")
        default:
            print("Network error: \(urlError.localizedDescription)")
        }
    }

    /// Logs each redirect target and allows the redirect unchanged.
    func urlSession(_ session: URLSession, task: URLSessionTask,
                    willPerformHTTPRedirection response: HTTPURLResponse,
                    newRequest request: URLRequest,
                    completionHandler: @escaping (URLRequest?) -> Void) {
        print("Redirecting to: \(request.url?.absoluteString ?? "")")
        // Passing the proposed request through follows the redirect.
        completionHandler(request)
    }
}
// Usage
// Build a session whose delegate receives per-task error and redirect callbacks.
let delegate = CustomSessionDelegate()
let configuration = URLSessionConfiguration.default
// Fail a request after 30s with no data received (inactivity timeout).
configuration.timeoutIntervalForRequest = 30.0
// delegateQueue: nil lets URLSession create its own serial operation queue
// for delegate callbacks.
let sessionWithDelegate = URLSession(configuration: configuration,
delegate: delegate,
delegateQueue: nil)
Concurrent Scraping with Error Handling
Implement concurrent scraping while managing timeouts and errors effectively:
/// Coordinates concurrent scraping of many URLs via a task group,
/// returning each URL's outcome independently.
actor ScrapingCoordinator {
    private let webScrapingManager = WebScrapingManager()
    // NOTE(review): declared but never read or written in this snippet —
    // presumably intended for request deduplication; confirm before use.
    private var activeRequests: Set<String> = []

    /// Scrapes every URL concurrently and maps each to its `Result`.
    /// Individual failures never abort the batch.
    func scrapeMultipleURLs(_ urls: [String]) async -> [String: Result<Data, Error>] {
        await withTaskGroup(of: (String, Result<Data, Error>).self) { group in
            for url in urls {
                group.addTask { [weak self] in
                    guard let self = self else {
                        return (url, .failure(ScrapingError.connectionFailed))
                    }
                    do {
                        let payload = try await self.webScrapingManager.scrapeWithRetry(url)
                        return (url, .success(payload))
                    } catch {
                        return (url, .failure(error))
                    }
                }
            }

            // Collect child-task outcomes as they complete (arrival order).
            var outcomes: [String: Result<Data, Error>] = [:]
            for await (url, outcome) in group {
                outcomes[url] = outcome
            }
            return outcomes
        }
    }
}
Circuit Breaker Pattern
Implement a circuit breaker to prevent cascading failures:
/// Circuit breaker guarding an async operation against cascading failures.
///
/// Closed: calls pass through. After `failureThreshold` consecutive
/// failures the breaker opens and rejects calls immediately. Once
/// `recoveryTimeout` elapses, one probe call is allowed (half-open);
/// its success closes the breaker, its failure reopens it at once.
actor CircuitBreaker {
    private enum State {
        case closed    // normal operation
        case open      // failing fast: calls rejected without executing
        case halfOpen  // recovery probe in flight
    }

    private var state: State = .closed
    private var failureCount = 0
    private var lastFailureTime: Date?

    /// Consecutive failures that trip the breaker open.
    private let failureThreshold = 5
    /// Seconds the breaker stays open before allowing a probe call.
    private let recoveryTimeout: TimeInterval = 60.0

    /// Runs `operation` through the breaker.
    /// - Throws: `ScrapingError.connectionFailed` when the breaker is open
    ///   and the recovery window has not yet elapsed; otherwise rethrows
    ///   whatever `operation` throws.
    func execute<T>(_ operation: () async throws -> T) async throws -> T {
        switch state {
        case .open:
            guard let lastFailure = lastFailureTime,
                  Date().timeIntervalSince(lastFailure) > recoveryTimeout else {
                throw ScrapingError.connectionFailed
            }
            // Recovery window elapsed: let one probe call through.
            state = .halfOpen
        case .halfOpen, .closed:
            break
        }
        do {
            let result = try await operation()
            onSuccess()
            return result
        } catch {
            onFailure()
            throw error
        }
    }

    // These helpers are actor-isolated and called from within the actor,
    // so the calls are synchronous — the original's `await` here was
    // spurious (it produced a "no async operations" compiler warning).
    private func onSuccess() {
        failureCount = 0
        state = .closed
    }

    private func onFailure() {
        failureCount += 1
        lastFailureTime = Date()
        // A failed half-open probe reopens the breaker immediately —
        // the original waited for the full threshold again, letting
        // traffic through to a still-failing dependency. Otherwise trip
        // once the consecutive-failure threshold is reached.
        if state == .halfOpen || failureCount >= failureThreshold {
            state = .open
        }
    }
}
Real-World Usage Example
Here's a complete example demonstrating proper timeout and error handling:
/// High-level scraping entry point that layers the circuit breaker on top
/// of the retrying manager and reports each failure category separately.
class WebScrapingService {
    private let scrapingManager = WebScrapingManager()
    private let circuitBreaker = CircuitBreaker()

    /// Scrapes `urlString`, printing a category-specific message on failure.
    /// Never throws — every error is handled locally.
    func scrapeWebsite(_ urlString: String) async {
        do {
            let payload = try await circuitBreaker.execute {
                try await scrapingManager.scrapeWithRetry(urlString)
            }
            // Decode the body as UTF-8 text before any HTML parsing;
            // non-UTF-8 payloads are silently ignored, as in the original.
            guard let htmlContent = String(data: payload, encoding: .utf8) else { return }
            print("Successfully scraped \(htmlContent.count) characters")
            // Parse HTML content here
        } catch ScrapingError.networkTimeout {
            print("Request timed out. Consider increasing timeout or checking network connectivity.")
        } catch ScrapingError.rateLimited {
            print("Rate limited. Implementing longer delays between requests.")
        } catch ScrapingError.serverError(let code) {
            print("Server returned error code: \(code). Check if the endpoint is available.")
        } catch {
            print("Scraping failed with error: \(error.localizedDescription)")
        }
    }
}
// Usage
let service = WebScrapingService()
// Top-level `await` requires an async context (e.g. an @main async entry
// point or a Swift script compiled with concurrency enabled).
await service.scrapeWebsite("https://example.com")
Monitoring and Logging
Implement comprehensive logging for network operations:
import os.log
extension WebScrapingManager {
    /// Shared logger for the scraping subsystem (static stored properties
    /// are permitted in extensions, unlike instance stored properties).
    private static let logger = Logger(subsystem: "com.yourapp.webscraping",
                                       category: "networking")

    /// Wraps `scrapeWithRetry` with structured start / finish / failure
    /// logging, including elapsed wall-clock time and payload size.
    /// Rethrows whatever `scrapeWithRetry` throws.
    func scrapeWithLogging(_ urlString: String) async throws -> Data {
        Self.logger.info("Starting request to: \(urlString)")
        let requestStart = Date()

        do {
            let payload = try await scrapeWithRetry(urlString)
            let duration = Date().timeIntervalSince(requestStart)
            Self.logger.info("Request completed in \(duration)s, received \(payload.count) bytes")
            return payload
        } catch {
            let duration = Date().timeIntervalSince(requestStart)
            Self.logger.error("Request failed after \(duration)s: \(error.localizedDescription)")
            throw error
        }
    }
}
Best Practices
- Set appropriate timeouts: Configure both request and resource timeouts based on your specific use case
- Implement exponential backoff: Use progressive delays for retry attempts to avoid overwhelming servers
- Handle specific error types: Different errors require different handling strategies
- Use circuit breakers: Prevent cascading failures in distributed scraping systems
- Monitor network conditions: Implement proper logging and monitoring for network operations
- Respect rate limits: Implement intelligent delays and respect server-side rate limiting
Similar to how timeouts are handled in Puppeteer, Swift web scraping requires careful consideration of various timeout scenarios. For complex scraping workflows, consider implementing patterns similar to error handling in Puppeteer which can be adapted to Swift's async/await model.
By implementing these timeout and error handling strategies, you'll create robust Swift web scraping applications that can gracefully handle network failures, server issues, and various edge cases while maintaining optimal performance and reliability.