Proper error handling is critical when using Alamofire for web scraping in Swift. Unlike typical API requests, web scraping involves unpredictable target websites that may return various HTTP status codes, have inconsistent response formats, or implement anti-scraping measures. This guide covers comprehensive error handling strategies tailored for web scraping scenarios.
Core Error Handling Strategies
1. Response Validation and Status Code Handling
For web scraping, you need flexible validation since websites may return different status codes than APIs:
import Alamofire

AF.request("https://example.com/page")
    .validate { request, response, data in
        // Custom validation for scraping scenarios
        switch response.statusCode {
        case 200...299:
            return .success(())
        case 429:
            // Rate limiting - handle specially
            return .failure(ScrapingError.rateLimited)
        case 403:
            // Access forbidden - may need a different approach
            return .failure(ScrapingError.accessForbidden)
        case 404:
            // Page not found - log and continue
            return .failure(ScrapingError.pageNotFound)
        default:
            return .failure(ScrapingError.unexpectedStatusCode(response.statusCode))
        }
    }
    .responseString { response in
        switch response.result {
        case .success(let html):
            // Parse HTML content
            self.parseHTML(html)
        case .failure(let error):
            self.handleScrapingError(error)
        }
    }
2. Comprehensive AFError Handling
Handle specific Alamofire errors that commonly occur during web scraping:
func handleScrapingError(_ error: Error) {
    if let afError = error.asAFError {
        switch afError {
        case .sessionTaskFailed(let sessionError):
            // Network connectivity issues
            if let urlError = sessionError as? URLError {
                switch urlError.code {
                case .timedOut:
                    print("Request timed out - retry with a longer timeout")
                    retryWithLongerTimeout()
                case .notConnectedToInternet:
                    print("No internet connection")
                    scheduleRetryWhenOnline()
                case .cannotFindHost:
                    print("Host not found - skip this URL")
                    markURLAsInvalid()
                default:
                    print("Network error: \(urlError.localizedDescription)")
                }
            }
        case .invalidURL(let url):
            print("Invalid URL: \(url)")
            logInvalidURL(url)
        case .responseValidationFailed(let reason):
            switch reason {
            case .unacceptableStatusCode(let code):
                handleStatusCode(code)
            case .unacceptableContentType(let acceptable, let actual):
                // AFError reports both the accepted and the received content types
                print("Unexpected content type: \(actual) (accepted: \(acceptable))")
            default:
                print("Validation failed: \(reason)")
            }
        default:
            print("Alamofire error: \(afError)")
        }
    } else {
        print("Non-Alamofire error: \(error)")
    }
}
3. Custom Error Types for Web Scraping
Define specific error types for common scraping scenarios:
enum ScrapingError: Error, LocalizedError {
    case rateLimited
    case accessForbidden
    case pageNotFound
    case unexpectedStatusCode(Int)
    case parsingFailed
    case robotsBlocked
    case captchaDetected

    var errorDescription: String? {
        switch self {
        case .rateLimited:
            return "Rate limit exceeded - slow down requests"
        case .accessForbidden:
            return "Access forbidden - check user agent or headers"
        case .pageNotFound:
            return "Page not found"
        case .unexpectedStatusCode(let code):
            return "Unexpected status code: \(code)"
        case .parsingFailed:
            return "Failed to parse HTML content"
        case .robotsBlocked:
            return "Blocked by robots.txt"
        case .captchaDetected:
            return "CAPTCHA detected"
        }
    }
}
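The first example calls a parseHTML(_:) helper without showing it. As a minimal, hypothetical sketch, such a helper could map common scraping problems onto these error types; the naive title scan below only stands in for a real HTML parser:

// Hypothetical helper referenced in the first example. The string scanning
// is illustrative only; use a proper HTML parser in production.
func parseHTML(_ html: String) {
    // Surface obvious anti-scraping pages as typed errors
    if html.localizedCaseInsensitiveContains("captcha") {
        handleScrapingError(ScrapingError.captchaDetected)
        return
    }
    // Naive <title> extraction as a stand-in for real parsing
    guard let start = html.range(of: "<title>"),
          let end = html.range(of: "</title>", range: start.upperBound..<html.endIndex) else {
        handleScrapingError(ScrapingError.parsingFailed)
        return
    }
    print("Scraped title: \(html[start.upperBound..<end.lowerBound])")
}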
4. Advanced Retry Logic with Exponential Backoff
Implement intelligent retry logic for web scraping:
class ScrapingRetryPolicy: RequestRetrier {
    private let maxRetries: Int
    private let baseDelay: TimeInterval

    init(maxRetries: Int = 3, baseDelay: TimeInterval = 1.0) {
        self.maxRetries = maxRetries
        self.baseDelay = baseDelay
    }

    func retry(_ request: Request, for session: Session, dueTo error: Error, completion: @escaping (RetryResult) -> Void) {
        guard request.retryCount < maxRetries else {
            completion(.doNotRetry)
            return
        }
        // Check if the error is retryable
        if let afError = error.asAFError {
            switch afError {
            case .sessionTaskFailed(let sessionError):
                if let urlError = sessionError as? URLError {
                    switch urlError.code {
                    case .timedOut, .networkConnectionLost, .notConnectedToInternet:
                        // Retry with exponential backoff
                        let delay = baseDelay * pow(2.0, Double(request.retryCount))
                        completion(.retryWithDelay(delay))
                        return
                    default:
                        break
                    }
                }
            case .responseValidationFailed(let reason):
                if case .unacceptableStatusCode(let code) = reason, code == 429 {
                    // Rate limited - wait longer
                    let delay = baseDelay * pow(3.0, Double(request.retryCount))
                    completion(.retryWithDelay(delay))
                    return
                }
            default:
                break
            }
        }
        completion(.doNotRetry)
    }
}
// Usage - Alamofire 5 sessions take their interceptors at init time,
// so wrap the retrier in an Interceptor when creating the Session
let retryPolicy = ScrapingRetryPolicy(maxRetries: 3, baseDelay: 2.0)
let session = Session(interceptor: Interceptor(retriers: [retryPolicy]))
5. Timeout Configuration for Different Scenarios
Configure appropriate timeouts for web scraping:
func createScrapingSession() -> Session {
    let configuration = URLSessionConfiguration.default
    // Longer timeouts for slow websites
    configuration.timeoutIntervalForRequest = 30.0
    configuration.timeoutIntervalForResource = 60.0
    // Limit concurrent requests to be respectful
    configuration.httpMaximumConnectionsPerHost = 2
    return Session(configuration: configuration)
}
// Usage with different timeout strategies. Keep a long-lived reference to the
// session: Alamofire cancels in-flight requests when a Session is deallocated.
let scrapingSession = createScrapingSession()

func scrapeWithTimeout(url: String, isSlowSite: Bool = false) {
    let timeout: TimeInterval = isSlowSite ? 45.0 : 15.0
    // Apply the per-request timeout via Alamofire's request modifier
    scrapingSession.request(url, requestModifier: { $0.timeoutInterval = timeout })
        .validate()
        .responseString(queue: .global(qos: .utility)) { response in
            // Handle the response on a background queue, then hop to main
            DispatchQueue.main.async {
                self.processScrapingResult(response)
            }
        }
}
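If you also want the retry policy from section 4 applied to every request, the configuration and the retrier can be combined when the Session is created. A short sketch, assuming the ScrapingRetryPolicy defined earlier:

// Sketch: timeouts, connection limits, and retry policy in one Session
let configuration = URLSessionConfiguration.default
configuration.timeoutIntervalForRequest = 30.0
configuration.httpMaximumConnectionsPerHost = 2

let resilientSession = Session(
    configuration: configuration,
    interceptor: Interceptor(retriers: [ScrapingRetryPolicy(maxRetries: 3, baseDelay: 2.0)])
)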
6. Rate Limiting and Request Throttling
Implement rate limiting to avoid being blocked:
class ScrapingManager {
    private let requestQueue = DispatchQueue(label: "scraping.queue", qos: .utility)
    private let semaphore = DispatchSemaphore(value: 1) // Allow one in-flight request at a time
    private var lastRequestTime = Date.distantPast      // No artificial delay before the first request
    private let minimumDelay: TimeInterval = 1.0        // Minimum delay between requests

    func scrapeURL(_ url: String, completion: @escaping (Result<String, Error>) -> Void) {
        requestQueue.async {
            self.semaphore.wait()
            // Ensure the minimum delay between requests
            let timeSinceLastRequest = Date().timeIntervalSince(self.lastRequestTime)
            if timeSinceLastRequest < self.minimumDelay {
                Thread.sleep(forTimeInterval: self.minimumDelay - timeSinceLastRequest)
            }
            self.lastRequestTime = Date()
            AF.request(url)
                .validate()
                .responseString { response in
                    defer { self.semaphore.signal() }
                    switch response.result {
                    case .success(let html):
                        completion(.success(html))
                    case .failure(let error):
                        completion(.failure(error))
                    }
                }
        }
    }
}
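A brief usage sketch (the URLs are placeholders); because of the semaphore and the minimum delay, the requests run one at a time with at least one second between them:

let manager = ScrapingManager()

for url in ["https://example.com/page-1", "https://example.com/page-2"] {
    manager.scrapeURL(url) { result in
        switch result {
        case .success(let html):
            print("Fetched \(html.count) characters from \(url)")
        case .failure(let error):
            print("Failed to scrape \(url): \(error)")
        }
    }
}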
7. Comprehensive Logging and Monitoring
Implement detailed logging for debugging and monitoring:
extension Session {
    // Note: AF is a Session instance, not a type, so the helper lives on Session
    func scrapingRequest(_ url: String) -> DataRequest {
        return request(url)
            .validate()
            .cURLDescription { description in
                print("cURL: \(description)")
            }
            .responseString { response in
                // Log request details
                if let httpResponse = response.response {
                    print("Status: \(httpResponse.statusCode)")
                    print("Headers: \(httpResponse.allHeaderFields)")
                }
                if let error = response.error {
                    print("Error: \(error)")
                    // Log to crash analytics or monitoring service
                    logErrorToMonitoring(error, url: url)
                }
            }
    }
}
func logErrorToMonitoring(_ error: Error, url: String) {
    // Example: Send to monitoring service
    let errorData = [
        "url": url,
        "error": error.localizedDescription,
        "timestamp": ISO8601DateFormatter().string(from: Date())
    ]
    // Send to your monitoring/analytics service
    print("Logged error: \(errorData)")
}
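Because AF is simply the default Session instance, the logging wrapper above can still be invoked through it. A hypothetical call site (placeholder URL):

AF.scrapingRequest("https://example.com/page")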
8. Circuit Breaker Pattern for Problematic Domains
Implement a circuit breaker to handle consistently failing domains:
class DomainCircuitBreaker {
    private var failureCounts: [String: Int] = [:]
    private var blockedDomains: Set<String> = []
    private let failureThreshold = 5
    private let recoveryTime: TimeInterval = 300 // 5 minutes

    func canRequest(domain: String) -> Bool {
        return !blockedDomains.contains(domain)
    }

    func recordFailure(domain: String) {
        failureCounts[domain, default: 0] += 1
        if failureCounts[domain]! >= failureThreshold {
            blockedDomains.insert(domain)
            // Schedule recovery
            DispatchQueue.global().asyncAfter(deadline: .now() + recoveryTime) {
                self.blockedDomains.remove(domain)
                self.failureCounts[domain] = 0
            }
        }
    }

    func recordSuccess(domain: String) {
        failureCounts[domain] = 0
    }
}
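A sketch of how the circuit breaker might wrap individual requests; the host extraction and URL are illustrative, with recordSuccess and recordFailure feeding results back into the breaker:

let circuitBreaker = DomainCircuitBreaker()

func scrapeIfHealthy(_ url: String) {
    guard let host = URL(string: url)?.host, circuitBreaker.canRequest(domain: host) else {
        print("Skipping \(url): domain is temporarily blocked or invalid")
        return
    }
    AF.request(url)
        .validate()
        .responseString { response in
            switch response.result {
            case .success(let html):
                circuitBreaker.recordSuccess(domain: host)
                print("Scraped \(html.count) characters from \(host)")
            case .failure:
                circuitBreaker.recordFailure(domain: host)
            }
        }
}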
Web Scraping Ethics and Legal Compliance
When implementing error handling for web scraping, always consider ethical and legal aspects:
- Respect robots.txt: Check and honor robots.txt restrictions
- Handle 403/429 gracefully: Don't attempt to bypass access controls
- Implement reasonable delays: Avoid overwhelming target servers
- Monitor your impact: Track and limit your scraping frequency
- Fail gracefully: Log issues but don't crash or retry aggressively
func checkRobotsCompliance(for url: String) -> Bool {
    // Implement robots.txt checking logic here (a simplified asynchronous
    // sketch is shown after the next example)
    // Return false if scraping is not allowed
    return true
}
func respectfulScraping(url: String) {
    guard checkRobotsCompliance(for: url) else {
        print("Scraping not allowed by robots.txt")
        return
    }
    // Proceed with scraping using the error handling patterns above
}
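For illustration, a deliberately simplified robots.txt check could fetch the file and refuse to scrape only when it disallows everything for all user agents; a production implementation should parse per-agent groups and individual path rules. A hypothetical asynchronous variant of the stub above:

// Simplified, hypothetical robots.txt check
func fetchRobotsCompliance(for url: String, completion: @escaping (Bool) -> Void) {
    guard let pageURL = URL(string: url),
          let scheme = pageURL.scheme,
          let host = pageURL.host else {
        completion(false)
        return
    }
    AF.request("\(scheme)://\(host)/robots.txt").responseString { response in
        guard let robots = response.value else {
            // robots.txt unreachable - treat as allowed, but consider logging it
            completion(true)
            return
        }
        let lines = robots.split(separator: "\n").map { $0.trimmingCharacters(in: .whitespaces) }
        let blocksEverything = lines.contains("User-agent: *") && lines.contains("Disallow: /")
        completion(!blocksEverything)
    }
}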
This comprehensive error handling approach keeps your Alamofire-based scraper robust, respectful, and maintainable in the face of the unpredictable conditions that web scraping presents.