HTTP redirects are common in web scraping scenarios and require careful handling to ensure reliable data extraction. Swift's URLSession
automatically follows redirects by default, but manual control provides better insight into the redirect chain and allows for custom handling logic.
Understanding HTTP Redirects
Redirects occur when:
- URLs have permanently moved (301 redirect)
- Content is temporarily moved (302 redirect)
- Resources are accessed via HTTPS instead of HTTP
- Servers implement load balancing or CDN routing
Automatic vs Manual Redirect Handling
Automatic Redirects (Default Behavior)
import Foundation

// URLSession follows redirects automatically: the completion handler
// only ever observes the final response in the redirect chain.
let startURL = URL(string: "http://example.com")! // safe: hard-coded, well-formed literal
let fetchTask = URLSession.shared.dataTask(with: startURL) { _, response, _ in
    guard let http = response as? HTTPURLResponse else { return }
    print("Final URL: \(http.url?.absoluteString ?? "Unknown")")
    print("Status: \(http.statusCode)")
}
fetchTask.resume()
Manual Redirect Handling
For granular control, implement `URLSessionTaskDelegate`:
import Foundation

/// Scrapes web pages while tracking and controlling the HTTP redirect chain.
///
/// NOTE(review): `URLSession` retains its delegate strongly, so this object
/// and `session` keep each other alive until `invalidate()` is called. Call
/// `invalidate()` when you are finished with the scraper.
final class WebScrapingRedirectHandler: NSObject, URLSessionTaskDelegate {

    /// Maximum number of redirects to follow before giving up.
    /// Previously hard-coded to 10; exposed so callers can tune it.
    var maxRedirects: Int = 10

    /// User-Agent sent on the original request and on every redirect hop.
    /// (The original code only set it on redirect hops, so the first request
    /// advertised a different client identity than the rest of the chain.)
    private let userAgent = "Mozilla/5.0 (compatible; Swift Web Scraper)"

    /// Every URL visited during the current scrape, original URL first.
    private var redirectChain: [URL] = []

    lazy var session: URLSession = {
        let configuration = URLSessionConfiguration.default
        configuration.timeoutIntervalForRequest = 30
        return URLSession(configuration: configuration, delegate: self, delegateQueue: nil)
    }()

    /// Breaks the session/delegate retain cycle and cancels in-flight tasks.
    /// The scraper is unusable afterwards.
    func invalidate() {
        session.invalidateAndCancel()
    }

    /// Fetches `url`, following up to `maxRedirects` redirects.
    /// - Parameters:
    ///   - url: The page to fetch.
    ///   - completion: Receives the decoded HTML (nil if the body is not
    ///     valid UTF-8 or an error occurred), the full redirect chain
    ///     (original URL first), and any transport error. Invoked on a
    ///     background queue.
    func scrapeWebsite(from url: URL, completion: @escaping (String?, [URL], Error?) -> Void) {
        redirectChain = [url] // Track the original URL

        // Bug fix: send the User-Agent on the initial request too, not only
        // on redirect hops, so the server sees a consistent client.
        var request = URLRequest(url: url)
        request.setValue(userAgent, forHTTPHeaderField: "User-Agent")

        let task = session.dataTask(with: request) { data, response, error in
            if let error = error {
                completion(nil, self.redirectChain, error)
                return
            }
            guard let data = data else {
                completion(nil, self.redirectChain, NSError(domain: "NoData", code: 0))
                return
            }
            // nil when the payload is not valid UTF-8.
            let html = String(data: data, encoding: .utf8)
            completion(html, self.redirectChain, nil)
        }
        task.resume()
    }

    // MARK: - URLSessionTaskDelegate

    func urlSession(_ session: URLSession,
                    task: URLSessionTask,
                    willPerformHTTPRedirection response: HTTPURLResponse,
                    newRequest request: URLRequest,
                    completionHandler: @escaping (URLRequest?) -> Void) {
        guard let redirectURL = request.url else {
            // Malformed redirect target: deliver the redirect response as-is.
            completionHandler(nil)
            return
        }

        // Add to redirect chain
        redirectChain.append(redirectURL)
        print("Redirect \(redirectChain.count - 1): \(response.statusCode) -> \(redirectURL)")

        // The chain includes the original URL, so redirects followed so far
        // is count - 1. Stop once the configured cap is exceeded.
        if redirectChain.count - 1 > maxRedirects {
            print("Too many redirects, stopping at: \(redirectURL)")
            completionHandler(nil)
            return
        }

        // Inspect redirect type and decide whether to follow.
        switch response.statusCode {
        case 301, 302, 303, 307, 308:
            // Follow the redirect with a potentially modified request.
            var modifiedRequest = request
            modifiedRequest.setValue(userAgent, forHTTPHeaderField: "User-Agent")
            // Preserve cookies accumulated in the shared store across hops.
            if let cookies = HTTPCookieStorage.shared.cookies(for: redirectURL) {
                let cookieHeader = HTTPCookie.requestHeaderFields(with: cookies)
                for (key, value) in cookieHeader {
                    modifiedRequest.setValue(value, forHTTPHeaderField: key)
                }
            }
            completionHandler(modifiedRequest)
        default:
            // Anything else claiming to be a redirect is suspicious; stop.
            print("Unexpected redirect status: \(response.statusCode)")
            completionHandler(nil)
        }
    }
}
Advanced Redirect Handling with Cookie Management
/// Scrapes pages with an isolated cookie store and a per-hop redirect log.
///
/// NOTE(review): `URLSession` retains its delegate strongly, so this object
/// and `session` keep each other alive until `invalidate()` is called.
final class AdvancedWebScraper: NSObject, URLSessionTaskDelegate {
    // NOTE(review): Apple documents obtaining HTTPCookieStorage via `.shared`
    // or `sharedCookieStorage(forGroupContainerIdentifier:)`; direct
    // instantiation is not a documented entry point — confirm it behaves as
    // an isolated store on your target platform.
    private var cookieStorage = HTTPCookieStorage()

    /// One entry per redirect hop: the URL that issued the redirect, the URL
    /// it pointed at, and the status code used.
    private var redirectHistory: [(from: URL, to: URL, statusCode: Int)] = []

    lazy var session: URLSession = {
        let config = URLSessionConfiguration.default
        config.httpCookieStorage = cookieStorage
        config.httpCookieAcceptPolicy = .always
        return URLSession(configuration: config, delegate: self, delegateQueue: nil)
    }()

    /// Breaks the session/delegate retain cycle and cancels in-flight tasks.
    /// The scraper is unusable afterwards.
    func invalidate() {
        session.invalidateAndCancel()
    }

    /// Fetches `url` and returns the body as UTF-8 text.
    /// - Parameters:
    ///   - url: Page to fetch; redirects are followed and logged.
    ///   - completion: `.success` with the HTML, or `.failure` with the
    ///     transport/decoding error. Invoked on a background queue.
    func scrapeWithCookieTracking(url: URL, completion: @escaping (Result<String, Error>) -> Void) {
        let task = session.dataTask(with: url) { data, response, error in
            if let error = error {
                completion(.failure(error))
                return
            }
            guard let data = data,
                  let html = String(data: data, encoding: .utf8) else {
                completion(.failure(NSError(domain: "InvalidData", code: 0)))
                return
            }
            completion(.success(html))
        }
        task.resume()
    }

    // MARK: - URLSessionTaskDelegate

    func urlSession(_ session: URLSession,
                    task: URLSessionTask,
                    willPerformHTTPRedirection response: HTTPURLResponse,
                    newRequest request: URLRequest,
                    completionHandler: @escaping (URLRequest?) -> Void) {
        guard let redirectURL = request.url else {
            completionHandler(request)
            return
        }

        // Bug fix: the original logged `task.originalRequest?.url` as the
        // source of every hop, so multi-hop chains all claimed to start
        // "from" the first URL. The URL that actually answered with this
        // redirect is `response.url`.
        let sourceURL = response.url ?? task.originalRequest?.url ?? redirectURL
        redirectHistory.append((from: sourceURL, to: redirectURL, statusCode: response.statusCode))

        // Handle different redirect scenarios.
        switch response.statusCode {
        case 301: // Permanent redirect (method may change to GET)
            print("Permanent redirect: \(sourceURL) -> \(redirectURL)")
        case 302, 303: // Temporary redirect / See Other
            print("Temporary redirect: \(sourceURL) -> \(redirectURL)")
        case 307, 308: // Method-preserving redirects
            print("Method-preserving redirect: \(sourceURL) -> \(redirectURL)")
        default:
            print("Unknown redirect type: \(response.statusCode)")
        }

        // Persist cookies set by the redirect response so later hops (and
        // later requests through this session) carry them.
        if let headerFields = response.allHeaderFields as? [String: String] {
            let cookies = HTTPCookie.cookies(withResponseHeaderFields: headerFields, for: redirectURL)
            for cookie in cookies {
                cookieStorage.setCookie(cookie)
            }
        }

        completionHandler(request)
    }

    /// Dumps the recorded redirect chain, oldest hop first.
    func printRedirectHistory() {
        print("\n--- Redirect History ---")
        for (index, hop) in redirectHistory.enumerated() {
            print("\(index + 1). [\(hop.statusCode)] \(hop.from) -> \(hop.to)")
        }
    }
}
Practical Usage Examples
Basic Usage with Error Handling
// Demonstrates the redirect-tracking scraper against a known
// multi-redirect endpoint (httpbin bounces three times before responding).
let redirectScraper = WebScrapingRedirectHandler()
let demoURL = URL(string: "https://httpbin.org/redirect/3")!
redirectScraper.scrapeWebsite(from: demoURL) { html, chain, error in
    // Report failure and bail out early; otherwise summarize the scrape.
    if let failure = error {
        print("Scraping failed: \(failure)")
        return
    }
    print("Successfully scraped after \(chain.count - 1) redirects")
    print("Redirect chain: \(chain.map(\.absoluteString))")
    print("Final content length: \(html?.count ?? 0) characters")
}
Handling Redirects with Custom Logic
/// Builds an `AdvancedWebScraper` intended to stop after `maxRedirects` hops.
///
/// NOTE(review): `maxRedirects` is currently unused — `AdvancedWebScraper`
/// exposes no redirect-limit hook, so the returned scraper follows redirects
/// without a cap. Wire this parameter up (or drop it) before relying on it.
/// - Parameter maxRedirects: Intended redirect cap; not yet enforced.
/// - Returns: A freshly constructed scraper with default behavior.
func conditionalRedirectHandler(maxRedirects: Int = 5) -> AdvancedWebScraper {
    let scraper = AdvancedWebScraper()
    // Override redirect handling with custom logic
    return scraper
}
// Exercises the cookie-tracking scraper produced by the factory above.
let trackedScraper = conditionalRedirectHandler(maxRedirects: 3)
let pageURL = URL(string: "https://example.com/some-page")!
trackedScraper.scrapeWithCookieTracking(url: pageURL) { outcome in
    do {
        // Result.get() returns the HTML or rethrows the stored failure.
        let html = try outcome.get()
        print("Scraped successfully: \(html.prefix(100))...")
        trackedScraper.printRedirectHistory()
    } catch {
        print("Scraping failed: \(error)")
    }
}
Best Practices
- Limit Redirects: Prevent infinite redirect loops by setting maximum redirect counts
- Preserve Headers: Maintain important headers like User-Agent and authentication tokens
- Handle Cookies: Properly manage cookies across redirects for session-based sites
- Log Redirect Chains: Track redirect paths for debugging and analysis
- Respect Rate Limits: Add delays between requests when following multiple redirects
- Error Handling: Implement robust error handling for network timeouts and invalid responses
Common Redirect Scenarios in Web Scraping
- HTTP to HTTPS: Many sites automatically redirect to secure versions
- Mobile Redirects: Sites may redirect based on User-Agent headers
- Geographic Redirects: Content delivery networks may redirect based on location
- Login Redirects: Authentication systems often use redirects after login
Remember to always respect website terms of service and implement appropriate rate limiting when performing web scraping operations.