How do I extract image URLs from HTML using SwiftSoup?
SwiftSoup is a powerful HTML parsing library for Swift that provides an easy way to extract image URLs from HTML documents. This guide covers various techniques for extracting image URLs, handling different image formats, and processing complex HTML structures.
What is SwiftSoup?
SwiftSoup is a Swift port of the popular Java library jsoup, designed for parsing and manipulating HTML documents. It provides a jQuery-like API that makes it simple to select and extract data from HTML elements, including image URLs from `<img>` tags and other sources.
Basic Image URL Extraction
Simple Image URL Extraction
Here's how to extract all image URLs from an HTML document:
import SwiftSoup
/// Parses `html` and returns the `src` attribute value of every `<img>`
/// element, in document order.
///
/// - Parameter html: Raw HTML to parse.
/// - Returns: The collected `src` strings; empty if parsing fails.
func extractImageURLs(from html: String) -> [String] {
    do {
        let document = try SwiftSoup.parse(html)
        let imgElements = try document.select("img")
        // Elements whose src attribute cannot be read are simply skipped.
        return imgElements.compactMap { try? $0.attr("src") }
    } catch {
        print("Error parsing HTML: \(error)")
        return []
    }
}
// Example usage: a small document containing an absolute URL, a relative
// path, and a base64 data URL — extractImageURLs returns all three verbatim
// (no URL resolution happens at this stage).
let htmlContent = """
<html>
<body>
<img src="https://example.com/image1.jpg" alt="Image 1">
<img src="/static/image2.png" alt="Image 2">
<img src="data:image/gif;base64,R0lGODlhAQABAIAAAP..." alt="Base64 Image">
</body>
</html>
"""
let urls = extractImageURLs(from: htmlContent)
print("Found image URLs: \(urls)")
Extracting Images with Attributes
To get more detailed information about each image:
/// Metadata for a single image found in an HTML document (an `<img>` element
/// or a CSS background image, where only `src` is available).
struct ImageInfo {
    /// Image URL. Declared `var` because `ImageScraper.extractAllImages`
    /// rewrites relative URLs to absolute ones on a copied value — with the
    /// original `let` that assignment did not compile.
    var src: String
    let alt: String?
    let title: String?
    let width: String?
    let height: String?
}
/// Builds an `ImageInfo` for every `<img>` element in `html`, normalizing
/// missing or empty attribute strings to `nil`.
///
/// - Parameter html: Raw HTML to parse.
/// - Returns: One entry per `<img>` element; empty if parsing fails.
func extractImageInfo(from html: String) -> [ImageInfo] {
    // Treat an unreadable or empty attribute value as "not present".
    func normalized(_ value: String?) -> String? {
        guard let value = value, !value.isEmpty else { return nil }
        return value
    }
    do {
        let document = try SwiftSoup.parse(html)
        var results: [ImageInfo] = []
        for element in try document.select("img") {
            results.append(ImageInfo(
                src: try element.attr("src"),
                alt: normalized(try? element.attr("alt")),
                title: normalized(try? element.attr("title")),
                width: normalized(try? element.attr("width")),
                height: normalized(try? element.attr("height"))
            ))
        }
        return results
    } catch {
        print("Error parsing HTML: \(error)")
        return []
    }
}
Advanced Image URL Extraction Techniques
Handling Lazy Loading Images
Many modern websites use lazy loading with data attributes:
/// Collects image URLs from standard `src` attributes as well as the
/// `data-*` attributes commonly used by lazy-loading libraries.
///
/// - Parameter html: Raw HTML to parse.
/// - Returns: URLs in attribute order — all `src` values first, then each
///   lazy-load attribute in turn; empty if parsing fails.
func extractLazyLoadedImages(from html: String) -> [String] {
    do {
        let document = try SwiftSoup.parse(html)
        // "src" first preserves the original ordering: standard images,
        // then each lazy-loading data attribute.
        let attributes = ["src", "data-src", "data-lazy-src", "data-original", "data-srcset"]
        var found: [String] = []
        for attribute in attributes {
            let elements = try document.select("img[\(attribute)]")
            found.append(contentsOf: elements.compactMap { try? $0.attr(attribute) })
        }
        return found
    } catch {
        print("Error parsing HTML: \(error)")
        return []
    }
}
Extracting from CSS Background Images
Some images are defined in CSS background-image properties:
/// Extracts URLs from inline `style="background-image: url(...)"` attributes.
///
/// - Parameter html: Raw HTML to parse.
/// - Returns: Every URL captured from a background-image declaration;
///   empty if parsing fails.
func extractBackgroundImages(from html: String) -> [String] {
    do {
        let doc = try SwiftSoup.parse(html)
        var imageURLs: [String] = []
        // Compile the regex once, outside the element loop (the original
        // recompiled it for every element). The pattern is constant, so
        // this `try` cannot actually fail at runtime.
        let pattern = #"background-image:\s*url\(['"]?([^'")]+)['"]?\)"#
        let regex = try NSRegularExpression(pattern: pattern, options: .caseInsensitive)
        let elementsWithStyle = try doc.select("[style*='background-image']")
        for element in elementsWithStyle {
            guard let style = try? element.attr("style") else { continue }
            // BUG FIX: build the NSRange from the UTF-16 view. The original
            // used `style.count` (Character count), which under-spans any
            // style string containing non-ASCII text and can miss matches.
            let fullRange = NSRange(style.startIndex..., in: style)
            for match in regex.matches(in: style, options: [], range: fullRange) {
                guard match.numberOfRanges > 1,
                      let urlRange = Range(match.range(at: 1), in: style) else { continue }
                imageURLs.append(String(style[urlRange]))
            }
        }
        return imageURLs
    } catch {
        print("Error parsing HTML: \(error)")
        return []
    }
}
Handling Responsive Images (srcset)
Modern websites often use the `srcset` attribute for responsive images:
/// Gathers image URLs from both the `src` and `srcset` attributes of every
/// `<img>` element. `srcset` values are split into candidate URLs via
/// `parseSrcset`.
///
/// - Parameter html: Raw HTML to parse.
/// - Returns: All collected URLs, in document order; empty if parsing fails.
func extractResponsiveImages(from html: String) -> [String] {
    do {
        let document = try SwiftSoup.parse(html)
        var collected: [String] = []
        for element in try document.select("img") {
            if let src = try? element.attr("src") {
                collected.append(src)
            }
            if let srcset = try? element.attr("srcset") {
                collected += parseSrcset(srcset)
            }
        }
        return collected
    } catch {
        print("Error parsing HTML: \(error)")
        return []
    }
}
/// Splits a `srcset` attribute value into its candidate URLs, dropping the
/// width/density descriptors (e.g. "640w", "2x").
///
/// - Parameter srcset: Raw attribute value, e.g. `"a.jpg 1x, b.jpg 2x"`.
/// - Returns: The URL portion of each candidate, in order. Empty entries
///   produced by leading/trailing/duplicate commas are skipped — the
///   original implementation returned "" for those.
func parseSrcset(_ srcset: String) -> [String] {
    return srcset
        .components(separatedBy: ",")
        .compactMap { candidate -> String? in
            // The first whitespace-separated token is the URL; anything
            // after it is a size descriptor.
            let trimmed = candidate.trimmingCharacters(in: .whitespacesAndNewlines)
            guard !trimmed.isEmpty else { return nil }
            return trimmed.components(separatedBy: " ").first
        }
}
Filtering and Processing Image URLs
Filter by Image Type
/// Filters the image URLs found in `html` down to those mentioning one of
/// the given file extensions (case-insensitive substring match).
///
/// - Parameters:
///   - html: Raw HTML to parse.
///   - types: File extensions to accept, without the leading dot.
/// - Returns: The matching subset of `extractImageURLs(from:)`.
func extractImagesByType(from html: String, types: [String] = ["jpg", "jpeg", "png", "gif", "webp", "svg"]) -> [String] {
    let extensions = types.map { ".\($0)" }
    return extractImageURLs(from: html).filter { url in
        let lowered = url.lowercased()
        // A match on ".ext" also covers ".ext?query", so a single substring
        // check per extension suffices.
        return extensions.contains { lowered.contains($0) }
    }
}
Convert Relative URLs to Absolute
/// Resolves each URL in `urls` against `baseURL`. Absolute http(s) URLs and
/// data URLs pass through untouched; protocol-relative URLs ("//host/...")
/// get an "https:" scheme; everything else is resolved relative to the base.
///
/// - Returns: Resolved URLs (unresolvable entries are dropped). If `baseURL`
///   itself is invalid, the input array is returned unchanged.
func convertToAbsoluteURLs(_ urls: [String], baseURL: String) -> [String] {
    guard let base = URL(string: baseURL) else { return urls }
    var resolved: [String] = []
    for candidate in urls {
        if candidate.hasPrefix("http") || candidate.hasPrefix("data:") {
            // Already absolute or a data URL — keep as-is.
            resolved.append(candidate)
        } else if candidate.hasPrefix("//") {
            // Protocol-relative URL.
            resolved.append("https:\(candidate)")
        } else if let absolute = URL(string: candidate, relativeTo: base)?.absoluteString {
            // Relative URL.
            resolved.append(absolute)
        }
    }
    return resolved
}
Exclude Unwanted Images
/// Drops URLs that are data URIs, look like tracking/beacon pixels, or are
/// too short to plausibly be a real image URL.
///
/// - Parameter urls: Candidate image URLs.
/// - Returns: The subset that passes all heuristics, in original order.
func filterValidImages(from urls: [String]) -> [String] {
    // Substrings associated with tracking pixels rather than real images.
    let blockedFragments = ["1x1", "tracking", "pixel", "beacon"]
    return urls.filter { url in
        if url.hasPrefix("data:") { return false }
        if blockedFragments.contains(where: { url.contains($0) }) { return false }
        return url.count > 10 // minimum URL length heuristic
    }
}
Complete Example: Image Scraper Class
Here's a complete implementation that combines all the techniques:
import SwiftSoup
import Foundation
/// Scrapes image metadata from an HTML document: `<img>` src/srcset values,
/// common lazy-loading `data-*` attributes, and inline CSS background images.
/// When a `baseURL` is supplied, relative URLs are resolved to absolute ones.
class ImageScraper {
    /// Base used to resolve relative URLs; `nil` leaves them untouched.
    private let baseURL: String?

    /// Precompiled once for the whole class: matches
    /// `background-image: url(...)` and captures the URL in group 1.
    /// The pattern is a constant, so compilation cannot realistically fail.
    private static let cssURLRegex = try? NSRegularExpression(
        pattern: #"background-image:\s*url\(['"]?([^'")]+)['"]?\)"#,
        options: .caseInsensitive
    )

    init(baseURL: String? = nil) {
        self.baseURL = baseURL
    }

    /// Returns every image found in `html`, deduplicated by URL (first
    /// occurrence wins), with relative URLs resolved against `baseURL`
    /// when one was supplied. Returns [] if parsing fails.
    func extractAllImages(from html: String) -> [ImageInfo] {
        do {
            let doc = try SwiftSoup.parse(html)
            var images: [ImageInfo] = []
            // Gather from the three supported sources, in a stable order.
            images.append(contentsOf: extractFromImgTags(doc))
            images.append(contentsOf: extractLazyImages(doc))
            images.append(contentsOf: extractCSSBackgroundImages(doc))
            if let baseURL = baseURL {
                // BUG FIX: `ImageInfo.src` is declared `let`, so the original
                // `var copy = image; copy.src = ...` did not compile. Build a
                // fresh ImageInfo carrying the resolved URL instead.
                images = images.map { image in
                    guard let absoluteURL = convertToAbsoluteURL(image.src, baseURL: baseURL) else {
                        return image
                    }
                    return ImageInfo(
                        src: absoluteURL,
                        alt: image.alt,
                        title: image.title,
                        width: image.width,
                        height: image.height
                    )
                }
            }
            return removeDuplicates(images)
        } catch {
            print("Error parsing HTML: \(error)")
            return []
        }
    }

    /// Collects `<img>` elements with a non-empty `src`, plus one entry per
    /// `srcset` candidate URL (sharing the element's other attributes).
    private func extractFromImgTags(_ doc: Document) -> [ImageInfo] {
        var images: [ImageInfo] = []
        do {
            let imgTags = try doc.select("img")
            for img in imgTags {
                if let src = try? img.attr("src"), !src.isEmpty {
                    images.append(ImageInfo(
                        src: src,
                        alt: try? img.attr("alt"),
                        title: try? img.attr("title"),
                        width: try? img.attr("width"),
                        height: try? img.attr("height")
                    ))
                }
                // An element can contribute both its src and its srcset URLs.
                if let srcset = try? img.attr("srcset"), !srcset.isEmpty {
                    for url in parseSrcset(srcset) {
                        images.append(ImageInfo(
                            src: url,
                            alt: try? img.attr("alt"),
                            title: try? img.attr("title"),
                            width: try? img.attr("width"),
                            height: try? img.attr("height")
                        ))
                    }
                }
            }
        } catch {
            print("Error extracting img tags: \(error)")
        }
        return images
    }

    /// Collects images referenced via common lazy-loading data attributes.
    private func extractLazyImages(_ doc: Document) -> [ImageInfo] {
        var images: [ImageInfo] = []
        let lazyAttributes = ["data-src", "data-lazy-src", "data-original"]
        for attribute in lazyAttributes {
            do {
                let lazyImages = try doc.select("img[\(attribute)]")
                for img in lazyImages {
                    if let src = try? img.attr(attribute), !src.isEmpty {
                        images.append(ImageInfo(
                            src: src,
                            alt: try? img.attr("alt"),
                            title: try? img.attr("title"),
                            width: try? img.attr("width"),
                            height: try? img.attr("height")
                        ))
                    }
                }
            } catch {
                print("Error extracting lazy images: \(error)")
            }
        }
        return images
    }

    /// Collects URLs from inline `style="background-image: url(...)"` values.
    /// Background images carry no alt/title/size metadata.
    private func extractCSSBackgroundImages(_ doc: Document) -> [ImageInfo] {
        var images: [ImageInfo] = []
        do {
            let elementsWithStyle = try doc.select("[style*='background-image']")
            for element in elementsWithStyle {
                if let style = try? element.attr("style") {
                    for url in extractURLsFromCSS(style) {
                        images.append(ImageInfo(src: url, alt: nil, title: nil, width: nil, height: nil))
                    }
                }
            }
        } catch {
            print("Error extracting background images: \(error)")
        }
        return images
    }

    /// Runs the shared background-image regex over `css` and returns every
    /// captured URL.
    private func extractURLsFromCSS(_ css: String) -> [String] {
        guard let regex = ImageScraper.cssURLRegex else { return [] }
        var urls: [String] = []
        // BUG FIX: build the NSRange from the UTF-16 view. The original used
        // `css.count` (Character count), which under-spans strings containing
        // non-ASCII text and can silently miss matches.
        let fullRange = NSRange(css.startIndex..., in: css)
        for match in regex.matches(in: css, options: [], range: fullRange) {
            guard match.numberOfRanges > 1,
                  let swiftRange = Range(match.range(at: 1), in: css) else { continue }
            urls.append(String(css[swiftRange]))
        }
        return urls
    }

    /// Resolves `urlString` against `baseURL`. Absolute http(s)/data URLs pass
    /// through; protocol-relative URLs get an "https:" scheme; everything else
    /// is resolved relative to the base. Returns nil when resolution fails.
    private func convertToAbsoluteURL(_ urlString: String, baseURL: String) -> String? {
        if urlString.hasPrefix("http") || urlString.hasPrefix("data:") {
            return urlString
        } else if urlString.hasPrefix("//") {
            return "https:\(urlString)"
        } else if let base = URL(string: baseURL) {
            return URL(string: urlString, relativeTo: base)?.absoluteString
        }
        return nil
    }

    /// Keeps the first occurrence of each URL, preserving order.
    private func removeDuplicates(_ images: [ImageInfo]) -> [ImageInfo] {
        var seen = Set<String>()
        return images.filter { seen.insert($0.src).inserted }
    }
}
// Usage example: scrape the document defined earlier and resolve relative
// URLs against https://example.com.
let scraper = ImageScraper(baseURL: "https://example.com")
let images = scraper.extractAllImages(from: htmlContent)
for image in images {
// Every result has a URL; alt text is optional.
print("URL: \(image.src)")
if let alt = image.alt {
print("Alt: \(alt)")
}
}
Error Handling and Best Practices
Robust Error Handling
/// Failures surfaced by `safeExtractImages(from:)`.
enum ImageExtractionError: Error {
/// The input HTML string was empty.
case invalidHTML
/// SwiftSoup threw while parsing; carries the error's localized description.
case parsingFailed(String)
/// The document parsed successfully but contained no <img> elements.
case noImagesFound
}
/// Parses `html` and returns image metadata, surfacing failures as typed
/// errors instead of silently returning an empty array.
///
/// - Parameter html: Raw HTML to parse.
/// - Returns: `.failure(.invalidHTML)` for empty input,
///   `.failure(.noImagesFound)` when the document has no `<img>` elements,
///   `.failure(.parsingFailed)` when SwiftSoup throws, otherwise `.success`
///   with one entry per image that has a non-empty `src`.
func safeExtractImages(from html: String) -> Result<[ImageInfo], ImageExtractionError> {
    guard !html.isEmpty else {
        return .failure(.invalidHTML)
    }
    do {
        let document = try SwiftSoup.parse(html)
        let imgElements = try document.select("img")
        guard !imgElements.isEmpty() else {
            return .failure(.noImagesFound)
        }
        var results: [ImageInfo] = []
        for element in imgElements {
            // Skip elements whose src is unreadable or empty, but keep
            // processing the remaining images.
            guard let src = try? element.attr("src"), !src.isEmpty else { continue }
            results.append(ImageInfo(
                src: src,
                alt: try? element.attr("alt"),
                title: try? element.attr("title"),
                width: try? element.attr("width"),
                height: try? element.attr("height")
            ))
        }
        return .success(results)
    } catch {
        return .failure(.parsingFailed(error.localizedDescription))
    }
}
Integration with Web Scraping Workflows
When building comprehensive web scraping solutions, image extraction often works alongside handling dynamic content that loads after page load and managing browser sessions for more complex scraping scenarios.
For iOS applications that need to scrape images from web pages, SwiftSoup provides an excellent foundation that can be combined with networking libraries like URLSession for downloading the actual image data once URLs are extracted.
Conclusion
SwiftSoup provides a powerful and flexible way to extract image URLs from HTML documents in Swift applications. Whether you're building a simple image scraper or a complex web crawling system, the techniques covered in this guide will help you handle various image formats, lazy loading scenarios, and CSS background images effectively.
Remember to always respect website terms of service and implement appropriate rate limiting when scraping images from web sources. Consider caching extracted URLs to avoid repeated processing of the same content, and implement proper error handling to make your image extraction robust and reliable.