How do I extract phone numbers or email addresses from HTML using SwiftSoup?
Extracting contact information like phone numbers and email addresses from HTML is a common web scraping task. SwiftSoup, a Swift port of the popular Java library jsoup, provides powerful HTML parsing capabilities that make this process straightforward. This guide will show you how to extract phone numbers and email addresses using both CSS selectors and regular expressions.
Understanding SwiftSoup Basics
SwiftSoup is an HTML parser library for Swift that allows you to navigate, manipulate, and extract data from HTML documents. It provides a jQuery-like API for selecting elements and extracting text content.
First, set up SwiftSoup in your Swift project by adding it to the dependencies in your Package.swift file:
dependencies: [
.package(url: "https://github.com/scinfu/SwiftSoup.git", from: "2.4.3")
]
Method 1: Using CSS Selectors for Structured HTML
When email addresses and phone numbers are contained within specific HTML elements with consistent attributes, CSS selectors are the most efficient approach.
Extracting Email Addresses with CSS Selectors
import SwiftSoup
/// Collects email addresses that are exposed through the HTML structure.
///
/// Two sources are inspected, in this order:
/// 1. `mailto:` links: the address part of each `href`.
/// 2. Elements whose `class` contains "email" or that carry a `data-email`
///    attribute: their text, when it passes `isValidEmail`.
///
/// - Parameter html: The HTML markup to parse.
/// - Returns: The addresses in the order they were found. Duplicates are kept.
/// - Throws: Any error raised by SwiftSoup while parsing or selecting.
func extractEmailsWithSelectors(html: String) throws -> [String] {
    let doc = try SwiftSoup.parse(html)
    var emails: [String] = []

    // Extract from mailto links
    let scheme = "mailto:"
    for link in try doc.select("a[href^=mailto:]") {
        let href = try link.attr("href")
        let target = href.dropFirst(scheme.count)

        // A mailto target may carry header fields after "?" (e.g. "?subject=Hi")
        // and several comma-separated recipients; only the addresses are wanted.
        let addressList = target
            .split(separator: "?", maxSplits: 1, omittingEmptySubsequences: false)
            .first ?? ""

        for piece in addressList.split(separator: ",") {
            let raw = String(piece)
            // Addresses inside a URL can be percent-encoded (e.g. "%40" for "@").
            let decoded = raw.removingPercentEncoding ?? raw
            let address = decoded.trimmingCharacters(in: .whitespacesAndNewlines)
            if !address.isEmpty {
                emails.append(address)
            }
        }
    }

    // Extract from elements with email class or data attributes
    for element in try doc.select("[class*=email], [data-email]") {
        let text = try element.text()
        if isValidEmail(text) {
            emails.append(text)
        }
    }

    return emails
}
/// Reports whether `email` as a whole looks like an email address.
///
/// The check is purely syntactic: a local part, "@", a domain, a dot and a
/// top-level label of at least two ASCII letters.
///
/// - Parameter email: The candidate string.
/// - Returns: `true` only when the entire string matches the pattern.
func isValidEmail(_ email: String) -> Bool {
    let emailRegex = #"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$"#

    // NSRegularExpression is used instead of NSPredicate(format:) so the check
    // also works with Foundation on non-Apple platforms.
    guard let regex = try? NSRegularExpression(pattern: emailRegex, options: []) else {
        return false
    }

    let wholeRange = NSRange(location: 0, length: email.utf16.count)
    guard let match = regex.firstMatch(in: email, options: [], range: wholeRange) else {
        return false
    }

    // "$" may also match just before a trailing line terminator, so require the
    // match to span the complete input.
    return match.range.location == 0 && match.range.length == wholeRange.length
}
Extracting Phone Numbers with CSS Selectors
/// Collects phone numbers that are exposed through the HTML structure.
///
/// Two sources are inspected, in this order:
/// 1. `tel:` links: the number part of each `href`.
/// 2. Elements whose `class` contains "phone" or "tel", or that carry a
///    `data-phone` attribute: the first phone-like run found in their text.
///
/// Every result is passed through `cleanPhoneNumber`, so numbers from both
/// sources share one format and can be de-duplicated by callers.
///
/// - Parameter html: The HTML markup to parse.
/// - Returns: The cleaned numbers in the order they were found. Duplicates are kept.
/// - Throws: Any error raised by SwiftSoup while parsing or selecting.
func extractPhonesWithSelectors(html: String) throws -> [String] {
    let doc = try SwiftSoup.parse(html)
    var phones: [String] = []

    // Extract from tel links
    let scheme = "tel:"
    for link in try doc.select("a[href^=tel:]") {
        let href = try link.attr("href")

        // A tel: target may append parameters after ";" (e.g. ";ext=12"). Cut them
        // off first, otherwise their digits would be merged into the number.
        let number = href.dropFirst(scheme.count).prefix { $0 != ";" }
        let cleaned = cleanPhoneNumber(String(number))
        if !cleaned.isEmpty {
            phones.append(cleaned)
        }
    }

    // Extract from elements with phone-related classes
    for element in try doc.select("[class*=phone], [class*=tel], [data-phone]") {
        let text = try element.text()
        if let found = extractPhoneFromText(text) {
            phones.append(cleanPhoneNumber(found))
        }
    }

    return phones
}
/// Normalises a phone string by deleting every character that is not an ASCII
/// digit, "+", "(", ")" or "-".
///
/// - Parameter phone: The raw phone text.
/// - Returns: `phone` with all other characters removed.
func cleanPhoneNumber(_ phone: String) -> String {
    // Negated character class: anything outside the allowed symbols.
    let disallowedPattern = "[^0-9+()-]"
    let cleaned = phone.replacingOccurrences(
        of: disallowedPattern,
        with: "",
        options: .regularExpression
    )
    return cleaned
}
Method 2: Using Regular Expressions for Unstructured Text
When contact information is embedded within plain text without specific HTML structure, regular expressions become essential.
Comprehensive Email Extraction
/// Finds every email-like substring in the visible text of an HTML document.
///
/// Matches are lower-cased and de-duplicated. The result keeps the order in
/// which each address first appears, so repeated calls return the same array.
///
/// - Parameter html: The HTML markup to parse.
/// - Returns: Unique, lower-cased addresses in order of first appearance.
/// - Throws: Any error raised by SwiftSoup, or by compiling the pattern.
func extractEmailsWithRegex(html: String) throws -> [String] {
    let doc = try SwiftSoup.parse(html)
    let textContent = try doc.text()

    let emailRegex = #"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"#
    let regex = try NSRegularExpression(pattern: emailRegex, options: [])
    let searchRange = NSRange(location: 0, length: textContent.utf16.count)

    var seen: Set<String> = []
    var emails: [String] = []
    for match in regex.matches(in: textContent, options: [], range: searchRange) {
        guard let range = Range(match.range, in: textContent) else { continue }
        let email = textContent[range].lowercased()
        // `insert` reports whether the value was new; this removes duplicates
        // without the arbitrary ordering of Array(Set(...)).
        if seen.insert(email).inserted {
            emails.append(email)
        }
    }

    return emails
}
Advanced Phone Number Extraction
/// Finds phone-like substrings in the visible text of an HTML document.
///
/// Several patterns are tried in turn. Every match is normalised with
/// `cleanPhoneNumber` and kept only when it contains at least ten digits.
/// The result is de-duplicated and keeps the order of first appearance.
///
/// - Parameter html: The HTML markup to parse.
/// - Returns: Unique cleaned phone numbers.
/// - Throws: Any error raised by SwiftSoup, or by compiling a pattern.
func extractPhonesWithRegex(html: String) throws -> [String] {
    let doc = try SwiftSoup.parse(html)
    let textContent = try doc.text()

    // Multiple phone number patterns
    let phonePatterns = [
        #"\+?1?[-.\s]?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}"#, // US format
        #"\+?[0-9]{1,4}[-.\s]?\(?[0-9]{1,4}\)?[-.\s]?[0-9]{1,4}[-.\s]?[0-9]{1,9}"#, // International
        #"\([0-9]{3}\)\s?[0-9]{3}-[0-9]{4}"#, // (555) 123-4567
        #"[0-9]{3}-[0-9]{3}-[0-9]{4}"#, // 555-123-4567
        #"[0-9]{3}\.[0-9]{3}\.[0-9]{4}"# // 555.123.4567
    ]

    // The text does not change between patterns, so the range is computed once.
    let searchRange = NSRange(location: 0, length: textContent.utf16.count)

    var seen: Set<String> = []
    var phones: [String] = []

    for pattern in phonePatterns {
        let regex = try NSRegularExpression(pattern: pattern, options: [])
        for match in regex.matches(in: textContent, options: [], range: searchRange) {
            guard let range = Range(match.range, in: textContent) else { continue }
            let cleanedPhone = cleanPhoneNumber(String(textContent[range]))

            // Count digits, not characters: punctuation kept by cleanPhoneNumber
            // ("+", "(", ")", "-") must not help a short number reach the minimum.
            let digitCount = cleanedPhone.filter { $0.isASCII && $0.isNumber }.count
            guard digitCount >= 10 else { continue }

            if seen.insert(cleanedPhone).inserted {
                phones.append(cleanedPhone)
            }
        }
    }

    return phones
}
/// Returns the first phone-like substring of `text`, or `nil` when there is none.
///
/// The pattern accepts an optional "+", an optional country digit, an optional
/// separator and then a 3-3-4 digit grouping with optional parentheses and
/// separators.
///
/// - Parameter text: Free-form text that may contain a phone number.
/// - Returns: The matched number without surrounding separators, or `nil`.
func extractPhoneFromText(_ text: String) -> String? {
    let phoneRegex = #"[\+]?[1-9]?[\-\.\s]?\(?[0-9]{3}\)?[\-\.\s]?[0-9]{3}[\-\.\s]?[0-9]{4}"#
    guard let regex = try? NSRegularExpression(pattern: phoneRegex, options: []) else {
        return nil
    }

    let searchRange = NSRange(location: 0, length: text.utf16.count)
    guard let match = regex.firstMatch(in: text, options: [], range: searchRange),
          let range = Range(match.range, in: text) else {
        return nil
    }

    // The optional separator near the start of the pattern can capture the
    // space, "-" or "." that merely precedes the number (e.g. the blank in
    // "Call (555) ..."). Strip it so the result starts at the number itself.
    let separators = CharacterSet(charactersIn: "-.").union(.whitespacesAndNewlines)
    return String(text[range]).trimmingCharacters(in: separators)
}
Method 3: Hybrid Approach for Maximum Coverage
For the most comprehensive extraction, combine both methods:
/// Combines the selector-based and regex-based extractors into one pass.
class ContactExtractor {
    /// Runs every extraction strategy over `html` and merges the results.
    ///
    /// - Parameter html: The HTML markup to analyse.
    /// - Returns: The unique emails and unique phones found by any strategy,
    ///   in no particular order.
    /// - Throws: Whatever the underlying extraction functions throw.
    func extractAllContacts(from html: String) throws -> (emails: [String], phones: [String]) {
        var uniqueEmails = Set<String>()
        var uniquePhones = Set<String>()

        // Strategy 1: structure-aware lookup through CSS selectors.
        let emailsFromSelectors = try extractEmailsWithSelectors(html: html)
        let phonesFromSelectors = try extractPhonesWithSelectors(html: html)
        uniqueEmails.formUnion(emailsFromSelectors)
        uniquePhones.formUnion(phonesFromSelectors)

        // Strategy 2: pattern matching over the document text.
        let emailsFromRegex = try extractEmailsWithRegex(html: html)
        let phonesFromRegex = try extractPhonesWithRegex(html: html)
        uniqueEmails.formUnion(emailsFromRegex)
        uniquePhones.formUnion(phonesFromRegex)

        let emails = Array(uniqueEmails)
        let phones = Array(uniquePhones)
        return (emails, phones)
    }
}
Handling Special Cases and Edge Scenarios
Dealing with Obfuscated Contact Information
Some websites obfuscate contact information to prevent automated extraction:
/// Rewrites elements that hide contact details in `data-email` / `data-phone`
/// attributes so that the details appear as the element's text.
///
/// - `data-email`: a value that decodes as Base64 to UTF-8 text is replaced by
///   the decoded text. Otherwise, a value using the "[AT]" placeholder has
///   "[AT]" turned into "@" and "[DOT]" into ".". Any other value leaves the
///   element untouched.
/// - `data-phone`: the attribute value is used as is.
///
/// When an element carries both attributes, both values are written, separated
/// by a space, so that neither one overwrites the other.
///
/// - Parameter html: The HTML markup to process.
/// - Returns: The document's outer HTML after the rewrite.
/// - Throws: Any error raised by SwiftSoup.
func deobfuscateContacts(html: String) throws -> String {
    let doc = try SwiftSoup.parse(html)

    // Handle common obfuscation patterns
    for element in try doc.select("[data-email], [data-phone]") {
        // The emptiness checks below treat an empty value as "attribute not present".
        let dataEmail = try element.attr("data-email")
        let dataPhone = try element.attr("data-phone")
        var revealed: [String] = []

        if !dataEmail.isEmpty {
            if let decodedData = Data(base64Encoded: dataEmail),
               let decodedEmail = String(data: decodedData, encoding: .utf8) {
                // Base64-encoded address
                revealed.append(decodedEmail)
            } else if dataEmail.contains("[AT]") {
                // Placeholder-style obfuscation, e.g. "john[AT]example[DOT]com"
                let decodedEmail = dataEmail
                    .replacingOccurrences(of: "[AT]", with: "@")
                    .replacingOccurrences(of: "[DOT]", with: ".")
                revealed.append(decodedEmail)
            }
        }

        if !dataPhone.isEmpty {
            revealed.append(dataPhone)
        }

        // Only touch the element when something was actually decoded.
        if !revealed.isEmpty {
            try element.text(revealed.joined(separator: " "))
        }
    }

    return try doc.outerHtml()
}
Validating Extracted Information
Always validate extracted contact information:
/// Filters extracted contacts down to the ones that look plausible.
///
/// - Parameters:
///   - emails: Candidate addresses; kept when the whole string matches the
///     email pattern.
///   - phones: Candidate numbers; kept when they contain between 10 and 15
///     decimal digits (inclusive), ignoring every other character.
/// - Returns: The surviving emails and phones, each in input order.
func validateAndFilterContacts(emails: [String], phones: [String]) -> (validEmails: [String], validPhones: [String]) {
    // Compiled once for all candidates. NSRegularExpression is used instead of
    // NSPredicate(format:) so the check also works with Foundation on
    // non-Apple platforms.
    let emailPattern = #"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$"#
    let emailRegex = try? NSRegularExpression(pattern: emailPattern, options: [])

    let validEmails = emails.filter { email -> Bool in
        guard let regex = emailRegex else { return false }
        let wholeRange = NSRange(location: 0, length: email.utf16.count)
        guard let match = regex.firstMatch(in: email, options: [], range: wholeRange) else {
            return false
        }
        // "$" may match before a trailing line terminator; insist on a full match.
        return match.range.location == 0 && match.range.length == wholeRange.length
    }

    let nonDigits = CharacterSet.decimalDigits.inverted
    let validPhones = phones.filter { phone -> Bool in
        let digitsOnly = phone.components(separatedBy: nonDigits).joined()
        return digitsOnly.count >= 10 && digitsOnly.count <= 15
    }

    return (validEmails, validPhones)
}
Performance Optimization Tips
When processing large HTML documents or multiple pages, consider these optimization strategies:
1. Limit Search Scope
/// Extracts contacts only from the parts of a page that usually hold them.
///
/// The document is narrowed to `footer`, `header` and elements with the
/// classes `contact`, `about` or `staff`; each of those is then run through
/// the regex-based extractors.
///
/// - Parameter html: The HTML markup to analyse.
/// - Returns: Unique emails and phones found in those sections, in no
///   particular order.
/// - Throws: Any error raised by SwiftSoup or the regex extractors.
func optimizedExtraction(html: String) throws -> (emails: [String], phones: [String]) {
    let document = try SwiftSoup.parse(html)

    // Focus on likely containers
    let candidates = try document.select("footer, .contact, .about, .staff, header")

    var collectedEmails: [String] = []
    var collectedPhones: [String] = []

    for candidate in candidates {
        // The regex extractors take markup, so hand them the section's own HTML.
        let fragment = try candidate.outerHtml()
        try collectedEmails.append(contentsOf: extractEmailsWithRegex(html: fragment))
        try collectedPhones.append(contentsOf: extractPhonesWithRegex(html: fragment))
    }

    // The selected sections may overlap, so the same contact can appear twice.
    let uniqueEmails = Array(Set(collectedEmails))
    let uniquePhones = Array(Set(collectedPhones))
    return (uniqueEmails, uniquePhones)
}
2. Compile Regular Expressions Once
/// Extractor that compiles its regular expressions once and reuses them for
/// every document.
class OptimizedContactExtractor {
    private let emailRegex: NSRegularExpression
    private let phoneRegex: NSRegularExpression

    /// Compiles the email and phone patterns.
    ///
    /// - Throws: The error from `NSRegularExpression` if a pattern is invalid.
    init() throws {
        emailRegex = try NSRegularExpression(pattern: #"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"#)
        phoneRegex = try NSRegularExpression(pattern: #"\+?1?[-.\s]?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}"#)
    }

    /// Extracts every email and phone match from the visible text of `html`.
    ///
    /// - Parameter html: The HTML markup to analyse.
    /// - Returns: All matches in document order; duplicates are kept.
    /// - Throws: Any error raised by SwiftSoup.
    func extractContacts(from html: String) throws -> (emails: [String], phones: [String]) {
        let doc = try SwiftSoup.parse(html)
        let text = try doc.text()

        // Both searches cover the same text, so the range is computed once.
        let searchRange = NSRange(location: 0, length: text.utf16.count)

        let emailMatches = emailRegex.matches(in: text, options: [], range: searchRange)
        let phoneMatches = phoneRegex.matches(in: text, options: [], range: searchRange)

        let emails = emailMatches.compactMap { match -> String? in
            Range(match.range, in: text).map { String(text[$0]) }
        }

        // The optional separator near the start of the phone pattern can capture
        // the blank, "-" or "." that precedes a number (", (555) 987-6543" would
        // otherwise yield " (555) 987-6543"); strip it from each result.
        let separators = CharacterSet(charactersIn: "-.").union(.whitespacesAndNewlines)
        let phones = phoneMatches.compactMap { match -> String? in
            Range(match.range, in: text).map {
                String(text[$0]).trimmingCharacters(in: separators)
            }
        }

        return (emails, phones)
    }
}
Integration with Web Scraping Workflows
When building comprehensive web scraping solutions, you might need to handle dynamic content that loads after the initial page load, or navigate to different pages before extracting contact information. SwiftSoup only parses the HTML it is given and does not execute JavaScript, so it works best as a post-processing step once the content has been fully loaded and rendered by another tool.
Complete Example Usage
import SwiftSoup
import Foundation
// Complete example combining all methods
/// Demonstrates the extractor on a small inline HTML sample and prints the
/// extracted and validated contacts to standard output.
func main() {
    let sampleMarkup = """
    <html>
    <body>
    <div class="contact-info">
    <p>Email us at <a href="mailto:support@example.com">support@example.com</a></p>
    <p>Call us at <a href="tel:+1-555-123-4567">+1 (555) 123-4567</a></p>
    <div class="staff">
    <p>John Doe: john.doe@company.com, (555) 987-6543</p>
    <p>Jane Smith: jane.smith@company.com, 555.123.7890</p>
    </div>
    </div>
    </body>
    </html>
    """

    do {
        let contactExtractor = try OptimizedContactExtractor()
        let extracted = try contactExtractor.extractContacts(from: sampleMarkup)

        print("Extracted Emails:")
        for email in extracted.emails {
            print(" \(email)")
        }

        print("\nExtracted Phones:")
        for phone in extracted.phones {
            print(" \(phone)")
        }

        // Report how many of the raw matches survive validation.
        let validated = validateAndFilterContacts(emails: extracted.emails, phones: extracted.phones)
        print("\nValid Emails: \(validated.validEmails.count)")
        print("Valid Phones: \(validated.validPhones.count)")
    } catch {
        print("Error: \(error)")
    }
}
Conclusion
SwiftSoup provides powerful capabilities for extracting phone numbers and email addresses from HTML content. By combining CSS selectors for structured data with regular expressions for unstructured text, you can achieve comprehensive contact information extraction. Remember to validate your results and consider performance optimizations when processing large volumes of data.
The key to successful contact extraction is understanding your target content structure and choosing the appropriate method—CSS selectors for consistent HTML patterns and regular expressions for free-form text extraction.