Can I use SwiftSoup to modify HTML content before extracting data?

Yes, SwiftSoup provides comprehensive HTML modification capabilities that allow you to manipulate HTML content before extracting data. This powerful feature is particularly useful for cleaning up malformed HTML, removing unwanted elements, adding missing attributes, or restructuring content to make data extraction more reliable and efficient.

Understanding SwiftSoup's Modification Capabilities

SwiftSoup is a pure Swift HTML parser that provides a jQuery-like API for HTML manipulation. Unlike read-only parsers, SwiftSoup allows you to modify the DOM structure, element attributes, and text content programmatically. This makes it an excellent choice for preprocessing HTML data before extraction.

Basic HTML Modification Operations

Modifying Element Text and HTML Content

You can easily modify the text content or inner HTML of elements:

import Foundation
import SwiftSoup

// Demonstrates replacing an element's text and a container's inner HTML.
do {
    let markup = """
    <div class="content">
        <h1>Old Title</h1>
        <p>Original paragraph text</p>
    </div>
    """

    let document = try SwiftSoup.parse(markup)

    // Swap the text of the first heading and the first paragraph.
    if let heading = try document.select("h1").first() {
        try heading.text("New Title")
    }
    if let paragraph = try document.select("p").first() {
        try paragraph.text("Updated paragraph text")
    }

    // Replace the container's inner HTML. The <h1> and <p> edited above live
    // inside .content, so they are replaced by this new markup.
    if let container = try document.select(".content").first() {
        try container.html("<h2>Completely New Content</h2><p>Fresh paragraph</p>")
    }

    let rendered = try document.html()
    print(rendered)
} catch {
    print("Error: \(error)")
}

Adding and Modifying Attributes

SwiftSoup allows you to add, modify, or remove attributes from HTML elements:

// Demonstrates setting, adding and removing attributes on selected elements.
do {
    let markup = "<img src='old-image.jpg' alt='Old Alt Text'><a href='#'>Link</a>"
    let document = try SwiftSoup.parse(markup)

    if let image = try document.select("img").first() {
        // Overwrite attributes that already exist.
        try image.attr("src", "new-image.jpg")
        try image.attr("alt", "New Alt Text")

        // Setting a key that is not present yet adds it.
        try image.attr("class", "responsive-image")

        // Drop an attribute entirely.
        try image.removeAttr("alt")
    }

    if let anchor = try document.select("a").first() {
        try anchor.attr("target", "_blank")
        try anchor.attr("rel", "noopener")
    }

    let rendered = try document.html()
    print(rendered)
} catch {
    print("Error: \(error)")
}

Advanced HTML Structure Manipulation

Adding New Elements

You can create and insert new elements into the DOM:

// Demonstrates creating new elements and attaching them to an existing container.
do {
    let html = "<div id='container'><p>Existing content</p></div>"
    let doc = try SwiftSoup.parse(html)

    // Create new elements
    let newHeading = try doc.createElement("h2")
    try newHeading.text("New Heading")
    try newHeading.attr("class", "dynamic-heading")

    let newParagraph = try doc.createElement("p")
    try newParagraph.html("This is a <strong>new paragraph</strong> with formatting")

    // Insert the element nodes themselves. Passing `outerHtml()` to
    // `prepend`/`append` would serialize each node and parse it again.
    if let container = try doc.select("#container").first() {
        try container.prependChild(newHeading)
        try container.appendChild(newParagraph)
    }

    print(try doc.html())
} catch {
    print("Error: \(error)")
}

Removing Unwanted Elements

Remove elements that might interfere with data extraction:

// Demonstrates stripping elements that get in the way of data extraction.
do {
    let html = """
    <article>
        <h1>Article Title</h1>
        <script>console.log('unwanted script');</script>
        <p>Main content here</p>
        <div class="ads">Advertisement</div>
        <p>More content</p>
        <style>.hidden { display: none; }</style>
    </article>
    """

    let doc = try SwiftSoup.parse(html)

    // Remove unwanted elements
    try doc.select("script").remove()
    try doc.select("style").remove()
    try doc.select(".ads").remove()

    // Remove elements hidden through an inline style. `*=` is a plain
    // substring match, so both common spellings ("display:none" and
    // "display: none") are listed explicitly.
    try doc.select("[style*='display:none'], [style*='display: none']").remove()

    print(try doc.html())
} catch {
    print("Error: \(error)")
}

Practical Data Extraction Preprocessing

Cleaning and Normalizing Data

Before extracting data, you might want to clean and normalize the HTML structure:

/// Parses `html` and normalizes it so that later extraction is more predictable.
///
/// The returned document has had the following applied, in order:
/// 1. Text of leaf elements (no child elements) among `p`, `h1`–`h6`, `span`
///    and `div` is trimmed.
/// 2. Root-relative `href` values (starting with a single `/`) are prefixed
///    with `baseUrl`.
/// 3. Images without an `alt` attribute receive one derived from the file name
///    in `src`.
/// 4. Empty `<p>` elements are removed.
///
/// - Parameters:
///   - html: Raw HTML to parse.
///   - baseUrl: Origin prepended to root-relative links. Defaults to
///     `"https://example.com"`.
/// - Returns: The parsed and normalized document.
/// - Throws: Any error raised by SwiftSoup while parsing, selecting or mutating.
func preprocessHTMLForExtraction(_ html: String, baseUrl: String = "https://example.com") throws -> Document {
    let doc = try SwiftSoup.parse(html)

    // Normalize whitespace in text content.
    // Only leaf elements are rewritten: setting an element's text replaces all
    // of its children, so doing this on a container (e.g. a <div> wrapping
    // paragraphs and images) would destroy the structure we want to extract.
    let textElements = try doc.select("p, h1, h2, h3, h4, h5, h6, span, div")
    for element in textElements where element.children().isEmpty() {
        let cleanText = try element.text().trimmingCharacters(in: .whitespacesAndNewlines)
        if !cleanText.isEmpty {
            try element.text(cleanText)
        }
    }

    // Convert root-relative URLs to absolute URLs.
    for link in try doc.select("a[href]") {
        let href = try link.attr("href")
        // "//host/path" is protocol-relative and already names a host, so it
        // must not have the base URL glued in front of it.
        if href.hasPrefix("/") && !href.hasPrefix("//") {
            try link.attr("href", baseUrl + href)
        }
    }

    // Add missing alt attributes to images, falling back to a generic label
    // when `src` cannot be parsed as a URL.
    for img in try doc.select("img:not([alt])") {
        let src = try img.attr("src")
        let filename = URL(string: src)?.lastPathComponent ?? "image"
        try img.attr("alt", "Image: \(filename)")
    }

    // Remove empty paragraphs
    try doc.select("p:empty").remove()

    return doc
}

// Usage: clean the markup first, then read values from the cleaned document.
do {
    let rawHTML = "<html><body><p>  Content  </p><img src='/image.jpg'><p></p></body></html>"
    let cleanDoc = try preprocessHTMLForExtraction(rawHTML)

    // Combined text of every paragraph in the cleaned document.
    let paragraphs = try cleanDoc.select("p")
    let content = try paragraphs.text()

    // Alt text of the first image, or an empty string when there is no image.
    let imageAlt: String
    if let firstImage = try cleanDoc.select("img").first() {
        imageAlt = try firstImage.attr("alt")
    } else {
        imageAlt = ""
    }

    print("Content: \(content)")
    print("Image Alt: \(imageAlt)")
} catch {
    print("Error: \(error)")
}

Restructuring HTML for Better Data Access

Sometimes you need to restructure HTML to make data extraction more straightforward:

/// Parses `html` and prepends a `div.product-structured` summary to `<body>`.
///
/// The summary holds up to three child `div`s (`product-name`,
/// `product-price`, `product-description`), each carrying the text of the
/// first matching element found in the original markup. Fields with no match
/// are omitted.
///
/// - Parameter html: Raw product-page HTML.
/// - Returns: The parsed document with the summary container inserted.
/// - Throws: Any error raised by SwiftSoup while parsing, selecting or mutating.
func restructureProductHTML(_ html: String) throws -> Document {
    let doc = try SwiftSoup.parse(html)

    // Move scattered product information into a structured container
    let productContainer = try doc.createElement("div")
    try productContainer.attr("class", "product-structured")

    // Collect product name
    try appendProductField(from: ".product-title, h1.title, .name",
                           in: doc, to: productContainer, cssClass: "product-name")

    // Collect price information. Selectors are tried one at a time so that
    // `.price` wins over `.cost`, which wins over `[data-price]`, regardless
    // of where they appear in the document.
    for selector in [".price", ".cost", "[data-price]"] {
        if try appendProductField(from: selector, in: doc, to: productContainer, cssClass: "product-price") {
            break
        }
    }

    // Collect description
    try appendProductField(from: ".description, .product-desc, p.desc",
                           in: doc, to: productContainer, cssClass: "product-description")

    // Insert the container node itself at the beginning of body. Serializing
    // it with `outerHtml()` and re-parsing through `prepend(_:)` is
    // unnecessary work and subjects the new nodes to output formatting.
    try doc.body()?.prependChild(productContainer)

    return doc
}

/// Appends `<div class="cssClass">` containing the text of the first element
/// matching `selector` to `container`.
///
/// - Returns: `true` when a matching element was found and a field appended.
@discardableResult
private func appendProductField(from selector: String,
                                in doc: Document,
                                to container: Element,
                                cssClass: String) throws -> Bool {
    guard let source = try doc.select(selector).first() else {
        return false
    }
    let field = try doc.createElement("div")
    try field.attr("class", cssClass)
    try field.text(source.text())
    try container.appendChild(field)
    return true
}

Working with Complex HTML Modifications

Conditional Modifications Based on Content

You can apply modifications conditionally based on the existing content:

/// Parses `html` and applies content-dependent modifications.
///
/// - External links (absolute `http…` links whose host is not
///   `internalDomain` or one of its subdomains) get the `external-link` class
///   and `rel="noopener nofollow"`.
/// - Every `<table>` is wrapped in `<div class="table-responsive">`.
/// - Every heading gets a `data-level` attribute and, when it has no `id`, a
///   generated one derived from its text.
///
/// - Parameters:
///   - html: Raw HTML to parse.
///   - internalDomain: Domain treated as "our own". Defaults to
///     `"yourdomain.com"`.
/// - Returns: The parsed and modified document.
/// - Throws: Any error raised by SwiftSoup while parsing, selecting or mutating.
func conditionallyModifyHTML(_ html: String, internalDomain: String = "yourdomain.com") throws -> Document {
    let doc = try SwiftSoup.parse(html)

    // Add warning class to external links
    for link in try doc.select("a[href]") {
        let href = try link.attr("href")
        guard href.hasPrefix("http"), !linkBelongs(href, toDomain: internalDomain) else {
            continue
        }
        // addClass keeps existing classes and does not add a duplicate when
        // the class is already present.
        try link.addClass("external-link")
        try link.attr("rel", "noopener nofollow")
    }

    // Add table wrapper for responsive tables
    for table in try doc.select("table") {
        try table.wrap("<div class=\"table-responsive\"></div>")
    }

    // Convert headings to consistent structure
    let existingIDs = try doc.select("[id]").map { try $0.attr("id") }
    var usedIDs = Set(existingIDs)

    for heading in try doc.select("h1, h2, h3, h4, h5, h6") {
        // The tag name is "h1"…"h6", so its last character is the level.
        try heading.attr("data-level", String(heading.tagName().suffix(1)))

        // Add anchor ids for navigation, leaving author-provided ids alone.
        guard try heading.attr("id").isEmpty else { continue }

        let slug = try heading.text().lowercased()
            .replacingOccurrences(of: " ", with: "-")
            .replacingOccurrences(of: "[^a-z0-9-]", with: "", options: .regularExpression)
        // A heading made only of punctuation or non-ASCII text yields an empty
        // slug; an empty id is useless as an anchor, so skip it.
        guard !slug.isEmpty else { continue }

        // Two headings with the same text must not share an id.
        var candidate = slug
        var suffix = 2
        while usedIDs.contains(candidate) {
            candidate = "\(slug)-\(suffix)"
            suffix += 1
        }
        usedIDs.insert(candidate)
        try heading.attr("id", candidate)
    }

    return doc
}

/// Returns whether `href` points at `domain` or one of its subdomains.
///
/// Compares the URL's host rather than searching the whole string, so a
/// domain that merely appears in a path or query does not count. Falls back
/// to a substring test when the host cannot be determined.
private func linkBelongs(_ href: String, toDomain domain: String) -> Bool {
    let normalizedDomain = domain.lowercased()
    guard let host = URL(string: href)?.host?.lowercased() else {
        return href.lowercased().contains(normalizedDomain)
    }
    return host == normalizedDomain || host.hasSuffix("." + normalizedDomain)
}

Integration with Data Extraction Workflows

SwiftSoup parses static HTML only; it does not execute JavaScript. If the data you need is rendered client-side after the page loads, first obtain the fully rendered HTML with a tool that can run JavaScript (for example, a web view or a headless browser), and then pass that HTML to SwiftSoup for modification and extraction.

Chaining Modifications and Extractions

/// A small fluent wrapper around a parsed SwiftSoup `Document`.
///
/// Mutating steps return `self` so they can be chained, and are marked
/// `@discardableResult` so they can also be called as plain statements.
/// All steps operate on the single document parsed in `init`, so later
/// extractions see the effects of earlier modifications.
final class HTMLProcessor {
    private let document: Document

    /// Parses `html` into the document that all later calls operate on.
    /// - Throws: Any error raised by `SwiftSoup.parse`.
    init(html: String) throws {
        self.document = try SwiftSoup.parse(html)
    }

    /// Removes every element matching `selector` from the document.
    @discardableResult
    func removeElements(_ selector: String) throws -> HTMLProcessor {
        try document.select(selector).remove()
        return self
    }

    /// Sets `attribute` to `value` on every element matching `selector`.
    @discardableResult
    func addAttribute(_ selector: String, attribute: String, value: String) throws -> HTMLProcessor {
        for element in try document.select(selector) {
            try element.attr(attribute, value)
        }
        return self
    }

    /// Trims and collapses whitespace in the text of every element matching
    /// `selector`.
    ///
    /// The normalized string is written back with `text(_:)`, which replaces
    /// the element's existing children; target leaf elements if nested markup
    /// must be preserved.
    @discardableResult
    func normalizeText(_ selector: String) throws -> HTMLProcessor {
        for element in try document.select(selector) {
            let normalized = try element.text()
                .trimmingCharacters(in: .whitespacesAndNewlines)
                .replacingOccurrences(of: "\\s+", with: " ", options: .regularExpression)
            try element.text(normalized)
        }
        return self
    }

    /// Returns the text of every element matching `selector`, in match order.
    func extract(_ selector: String) throws -> [String] {
        return try document.select(selector).map { try $0.text() }
    }

    /// Returns the value of `attribute` for every element matching `selector`.
    func extractAttribute(_ selector: String, attribute: String) throws -> [String] {
        return try document.select(selector).map { try $0.attr(attribute) }
    }

    /// Returns the underlying document in its current (modified) state.
    func getDocument() -> Document {
        return document
    }
}

// Usage example: clean the page once, then run several extractions against it.
do {
    let rawHTML = """
    <html><body>
        <div class="ads">Advertisement</div>
        <h1>  Product Title  </h1>
        <p class="price">$99.99</p>
        <script>tracking();</script>
        <p class="description">   Product description   </p>
    </body></html>
    """

    let processor = try HTMLProcessor(html: rawHTML)

    // Each step hands back the same processor, so the steps can be bound
    // one at a time instead of written as a single chain.
    let stripped = try processor.removeElements("script, .ads")
    let normalized = try stripped.normalizeText("h1, p")
    let tagged = try normalized.addAttribute("p", attribute: "data-processed", value: "true")

    let productTitles = try tagged.extract("h1")
    let prices = try processor.extract(".price")
    let descriptions = try processor.extract(".description")

    print("Titles: \(productTitles)")
    print("Prices: \(prices)")
    print("Descriptions: \(descriptions)")

} catch {
    print("Error: \(error)")
}

Best Practices for HTML Modification

Performance Considerations

When modifying large HTML documents, consider these performance optimization techniques:

/// Parses `html`, strips noise elements in one pass, and annotates price
/// elements with a digits-and-dots-only copy of their text.
///
/// - Parameter html: Raw HTML to parse.
/// - Returns: The parsed and modified document.
/// - Throws: Any error raised by SwiftSoup while parsing, selecting or mutating.
func optimizedHTMLModification(_ html: String) throws -> Document {
    let document = try SwiftSoup.parse(html)

    // One combined selector removes every unwanted element in a single query.
    try document.select("script, style, .ad, .popup").remove()

    // A narrow selector keeps the loop limited to the elements that matter.
    for priceNode in try document.select("[data-price], .price-value") {
        let rawPrice = try priceNode.text()
        // Keep only digits and dots from the displayed price.
        let digitsOnly = rawPrice.replacingOccurrences(
            of: "[^0-9.]",
            with: "",
            options: .regularExpression
        )
        try priceNode.attr("data-clean-price", digitsOnly)
    }

    return document
}

Error Handling and Validation

Always implement robust error handling when modifying HTML:

/// Parses `html` and marks every `.target-class` element with
/// `data-modified="true"`, reporting problems instead of throwing.
///
/// - Parameter html: Raw HTML to parse.
/// - Returns: The modified document; the unmodified document when no target
///   elements exist; or `nil` when the document has no body or an error occurs.
func safeHTMLModification(_ html: String) -> Document? {
    do {
        let doc = try SwiftSoup.parse(html)

        // Validate document structure before modification.
        // `body()` returns an optional and does not throw, so no `try` here.
        guard doc.body() != nil else {
            print("Invalid HTML: No body element found")
            return nil
        }

        // Safely modify elements
        let targetElements = try doc.select(".target-class")
        guard !targetElements.isEmpty() else {
            print("Warning: No target elements found")
            return doc
        }

        for element in targetElements {
            try element.attr("data-modified", "true")
        }

        return doc

    } catch Exception.Error(_, let message) {
        // SwiftSoup's `Exception` is an enum whose `Error` case carries the
        // message as an associated value; it is read by pattern matching.
        print("SwiftSoup error: \(message)")
        return nil
    } catch {
        print("Unexpected error: \(error)")
        return nil
    }
}

Working with Forms and Interactive Elements

Modifying Form Fields and Input Elements

SwiftSoup can modify form elements to prepare them for data extraction or automated submission:

/// Parses `html` and makes implicit form defaults explicit.
///
/// - Inputs without a `value` attribute get one: `""` for text inputs (and
///   inputs with no `type`), `"on"` for checkboxes and radio buttons.
/// - Required `input`, `select` and `textarea` elements get the
///   `required-field` class.
/// - `<option>` elements without a `value` attribute get their text as value.
///
/// - Parameter html: Raw HTML to parse.
/// - Returns: The parsed and modified document.
/// - Throws: Any error raised by SwiftSoup while parsing, selecting or mutating.
func preprocessFormElements(_ html: String) throws -> Document {
    let doc = try SwiftSoup.parse(html)

    // Add default values to form fields that have no value attribute
    for input in try doc.select("input:not([value])") {
        let inputType = try input.attr("type").lowercased()
        switch inputType {
        case "text", "":
            try input.attr("value", "")
        case "checkbox", "radio":
            // Do NOT write checked="false": `checked` is a boolean attribute,
            // so its mere presence (with any value) marks the control as
            // checked. Instead spell out the value such a control submits
            // when it has no `value` attribute.
            try input.attr("value", "on")
        default:
            break
        }
    }

    // Add required attribute indicators
    for field in try doc.select("input[required], select[required], textarea[required]") {
        try field.addClass("required-field")
    }

    // Normalize select options. Only options that lack the attribute are
    // touched; an explicit value="" (typical for a "Choose…" placeholder) is
    // intentional and must stay empty.
    for option in try doc.select("select option:not([value])") {
        try option.attr("value", option.text())
    }

    return doc
}

Advanced Manipulation Techniques

Working with CSS Classes and Styling

Modify CSS classes to standardize styling or add semantic meaning:

/// Parses `html` and applies a consistent set of CSS classes and attributes.
///
/// - Buttons get `btn` plus `btn-primary` (when `type="submit"`) or
///   `btn-secondary` (otherwise).
/// - Images get `img-responsive` and, unless already specified,
///   `loading="lazy"`.
/// - Absolute links that do not mention `yourdomain.com` get `external-link`
///   and `rel="noopener nofollow"`.
///
/// - Parameter html: Raw HTML to parse.
/// - Returns: The parsed and modified document.
/// - Throws: Any error raised by SwiftSoup while parsing, selecting or mutating.
func standardizeCSSClasses(_ html: String) throws -> Document {
    let doc = try SwiftSoup.parse(html)

    // Standardize button classes
    for button in try doc.select("button, input[type='button'], input[type='submit']") {
        try button.addClass("btn")

        let isSubmit = try button.attr("type") == "submit"
        try button.addClass(isSubmit ? "btn-primary" : "btn-secondary")
    }

    // Add responsive image classes
    for img in try doc.select("img") {
        try img.addClass("img-responsive")

        // Add lazy loading for performance, without overriding an explicit
        // choice. `hasAttr` does not throw, and Swift rejects `!try …`
        // ("'try' cannot appear to the right of a non-assignment operator").
        if !img.hasAttr("loading") {
            try img.attr("loading", "lazy")
        }
    }

    // Standardize link styling
    for link in try doc.select("a[href^='http']:not([href*='yourdomain.com'])") {
        try link.addClass("external-link")
        try link.attr("rel", "noopener nofollow")
    }

    return doc
}

Text Processing and Content Enhancement

Enhance text content for better data extraction:

/// Parses `html` and adds semantic markup around phone numbers and dates.
///
/// - Phone numbers of the form `123-456-7890` become `tel:` links.
/// - Dates of the form `M/D/YYYY` are wrapped in `<time>`.
///
/// - Parameter html: Raw HTML to parse.
/// - Returns: The parsed and modified document.
/// - Throws: Regex construction errors or any SwiftSoup error.
func enhanceTextContent(_ html: String) throws -> Document {
    let doc = try SwiftSoup.parse(html)
    let textSelector = "p, span, div:not(:has(*))"

    // Convert phone numbers to clickable links
    try wrapTextMatches(in: doc,
                        selector: textSelector,
                        pattern: "\\b\\d{3}-\\d{3}-\\d{4}\\b",
                        template: "<a href='tel:$0'>$0</a>")

    // Add semantic markup to dates.
    // NOTE(review): the matched text is copied into `datetime` as-is; the
    // attribute normally expects an ISO-style date — confirm whether
    // consumers need it converted.
    try wrapTextMatches(in: doc,
                        selector: textSelector,
                        pattern: "\\b\\d{1,2}/\\d{1,2}/\\d{4}\\b",
                        template: "<time datetime='$0'>$0</time>")

    return doc
}

/// Wraps every occurrence of `pattern` in the inner HTML of elements matching
/// `selector`, using an `NSRegularExpression` replacement `template` (`$0` is
/// the matched text).
///
/// All occurrences are replaced in a single regex pass, so a value that
/// appears several times is wrapped once per occurrence rather than being
/// re-wrapped inside markup inserted moments earlier. Occurrences inside a
/// tag (for example in an attribute value) are left untouched.
///
/// NOTE: text that is already inside an equivalent wrapper is not detected,
/// so running this twice over the same document wraps matches twice.
private func wrapTextMatches(in doc: Document,
                             selector: String,
                             pattern: String,
                             template: String) throws {
    let textRegex = try NSRegularExpression(pattern: pattern)
    // Same pattern, but rejected when the next angle bracket after the match
    // is a closing '>' — i.e. when the match sits inside a tag.
    let markupRegex = try NSRegularExpression(pattern: pattern + "(?![^<]*>)")

    // Selecting per pass means each pass works on the elements that are
    // currently in the document, not on nodes replaced by an earlier pass.
    for element in try doc.select(selector) {
        // Cheap pre-check on the visible text; skip elements with no match.
        let text = try element.text()
        let textRange = NSRange(text.startIndex..., in: text)
        guard textRegex.firstMatch(in: text, options: [], range: textRange) != nil else {
            continue
        }

        let markup = try element.html()
        let markupRange = NSRange(markup.startIndex..., in: markup)
        let rewritten = markupRegex.stringByReplacingMatches(in: markup,
                                                             options: [],
                                                             range: markupRange,
                                                             withTemplate: template)
        if rewritten != markup {
            try element.html(rewritten)
        }
    }
}

Real-World Use Cases

E-commerce Product Data Preprocessing

/// Parses a product page and annotates it for extraction.
///
/// - Removes banners, promotions, ads and popups.
/// - Adds `data-numeric-price` and the `standardized-price` class to price
///   elements whose text contains digits.
/// - Adds the `product-image` class to product images and a `data-large-src`
///   guess for thumbnail or small images.
/// - Adds the `structured-review` class to review elements and a
///   `data-rating` attribute holding the first number found in the review's
///   star/rating element.
///
/// - Parameter html: Raw product-page HTML.
/// - Returns: The parsed and annotated document.
/// - Throws: Any error raised by SwiftSoup while parsing, selecting or mutating.
func preprocessProductPage(_ html: String) throws -> Document {
    let doc = try SwiftSoup.parse(html)

    // Remove promotional banners and ads
    try doc.select(".banner, .promotion, .ad, .popup").remove()

    // Standardize price display
    for priceElement in try doc.select(".price, .cost, [class*='price']") {
        let numericPrice = try priceElement.text()
            .replacingOccurrences(of: "[^0-9.]", with: "", options: .regularExpression)
        if !numericPrice.isEmpty {
            try priceElement.attr("data-numeric-price", numericPrice)
            try priceElement.addClass("standardized-price")
        }
    }

    // Enhance product images with consistent attributes
    for img in try doc.select("img[src*='product'], .product-image img") {
        try img.addClass("product-image")

        // Add zoom functionality indicator: derive the large image URL by
        // swapping the size marker in the file path.
        let src = try img.attr("src")
        if src.contains("thumbnail") || src.contains("small") {
            let largeImageSrc = src
                .replacingOccurrences(of: "thumbnail", with: "large")
                .replacingOccurrences(of: "small", with: "large")
            try img.attr("data-large-src", largeImageSrc)
        }
    }

    // Structure product reviews
    for review in try doc.select(".review, [class*='review']") {
        try review.addClass("structured-review")

        // Extract rating if present
        guard let ratingElement = try review.select("[class*='star'], [class*='rating']").first() else {
            continue
        }
        let ratingText = try ratingElement.text()
        // Accept an optional fractional part so "4.5 out of 5" yields "4.5"
        // rather than being truncated to "4".
        if let match = ratingText.range(of: "\\d+(\\.\\d+)?", options: .regularExpression) {
            try review.attr("data-rating", String(ratingText[match]))
        }
    }

    return doc
}

Conclusion

SwiftSoup's HTML modification capabilities make it an excellent choice for preprocessing HTML content before data extraction. Whether you need to clean up malformed HTML, remove unwanted elements, restructure content, or normalize data formats, SwiftSoup provides the tools necessary to transform raw HTML into a more suitable format for reliable data extraction.

The key to successful HTML modification lies in understanding your specific use case, implementing proper error handling, and optimizing for performance when working with large documents. By combining modification techniques with targeted data extraction, you can build robust and efficient iOS applications that handle complex web scraping scenarios.

For scenarios that involve JavaScript-rendered content or authenticated sessions, SwiftSoup alone is not enough: use another component in your scraping pipeline to fetch the final HTML, then hand that HTML to SwiftSoup for cleanup and extraction.

Table of contents