How Do I Extract Specific Text Patterns Using Simple HTML DOM?

Extracting specific text patterns from HTML documents is a common requirement in web scraping projects. Simple HTML DOM Parser provides several powerful methods to locate and extract text patterns using CSS selectors, regular expressions, and built-in filtering functions. This guide covers various techniques for pattern extraction with practical code examples.

Understanding Simple HTML DOM Pattern Extraction

Simple HTML DOM Parser offers multiple approaches for extracting text patterns:

CSS Selector-based extraction for structured content
Regular expression matching for complex patterns
Attribute-based filtering for specific element properties
Text content manipulation for clean data extraction

Basic Text Pattern Extraction

Installing Simple HTML DOM Parser

First, ensure you have Simple HTML DOM Parser installed:

<?php
// Download simple_html_dom.php from https://simplehtmldom.sourceforge.io/
require_once 'simple_html_dom.php';

Basic Pattern Extraction Example

<?php
require_once 'simple_html_dom.php';

// Load HTML content
$source_url = 'https://example.com';
$html = file_get_html($source_url);

// file_get_html() yields a falsy value when the page cannot be loaded;
// stop here instead of calling find() on a non-object below.
if (!$html) {
    throw new Exception("Failed to load HTML from: $source_url");
}

// Extract all email addresses from the page.
// Every element is scanned, so text nested inside several elements is
// matched more than once; duplicates are removed afterwards.
$emails = [];
foreach ($html->find('*') as $element) {
    $text = $element->plaintext;
    if (preg_match_all('/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/', $text, $matches)) {
        $emails = array_merge($emails, $matches[0]);
    }
}

// Remove duplicates and display results
$unique_emails = array_unique($emails);
print_r($unique_emails);
?>

CSS Selector-Based Pattern Extraction

Extracting Specific Element Patterns

Use CSS selectors to target elements containing specific text patterns:

<?php
// Extract all product prices
$prices = [];
foreach($html->find('.price, .cost, .amount') as $price_element) {
    $price_text = $price_element->plaintext;
    // Extract price pattern (currency symbol + numbers).
    // The "u" modifier makes PCRE treat £, € and ¥ as whole characters.
    // Without it the character class matches individual bytes, so on UTF-8
    // text only the last byte of the symbol is captured and the result is a
    // corrupted string. The text is expected to be valid UTF-8.
    if (preg_match('/[\$£€¥]\s*[\d,]+\.?\d*/u', $price_text, $match)) {
        $prices[] = trim($match[0]);
    }
}

// Extract phone numbers from contact sections
$phone_numbers = [];
foreach($html->find('.contact, .phone, .tel') as $contact_element) {
    $contact_text = $contact_element->plaintext;
    // Match various phone number formats: optional +1 prefix, optional
    // parentheses around the area code, and -, . or whitespace separators.
    $phone_pattern = '/(?:\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})/';
    if (preg_match_all($phone_pattern, $contact_text, $matches)) {
        // $matches[0] holds the full matched numbers (not the digit groups)
        $phone_numbers = array_merge($phone_numbers, $matches[0]);
    }
}
?>

Advanced CSS Selector Patterns

<?php
// Collect dates from article timestamps, preferring the datetime attribute
$dates = [];

// Text formats tried in this order; the first pattern that matches wins
$date_patterns = [
    '/\d{4}-\d{2}-\d{2}/',           // YYYY-MM-DD
    '/\d{1,2}\/\d{1,2}\/\d{4}/',     // MM/DD/YYYY
    '/\d{1,2}-\d{1,2}-\d{4}/',       // MM-DD-YYYY
    '/\w+\s+\d{1,2},?\s+\d{4}/'      // Month DD, YYYY
];

foreach ($html->find('time[datetime], .date, .timestamp') as $date_element) {
    $datetime_attr = $date_element->datetime;

    // A non-empty datetime attribute is taken as-is
    if ($datetime_attr) {
        $dates[] = $datetime_attr;
        continue;
    }

    // Otherwise fall back to scanning the element's visible text
    $date_text = $date_element->plaintext;
    foreach ($date_patterns as $pattern) {
        if (preg_match($pattern, $date_text, $match)) {
            $dates[] = trim($match[0]);
            break;
        }
    }
}
?>

Regular Expression Pattern Extraction

Common Text Patterns

Here are examples of extracting common text patterns:

<?php
/**
 * Runs regular-expression extractors over the text of a parsed document.
 *
 * The wrapped object must provide find($selector) returning elements that
 * expose a plaintext property. Every extractor scans all elements ('*'), so
 * nested text is seen more than once; results are de-duplicated with
 * array_unique(), which keeps the key of the first occurrence.
 */
class PatternExtractor {
    private $html;

    /**
     * @param object $html_content Parsed document to search.
     */
    public function __construct($html_content) {
        $this->html = $html_content;
    }

    /**
     * Extract http/https URLs that appear in the document text.
     *
     * Hyphens (plus ~, % and +) are accepted in the path, query and fragment
     * so that URLs such as https://example.com/my-page are not cut off at
     * the first hyphen.
     *
     * @return array Unique URLs.
     */
    public function extractUrls() {
        $url_pattern = '/https?:\/\/(?:[-\w.])+(?:[:\d]+)?(?:\/(?:[-\w\/.~%+])*(?:\?(?:[-\w&=%.+~])*)?(?:#(?:[-\w.])*)?)?/';

        return array_unique($this->collectMatches($url_pattern));
    }

    /**
     * Extract social security number patterns, masked for privacy.
     *
     * Matches NNN-NN-NNNN and "NNN NN NNNN"; only the last four digits are
     * kept, e.g. XXX-XX-1234.
     *
     * @return array Unique masked values.
     */
    public function extractSSNPatterns() {
        $ssn_pattern = '/\d{3}-\d{2}-\d{4}|\d{3}\s\d{2}\s\d{4}/';

        $masked_values = [];
        foreach ($this->collectMatches($ssn_pattern) as $ssn) {
            $masked_values[] = 'XXX-XX-' . substr($ssn, -4);
        }
        return array_unique($masked_values);
    }

    /**
     * Extract hashtags (# followed by ASCII letters, digits or underscores).
     *
     * @return array Unique hashtags, including the leading #.
     */
    public function extractHashtags() {
        $hashtag_pattern = '/#[a-zA-Z0-9_]+/';

        return array_unique($this->collectMatches($hashtag_pattern));
    }

    /**
     * Collect every full match of $pattern across all elements' text.
     *
     * @param string $pattern PCRE pattern, including delimiters.
     * @return array Matches in document order, duplicates included.
     */
    private function collectMatches($pattern) {
        $found = [];
        foreach ($this->html->find('*') as $element) {
            if (preg_match_all($pattern, $element->plaintext, $matches)) {
                $found = array_merge($found, $matches[0]);
            }
        }
        return $found;
    }
}

// Usage: wrap the parsed document held in $html, then run the extractors
// that are needed. Each call returns a de-duplicated array of matches.
$extractor = new PatternExtractor($html);
$urls = $extractor->extractUrls();
$hashtags = $extractor->extractHashtags();
?>

Advanced Pattern Extraction Techniques

Combining CSS Selectors with Regex

<?php
/**
 * Extract product codes (SKU / Model / Code / Part) from product sections.
 *
 * Each keyword must start at a word boundary and must not be followed
 * directly by another letter. This keeps words such as "Models", "Partner"
 * or "Barcode" from being read as a keyword and yielding a bogus code.
 * Matching is case-insensitive.
 *
 * @param object $html Parsed document providing find($selector).
 * @return array Unique codes; keys are those of the first occurrences.
 */
function extractProductCodes($html) {
    // Loop-invariant, so built once rather than per section
    $patterns = [
        '/\bSKU(?![A-Z])\s*:?\s*([A-Z0-9-]+)/i',           // SKU: ABC-123
        '/\bModel(?![A-Z])\s*:?\s*([A-Z0-9-]+)/i',         // Model: XYZ-456
        '/\bCode(?![A-Z])\s*:?\s*([A-Z0-9-]+)/i',          // Code: DEF-789
        '/\bPart(?![A-Z])\s*#?\s*:?\s*([A-Z0-9-]+)/i'      // Part #: GHI-012
    ];

    $product_codes = [];

    // Look in product listing areas only
    foreach ($html->find('.product, .item, .listing, [data-product]') as $section) {
        $text = $section->plaintext;

        foreach ($patterns as $pattern) {
            if (preg_match_all($pattern, $text, $matches)) {
                // Group 1 is the code itself, without the keyword prefix
                $product_codes = array_merge($product_codes, $matches[1]);
            }
        }
    }

    return array_unique($product_codes);
}

/**
 * Pull version numbers out of version/release/download elements.
 *
 * Recognises MAJOR.MINOR with an optional .PATCH and an optional
 * -suffix; a leading "v" is allowed but is not part of the result.
 *
 * @param object $html Parsed document providing find($selector).
 * @return array Unique version strings.
 */
function extractVersionNumbers($html) {
    $version_pattern = '/v?(\d+\.\d+(?:\.\d+)?(?:-[a-zA-Z0-9]+)?)/';
    $collected = [];

    foreach ($html->find('.version, .release, .download') as $node) {
        if (!preg_match_all($version_pattern, $node->plaintext, $hits)) {
            continue;
        }

        // Group 1 holds the version without the optional "v" prefix
        foreach ($hits[1] as $version) {
            $collected[] = $version;
        }
    }

    return array_unique($collected);
}
?>

Pattern Extraction with Context

<?php
/**
 * Extract prices together with the name of the product they belong to.
 *
 * For every .product/.item element, the first name-like child and the first
 * price-like child are read. A product is reported only when both a name
 * and a price were found.
 *
 * @param object $html Parsed document providing find($selector).
 * @return array List of ['name' => string, 'price' => string] entries.
 */
function extractPricesWithContext($html) {
    $price_data = [];

    foreach($html->find('.product, .item') as $product) {
        $product_name = '';
        $price = '';

        // Try to find product name
        $name_element = $product->find('.name, .title, h1, h2, h3', 0);
        if ($name_element) {
            $product_name = trim($name_element->plaintext);
        }

        // Try to find price
        $price_element = $product->find('.price, .cost, .amount', 0);
        if ($price_element) {
            $price_text = $price_element->plaintext;
            // "u" modifier: treat £, € and ¥ as whole characters. Without it
            // the class matches single bytes and only the last byte of a
            // multibyte symbol is captured. The text is expected to be UTF-8.
            if (preg_match('/[\$£€¥]\s*[\d,]+\.?\d*/u', $price_text, $match)) {
                $price = trim($match[0]);
            }
        }

        // Compare against '' explicitly so a product literally named "0"
        // is not discarded by a truthiness check.
        if ($product_name !== '' && $price !== '') {
            $price_data[] = [
                'name' => $product_name,
                'price' => $price
            ];
        }
    }

    return $price_data;
}
?>

Cleaning and Validating Extracted Patterns

Text Cleaning Functions

<?php
/**
 * Static helpers that normalise raw pattern matches.
 */
class TextCleaner {
    /**
     * Lower-case and trim each address, keeping only those accepted by
     * FILTER_VALIDATE_EMAIL.
     *
     * @param array $emails Raw matches.
     * @return array Unique valid addresses.
     */
    public static function cleanEmails($emails) {
        $accepted = [];

        foreach ($emails as $candidate) {
            $candidate = trim(strtolower($candidate));
            if (!filter_var($candidate, FILTER_VALIDATE_EMAIL)) {
                continue;
            }
            $accepted[] = $candidate;
        }

        return array_unique($accepted);
    }

    /**
     * Reformat US phone numbers as (XXX) XXX-XXXX.
     *
     * Accepts values with exactly 10 digits, or 11 digits starting with the
     * country code 1; everything else is dropped.
     *
     * @param array $phones Raw matches.
     * @return array Unique formatted numbers.
     */
    public static function cleanPhoneNumbers($phones) {
        $formatted_numbers = [];

        foreach ($phones as $raw) {
            // Keep digits only
            $digits = preg_replace('/\D/', '', $raw);
            $length = strlen($digits);

            if ($length === 11 && $digits[0] === '1') {
                // Drop the leading country code
                $digits = substr($digits, 1);
            } elseif ($length !== 10) {
                continue;
            }

            $formatted_numbers[] = self::formatTenDigits($digits);
        }

        return array_unique($formatted_numbers);
    }

    /**
     * Turn price strings into floats by stripping everything except digits
     * and dots; values that are not numeric afterwards are skipped.
     *
     * @param array $prices Raw matches.
     * @return array Floats, in input order (duplicates kept).
     */
    public static function cleanPrices($prices) {
        $amounts = [];

        foreach ($prices as $raw) {
            $stripped = preg_replace('/[^\d.]/', '', $raw);
            if (!is_numeric($stripped)) {
                continue;
            }
            $amounts[] = floatval($stripped);
        }

        return $amounts;
    }

    /**
     * Lay out a 10-digit string as (XXX) XXX-XXXX.
     */
    private static function formatTenDigits($digits) {
        $area = substr($digits, 0, 3);
        $exchange = substr($digits, 3, 3);
        $line = substr($digits, 6, 4);

        return sprintf('(%s) %s-%s', $area, $exchange, $line);
    }
}
?>

Error Handling and Best Practices

Robust Pattern Extraction

<?php
/**
 * Load a page and run a set of named selector/regex extractions on it.
 *
 * Each entry of $patterns is keyed by a result name and holds:
 *   - 'selector': CSS selector passed to find()
 *   - 'pattern':  PCRE pattern applied to each matched element's plaintext
 *   - 'cleaner':  (optional) callable that receives the list of matches and
 *                 returns the cleaned list
 *
 * @param string $url      Page to load.
 * @param array  $patterns Extraction configuration, see above.
 * @return array|false Results keyed by pattern name, or false when the page
 *                     could not be loaded (the error is logged).
 */
function safePatternExtraction($url, $patterns) {
    try {
        // Set context options for better reliability
        $context = stream_context_create([
            'http' => [
                'timeout' => 30,
                'user_agent' => 'Mozilla/5.0 (compatible; WebScraper/1.0)'
            ]
        ]);

        $html = file_get_html($url, false, $context);

        if (!$html) {
            throw new Exception("Failed to load HTML from: $url");
        }

        $results = [];

        foreach($patterns as $name => $config) {
            $results[$name] = [];

            try {
                $elements = $html->find($config['selector']);

                foreach($elements as $element) {
                    $text = $element->plaintext;

                    if (preg_match_all($config['pattern'], $text, $matches)) {
                        $results[$name] = array_merge($results[$name], $matches[0]);
                    }
                }

                // Apply the cleaner if one is configured. is_callable() is
                // required here: function_exists() only knows plain function
                // names and returns false for "Class::method" strings, so
                // static-method cleaners would never run.
                if (isset($config['cleaner']) && is_callable($config['cleaner'])) {
                    $results[$name] = call_user_func($config['cleaner'], $results[$name]);
                }

                $results[$name] = array_unique($results[$name]);

            } catch (Exception $e) {
                // One failing pattern must not discard the other results
                error_log("Pattern extraction error for '$name': " . $e->getMessage());
                $results[$name] = [];
            }
        }

        // Release the parsed document
        $html->clear();
        unset($html);

        return $results;

    } catch (Exception $e) {
        error_log("Pattern extraction failed: " . $e->getMessage());
        return false;
    }
}

// Usage example: each entry names a result set and gives the selector to
// search, the regex to apply and a callable that cleans the matches.
$patterns = [
    'emails' => [
        'selector' => '*',
        'pattern' => '/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/',
        'cleaner' => 'TextCleaner::cleanEmails'
    ],
    'phones' => [
        'selector' => '.contact, .phone',
        'pattern' => '/(?:\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})/',
        'cleaner' => 'TextCleaner::cleanPhoneNumbers'
    ],
    'prices' => [
        'selector' => '.price, .cost',
        // "u" modifier: match £, € and ¥ as whole characters rather than
        // as single bytes of their UTF-8 encoding
        'pattern' => '/[\$£€¥]\s*[\d,]+\.?\d*/u',
        'cleaner' => 'TextCleaner::cleanPrices'
    ]
];

// Returns an array keyed by 'emails', 'phones' and 'prices', or false when
// the page could not be loaded
$extracted_data = safePatternExtraction('https://example.com', $patterns);
?>

JavaScript Integration for Dynamic Content

While Simple HTML DOM Parser works great with static HTML, some websites require JavaScript execution to load content. For these cases, you can combine it with tools like Puppeteer:

// Example: Using Puppeteer to get HTML, then process with Simple HTML DOM
const puppeteer = require('puppeteer');

/**
 * Render a page in headless Chrome and return its final HTML.
 *
 * @param {string} url Page to load.
 * @returns {Promise<string>} Serialized HTML after scripts have run.
 */
async function getHtmlForPatternExtraction(url) {
    const browser = await puppeteer.launch();

    try {
        const page = await browser.newPage();

        await page.goto(url, { waitUntil: 'networkidle2' });

        // Give late-loading dynamic content a fixed 2 second grace period.
        // A plain timer is used because page.waitForTimeout() is no longer
        // available in current Puppeteer releases.
        await new Promise((resolve) => setTimeout(resolve, 2000));

        return await page.content();
    } finally {
        // Always shut the browser down, even when navigation throws,
        // so no Chrome process is left running.
        await browser.close();
    }
}

// Use this HTML content with Simple HTML DOM Parser in PHP

For more complex scenarios involving dynamic content, consider using Puppeteer for handling JavaScript-rendered pages or managing authentication flows when dealing with protected content.

Performance Optimization Tips

When extracting patterns from large HTML documents:

Use specific CSS selectors to limit the scope of pattern matching
Define regular expression patterns once, outside of loops, instead of rebuilding the pattern string on every iteration
Process elements in chunks for large documents
Reuse identical pattern strings for frequently used patterns so PHP's internal compiled-pattern cache can avoid recompilation
Clean up DOM objects to prevent memory leaks

Batch Processing Example

<?php
/**
 * Run safePatternExtraction() over many URLs in fixed-size batches.
 *
 * Pauses half a second after every URL and triggers a garbage-collection
 * cycle after every batch. URLs whose extraction returns a falsy result
 * are left out of the output.
 *
 * @param array $urls       URLs to process.
 * @param array $patterns   Pattern configuration passed through unchanged.
 * @param int   $batch_size Number of URLs per batch.
 * @return array Extraction results keyed by URL.
 */
function batchPatternExtraction($urls, $patterns, $batch_size = 10) {
    $collected = [];

    foreach (array_chunk($urls, $batch_size) as $batch) {
        foreach ($batch as $target) {
            $outcome = safePatternExtraction($target, $patterns);
            if ($outcome) {
                $collected[$target] = $outcome;
            }

            // 0.5 seconds between requests, to be respectful to servers
            usleep(500000);
        }

        // Reclaim memory before starting the next batch
        gc_collect_cycles();
    }

    return $collected;
}
?>

Common Pattern Examples

Credit Card Numbers (for PCI compliance checking)

<?php
/**
 * Find 16-digit card-number patterns and return them masked.
 *
 * Matches four groups of four digits, optionally separated by whitespace
 * or hyphens. Only the last four digits survive in the result.
 *
 * @param object $html Parsed document providing find($selector).
 * @return array Unique masked values of the form XXXX-XXXX-XXXX-1234.
 */
function extractCreditCardPatterns($html) {
    $cc_regex = '/\b(?:\d{4}[\s-]?){3}\d{4}\b/';
    $masked_numbers = [];

    foreach ($html->find('*') as $node) {
        if (!preg_match_all($cc_regex, $node->plaintext, $hits)) {
            continue;
        }

        foreach ($hits[0] as $card_number) {
            // Strip separators first so the last four characters are digits
            $digits_only = preg_replace('/\D/', '', $card_number);
            $masked_numbers[] = 'XXXX-XXXX-XXXX-' . substr($digits_only, -4);
        }
    }

    return array_unique($masked_numbers);
}
?>

IPv4 Addresses

<?php
/**
 * Collect IPv4 addresses found in the document text.
 *
 * Each octet is limited to 0-255 by the pattern itself.
 *
 * @param object $html Parsed document providing find($selector).
 * @return array Unique addresses; keys are those of the first occurrences.
 */
function extractIPAddresses($html) {
    $ip_pattern = '/\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b/';
    $addresses = [];

    foreach ($html->find('*') as $node) {
        if (!preg_match_all($ip_pattern, $node->plaintext, $hits)) {
            continue;
        }

        foreach ($hits[0] as $address) {
            $addresses[] = $address;
        }
    }

    return array_unique($addresses);
}
?>

Conclusion

Simple HTML DOM Parser provides powerful capabilities for extracting specific text patterns from HTML documents. By combining CSS selectors with regular expressions and implementing proper error handling, you can create robust pattern extraction systems for various web scraping needs. Remember to always validate and clean extracted data, handle errors gracefully, and respect website terms of service when scraping content.

The techniques covered in this guide will help you efficiently extract emails, phone numbers, prices, dates, and custom patterns from any HTML content using Simple HTML DOM Parser's flexible API.

Table of contents