How Do I Create Custom Parsing Rules for Specific Websites?

Creating custom parsing rules for specific websites is essential for effective web scraping, because every site has its own HTML structure and content patterns. Simple HTML DOM Parser provides the flexibility to create tailored extraction rules that handle diverse website layouts and data formats.

Understanding Custom Parsing Rules

Custom parsing rules are site-specific instructions that define how to extract data from particular websites. These rules account for each site's unique HTML structure, CSS classes, element hierarchies, and content patterns. By creating custom rules, you can ensure reliable data extraction even when websites have complex or unconventional layouts.

Basic Custom Parser Structure

Here's a foundational approach to creating custom parsing rules:

<?php
require_once 'simple_html_dom.php';

class CustomWebsiteParser {
    private $rules = [];

    public function addRule($domain, $selectors) {
        $this->rules[$domain] = $selectors;
    }

    public function parse($url, $html) {
        $domain = parse_url($url, PHP_URL_HOST);

        if (!isset($this->rules[$domain])) {
            throw new Exception("No parsing rules defined for domain: $domain");
        }

        $dom = str_get_html($html);

        // str_get_html() returns false on empty or oversized input
        if ($dom === false) {
            throw new Exception("Failed to parse HTML for: $url");
        }

        $rules = $this->rules[$domain];
        $result = [];

        foreach ($rules as $field => $selector) {
            $element = $dom->find($selector, 0);
            $result[$field] = $element ? trim($element->plaintext) : null;
        }

        $dom->clear();
        return $result;
    }
}

// Initialize parser
$parser = new CustomWebsiteParser();

// Define rules for news website
$parser->addRule('example-news.com', [
    'title' => 'h1.article-title',
    'author' => '.author-name',
    'date' => '.publish-date',
    'content' => '.article-body',
    'category' => '.category-tag'
]);

// Define rules for e-commerce site
$parser->addRule('example-shop.com', [
    'product_name' => 'h1.product-title',
    'price' => '.price-current',
    'description' => '.product-description',
    'availability' => '.stock-status',
    'rating' => '.rating-average'
]);
?>
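
With the rules in place, parsing a page is a matter of fetching its HTML and passing both to parse(). A minimal usage sketch (the URL is illustrative; substitute a page you are permitted to scrape):

<?php
// Fetch a page and run it through the rules registered for its domain.
$url = 'https://example-news.com/article/1';
$html = file_get_contents($url);

if ($html !== false) {
    $article = $parser->parse($url, $html);
    echo $article['title'] . ' by ' . $article['author'];
}
?>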

Advanced Rule Configuration

For more complex websites, you can create sophisticated parsing rules that handle multiple scenarios:

<?php
class AdvancedWebsiteParser {
    private $config = [];

    public function addConfig($domain, $config) {
        $this->config[$domain] = $config;
    }

    public function parseWithConfig($url, $html) {
        $domain = parse_url($url, PHP_URL_HOST);

        if (!isset($this->config[$domain])) {
            throw new Exception("No parsing configuration defined for domain: $domain");
        }

        $config = $this->config[$domain];
        $dom = str_get_html($html);
        $result = [];

        foreach ($config['fields'] as $field => $fieldConfig) {
            $result[$field] = $this->extractField($dom, $fieldConfig);
        }

        // Apply post-processing rules
        if (isset($config['post_process'])) {
            $result = $this->applyPostProcessing($result, $config['post_process']);
        }

        $dom->clear();
        return $result;
    }

    private function extractField($dom, $fieldConfig) {
        // Try multiple selectors until one works
        if (isset($fieldConfig['selectors'])) {
            foreach ($fieldConfig['selectors'] as $selector) {
                $element = $dom->find($selector, 0);
                if ($element) {
                    return $this->processElement($element, $fieldConfig);
                }
            }
        }

        return $fieldConfig['default'] ?? null;
    }

    private function processElement($element, $config) {
        $value = isset($config['attribute']) 
            ? $element->getAttribute($config['attribute'])
            : $element->plaintext;

        // Apply transformations
        if (isset($config['regex'])) {
            preg_match($config['regex'], $value, $matches);
            $value = $matches[1] ?? $value;
        }

        if (isset($config['trim'])) {
            $value = trim($value, $config['trim']);
        }

        return trim($value);
    }

    private function applyPostProcessing($data, $rules) {
        foreach ($rules as $rule) {
            if ($rule['type'] === 'date_format' && isset($data[$rule['field']])) {
                // Guard against strtotime() failing on unparseable dates
                $timestamp = strtotime($data[$rule['field']]);
                if ($timestamp !== false) {
                    $data[$rule['field']] = date($rule['format'], $timestamp);
                }
            } elseif ($rule['type'] === 'price_format' && isset($data[$rule['field']])) {
                $data[$rule['field']] = (float) preg_replace('/[^\d.]/', '', $data[$rule['field']]);
            }
        }
        return $data;
    }
}

// Configuration for complex news site
$parser = new AdvancedWebsiteParser();
$parser->addConfig('complex-news.com', [
    'fields' => [
        'title' => [
            'selectors' => ['h1.main-headline', '.article-title', 'h1'],
            'trim' => ' \t\n\r'
        ],
        'publish_date' => [
            'selectors' => ['.publish-time', '.article-date', 'time'],
            'attribute' => 'datetime',
            'default' => date('Y-m-d')
        ],
        'price' => [
            'selectors' => ['.price-display', '.cost'],
            'regex' => '/\$([0-9,]+\.?\d*)/',
            'default' => 0
        ]
    ],
    'post_process' => [
        ['type' => 'date_format', 'field' => 'publish_date', 'format' => 'Y-m-d'],
        ['type' => 'price_format', 'field' => 'price']
    ]
]);
?>
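
A short usage sketch with inline HTML, so the selector fallbacks and post-processing steps are visible (the markup and values are illustrative):

<?php
$html = '<html><body>'
      . '<h1 class="main-headline"> Breaking News </h1>'
      . '<time class="publish-time" datetime="2024-01-15">Jan 15</time>'
      . '<span class="price-display">$1,299.99</span>'
      . '</body></html>';

$data = $parser->parseWithConfig('https://complex-news.com/story', $html);
print_r($data);
// ['title' => 'Breaking News', 'publish_date' => '2024-01-15', 'price' => 1299.99]
?>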

JavaScript Implementation

For client-side parsing or Node.js applications, you can implement similar custom rules:

class CustomWebsiteParser {
    constructor() {
        this.rules = new Map();
    }

    addRule(domain, selectors) {
        this.rules.set(domain, selectors);
    }

    parse(url, document) {
        const domain = new URL(url).hostname;
        const rules = this.rules.get(domain);

        if (!rules) {
            throw new Error(`No parsing rules defined for domain: ${domain}`);
        }

        const result = {};

        for (const [field, selector] of Object.entries(rules)) {
            const element = document.querySelector(selector);
            result[field] = element ? element.textContent.trim() : null;
        }

        return result;
    }
}

// Usage example
const parser = new CustomWebsiteParser();

// Define rules for blog site
parser.addRule('example-blog.com', {
    title: 'h1.post-title',
    author: '.author-info .name',
    content: '.post-content',
    tags: '.tag-list .tag',
    comments_count: '.comments-counter'
});

// For dynamic content, you might need to handle AJAX requests
// Learn more about handling AJAX requests using Puppeteer: /faq/puppeteer/how-to-handle-ajax-requests-using-puppeteer

Handling Dynamic Content

Many modern websites load content dynamically, requiring special handling:

<?php
class DynamicContentParser extends CustomWebsiteParser {
    public function parseWithWait($url, $waitSelector = null) {
        // This would typically integrate with a headless browser
        // For complex SPAs, consider using Puppeteer instead

        $html = $this->fetchWithDelay($url, $waitSelector);
        return $this->parse($url, $html);
    }

    private function fetchWithDelay($url, $waitSelector) {
        // Simulate waiting for content to load; in a real implementation,
        // $waitSelector would be passed to a headless browser such as Puppeteer
        $context = stream_context_create([
            'http' => [
                'timeout' => 30,
                'user_agent' => 'Mozilla/5.0 (compatible; CustomParser/1.0)'
            ]
        ]);

        return file_get_contents($url, false, $context);
    }
}
?>
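
In practice, the fetch step is usually delegated to a headless browser. One hedged way to wire that up is to shell out to a small Node script; render.js here is a hypothetical Puppeteer script that loads the URL, waits for the selector, and prints the rendered HTML to stdout:

<?php
// render.js is hypothetical: a Puppeteer script that prints rendered HTML.
function fetchRendered($url, $waitSelector) {
    $cmd = 'node render.js ' . escapeshellarg($url) . ' ' . escapeshellarg($waitSelector);
    $html = shell_exec($cmd);

    if ($html === null || $html === '') {
        throw new Exception("Headless render failed for: $url");
    }

    return $html;
}
?>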

Error Handling and Fallbacks

Robust custom parsing rules should include comprehensive error handling:

<?php
class RobustWebsiteParser extends AdvancedWebsiteParser {
    public function parseWithFallbacks($url, $html) {
        try {
            return parent::parseWithConfig($url, $html);
        } catch (Exception $e) {
            return $this->applyFallbackRules($url, $html);
        }
    }

    private function applyFallbackRules($url, $html) {
        $dom = str_get_html($html);

        // If even the raw HTML cannot be parsed, return an empty result
        if ($dom === false) {
            return [];
        }

        $result = [];

        // Generic fallback selectors
        $fallbacks = [
            'title' => ['h1', '.title', '[class*="title"]', 'title'],
            'content' => ['.content', '.article', '.post', 'main'],
            'date' => ['time', '.date', '[class*="date"]']
        ];

        foreach ($fallbacks as $field => $selectors) {
            foreach ($selectors as $selector) {
                $element = $dom->find($selector, 0);
                if ($element && !empty(trim($element->plaintext))) {
                    $result[$field] = trim($element->plaintext);
                    break;
                }
            }
        }

        $dom->clear();
        return $result;
    }
}
?>
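
Illustrative usage: when no configuration exists for a domain (or the configured parse throws), the generic fallbacks take over:

<?php
$robust = new RobustWebsiteParser();
$html = '<html><body><h1>Some Headline</h1><main>Body text</main></body></html>';

// No config was added for this domain, so applyFallbackRules() runs.
$data = $robust->parseWithFallbacks('https://unknown-site.com/page', $html);
print_r($data); // ['title' => 'Some Headline', 'content' => 'Body text']
?>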

Testing Custom Rules

Always test your parsing rules thoroughly:

<?php
class ParserTester {
    private $parser;
    private $testCases = [];

    public function __construct($parser) {
        $this->parser = $parser;
    }

    public function addTestCase($url, $expectedFields) {
        $this->testCases[] = ['url' => $url, 'expected' => $expectedFields];
    }

    public function runTests() {
        $results = [];

        foreach ($this->testCases as $test) {
            $html = file_get_contents($test['url']);

            // Record fetch failures instead of passing false to the parser
            if ($html === false) {
                $results[] = ['url' => $test['url'], 'success' => false, 'parsed' => null];
                continue;
            }

            $parsed = $this->parser->parse($test['url'], $html);

            $success = true;
            foreach ($test['expected'] as $field => $expected) {
                if (!isset($parsed[$field]) || $parsed[$field] !== $expected) {
                    $success = false;
                    break;
                }
            }

            $results[] = [
                'url' => $test['url'],
                'success' => $success,
                'parsed' => $parsed
            ];
        }

        return $results;
    }
}

// Test your rules
$tester = new ParserTester($parser);
$tester->addTestCase('https://example-news.com/article/1', [
    'title' => 'Expected Article Title',
    'author' => 'John Doe'
]);

$testResults = $tester->runTests();
?>
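
The returned array can be turned into a quick pass/fail report:

<?php
// Summarize the outcomes collected by runTests().
foreach ($testResults as $result) {
    $status = $result['success'] ? 'PASS' : 'FAIL';
    echo "[$status] {$result['url']}\n";

    if (!$result['success']) {
        print_r($result['parsed']); // inspect what was actually extracted
    }
}
?>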

Best Practices for Custom Parsing Rules

  1. Start Simple: Begin with basic selectors and gradually add complexity
  2. Use Multiple Selectors: Provide fallback selectors for reliability
  3. Test Regularly: Websites change frequently, so test your rules often
  4. Handle Errors Gracefully: Always include error handling and fallbacks
  5. Document Your Rules: Maintain clear documentation of what each rule does
  6. Monitor Performance: Complex rules can slow down parsing (see the timing sketch below)
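
To keep an eye on the last point, wrap parse calls in a simple timer. A minimal sketch, assuming the $parser, $url, and $html variables from the earlier examples:

<?php
// Time a single parse and log anything unusually slow.
$start = microtime(true);
$data = $parser->parse($url, $html);
$elapsedMs = (microtime(true) - $start) * 1000;

if ($elapsedMs > 500) { // threshold is illustrative; tune to your workload
    error_log("Slow parse for $url: " . round($elapsedMs, 1) . " ms");
}
?>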

Integration with Other Tools

For websites with heavy JavaScript or complex interactions, consider combining Simple HTML DOM with more powerful tools. You can navigate to different pages using Puppeteer and let it render the dynamic content, then hand the resulting HTML to Simple HTML DOM for the actual parsing.
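
A minimal sketch of that hand-off, assuming a separate Puppeteer step has already saved the rendered page to rendered.html (a hypothetical path):

<?php
require_once 'simple_html_dom.php';

// rendered.html is assumed to be written by a prior Puppeteer step.
$html = file_get_contents('rendered.html');

if ($html !== false) {
    $parser = new CustomWebsiteParser();
    $parser->addRule('example-news.com', ['title' => 'h1.article-title']);
    print_r($parser->parse('https://example-news.com/article/1', $html));
}
?>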

Conclusion

Creating custom parsing rules for specific websites requires understanding each site's unique structure and implementing flexible, robust extraction logic. By following the patterns shown above, you can build maintainable parsing systems that handle various website types while providing reliable data extraction. Remember to test your rules regularly and include proper error handling to ensure your scraping operations remain stable over time.

For complex single-page applications, you might also want to learn about how to crawl a single page application (SPA) using Puppeteer to complement your custom parsing rules with dynamic content handling capabilities.

Try WebScraping.AI for Your Web Scraping Needs

Looking for a powerful web scraping solution? WebScraping.AI provides an LLM-powered API that combines Chromium JavaScript rendering with rotating proxies for reliable data extraction.

Key Features:

  • AI-powered extraction: Ask questions about web pages or extract structured data fields
  • JavaScript rendering: Full Chromium browser support for dynamic content
  • Rotating proxies: Datacenter and residential proxies from multiple countries
  • Easy integration: Simple REST API with SDKs for Python, Ruby, PHP, and more
  • Reliable & scalable: Built for developers who need consistent results

Getting Started:

Get page content with AI analysis:

curl "https://api.webscraping.ai/ai/question?url=https://example.com&question=What is the main topic?&api_key=YOUR_API_KEY"

Extract structured data:

curl "https://api.webscraping.ai/ai/fields?url=https://example.com&fields[title]=Page title&fields[price]=Product price&api_key=YOUR_API_KEY"
