How to handle AJAX requests using Puppeteer?

How to Handle AJAX Requests in Puppeteer

Puppeteer provides powerful capabilities for handling AJAX requests, which is essential for scraping dynamic websites that load content asynchronously. This guide covers monitoring, intercepting, and waiting for AJAX requests.

Basic AJAX Request Monitoring

1. Setup and Installation

npm install puppeteer

2. Monitor All AJAX Requests

const puppeteer = require('puppeteer');

// Logs every XHR/fetch request issued by the page, and each matching response,
// for five seconds after the initial navigation completes.
(async () => {
    const browser = await puppeteer.launch();

    try {
        const page = await browser.newPage();

        // XMLHttpRequest and fetch() report different resource types; treat both as AJAX.
        const isAjax = (request) => ['xhr', 'fetch'].includes(request.resourceType());

        // Enable request interception. From here on every request is paused
        // until the 'request' handler below continues it.
        await page.setRequestInterception(true);

        // Monitor AJAX requests
        page.on('request', (request) => {
            if (isAjax(request)) {
                console.log('AJAX Request:', {
                    url: request.url(),
                    method: request.method(),
                    headers: request.headers(),
                    postData: request.postData()
                });
            }
            // Non-AJAX requests must be released as well, otherwise the page never loads.
            request.continue();
        });

        // Monitor responses
        page.on('response', (response) => {
            if (isAjax(response.request())) {
                console.log('AJAX Response:', {
                    url: response.url(),
                    status: response.status(),
                    headers: response.headers()
                });
            }
        });

        await page.goto('https://example.com');

        // Give late AJAX requests time to fire. page.waitForTimeout() no longer
        // exists in current Puppeteer releases, so use a plain timer instead.
        await new Promise((resolve) => setTimeout(resolve, 5000));
    } finally {
        // Always release the browser process, even when navigation fails.
        await browser.close();
    }
})();

Intercepting and Modifying AJAX Requests

Block Specific AJAX Requests

// Blocks AJAX calls whose URL contains "analytics" and lets every other request through.
// Request interception must already be enabled on `page`
// (`await page.setRequestInterception(true)`) for abort()/continue() to work.
page.on('request', (request) => {
    const type = request.resourceType();
    // Cover both XMLHttpRequest ('xhr') and fetch() ('fetch') so that
    // fetch-based analytics calls are blocked too.
    const isAjax = type === 'xhr' || type === 'fetch';

    if (isAjax && request.url().includes('analytics')) {
        // Block analytics requests
        request.abort();
        return;
    }

    if (isAjax) {
        console.log('Allowing AJAX request:', request.url());
    }
    request.continue();
});

Modify Request Headers

// Adds authentication/custom headers to every AJAX request and forwards all
// other requests untouched.
// Request interception must already be enabled on `page`
// (`await page.setRequestInterception(true)`) for continue() to work.
// NOTE(review): the token is attached to AJAX calls to any origin; restrict by
// URL if third-party endpoints must not receive it.
page.on('request', (request) => {
    const type = request.resourceType();
    // Cover both XMLHttpRequest ('xhr') and fetch() ('fetch').
    const isAjax = type === 'xhr' || type === 'fetch';

    if (!isAjax) {
        request.continue();
        return;
    }

    request.continue({
        headers: {
            // Keep the original headers; the entries below override or extend them.
            ...request.headers(),
            'Authorization': 'Bearer your-token',
            'Custom-Header': 'custom-value'
        }
    });
});

Waiting for AJAX Requests

Wait for Specific Network Activity

const puppeteer = require('puppeteer');

// Clicks a button that triggers an AJAX call and prints the JSON payload of
// the matching response.
(async () => {
    const browser = await puppeteer.launch();

    try {
        const page = await browser.newPage();
        await page.goto('https://example.com');

        // Arm the waiter and trigger the request together: the listener is in
        // place before the click can produce a response, and its timeout only
        // starts counting once navigation has finished.
        const [response] = await Promise.all([
            // Wait for specific AJAX endpoint
            page.waitForResponse(response =>
                response.url().includes('/api/data') && response.status() === 200
            ),
            // Trigger AJAX request
            page.click('#load-data-button')
        ]);

        const data = await response.json();
        console.log('AJAX data:', data);
    } finally {
        // Always release the browser process, even when a step above throws.
        await browser.close();
    }
})();

Wait for Network Idle

// 'networkidle0': navigation is considered complete once there have been
// no open network connections for at least 500 ms.
const fullyIdle = { waitUntil: 'networkidle0' };
await page.goto('https://example.com', fullyIdle);

// 'networkidle2': navigation is considered complete once there have been
// no more than 2 open network connections for at least 500 ms.
const mostlyIdle = { waitUntil: 'networkidle2' };
await page.goto('https://example.com', mostlyIdle);

Advanced AJAX Handling Techniques

Capture AJAX Response Data

// Parsed JSON bodies of API responses, keyed by the full response URL.
// A later response for the same URL overwrites the earlier entry.
const capturedData = {};

page.on('response', async (response) => {
    const type = response.request().resourceType();
    // Cover both XMLHttpRequest ('xhr') and fetch() ('fetch') responses.
    const isAjax = type === 'xhr' || type === 'fetch';

    if (!isAjax || !response.url().includes('/api/')) {
        return;
    }

    try {
        const data = await response.json();
        capturedData[response.url()] = data;
        console.log('Captured AJAX data:', data);
    } catch (error) {
        // response.json() rejects when the body is missing or is not valid JSON.
        console.log('Non-JSON response from:', response.url());
    }
});

Wait for Multiple AJAX Requests

/**
 * Waits until a 200 response has been seen for every URL fragment in `urls`.
 *
 * @param {object} page - Object exposing `waitForResponse(predicate, options)`.
 * @param {string[]} urls - Substrings to look for in response URLs.
 * @param {number} [timeout=30000] - Per-request timeout in milliseconds, passed to `waitForResponse`.
 * @returns {Promise<object[]>} The matching responses in the order of `urls`,
 *   or an empty array if any of the waits rejects.
 */
async function waitForMultipleRequests(page, urls, timeout = 30000) {
    // Register every waiter up front so they all run concurrently.
    const pending = [];
    for (const fragment of urls) {
        const isMatch = (res) => res.url().includes(fragment) && res.status() === 200;
        pending.push(page.waitForResponse(isMatch, { timeout }));
    }

    try {
        return await Promise.all(pending);
    } catch (err) {
        // All-or-nothing: a single failed wait discards every result.
        console.log('Some requests timed out:', err.message);
        return [];
    }
}

// Usage: wait for the user, posts and comments endpoints together
const endpoints = ['/api/user', '/api/posts', '/api/comments'];
const responses = await waitForMultipleRequests(page, endpoints);

Handle Dynamic Content Loading

/**
 * Polls the page until the element matching `selector` holds real content,
 * i.e. non-empty text other than the "Loading..." placeholder.
 *
 * @param {object} page - Puppeteer page (only `page.$` is used).
 * @param {string} selector - CSS selector of the element to watch.
 * @param {number} [maxWait=10000] - Maximum time to keep polling, in milliseconds.
 * @param {number} [pollInterval=100] - Delay between two polls, in milliseconds.
 * @returns {Promise<string>} The element's text content, untrimmed.
 * @throws {Error} If no real content appears within `maxWait` milliseconds.
 */
async function waitForDynamicContent(page, selector, maxWait = 10000, pollInterval = 100) {
    const deadline = Date.now() + maxWait;

    while (Date.now() < deadline) {
        const element = await page.$(selector);
        if (element) {
            const text = await element.evaluate(el => el.textContent);
            if (text && text.trim() !== 'Loading...') {
                return text;
            }
        }
        // Plain timer instead of page.waitForTimeout(), which no longer exists
        // in current Puppeteer releases.
        await new Promise((resolve) => setTimeout(resolve, pollInterval));
    }

    throw new Error(`Content not loaded within ${maxWait}ms`);
}

// Usage: open the page, trigger the lazy load, then poll for the loaded text
const triggerSelector = '#load-content';
const contentSelector = '#dynamic-content';

await page.goto('https://example.com');
await page.click(triggerSelector);
const content = await waitForDynamicContent(page, contentSelector);

Complete Example: E-commerce Product Scraper

const puppeteer = require('puppeteer');

// Scrolls an infinite-scroll shop page to the bottom and collects every
// product returned by the '/api/products' AJAX calls made along the way.
(async () => {
    const browser = await puppeteer.launch({ headless: false });

    try {
        const page = await browser.newPage();

        // Plain timer helper; page.waitForTimeout() no longer exists in
        // current Puppeteer releases.
        const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

        const products = [];

        // Intercept product data AJAX requests
        page.on('response', async (response) => {
            if (response.url().includes('/api/products') && response.status() === 200) {
                try {
                    const data = await response.json();
                    products.push(...data.products);
                    console.log(`Captured ${data.products.length} products`);
                } catch (error) {
                    // Also reached when the payload has no iterable `products` field.
                    console.log('Error parsing product data:', error);
                }
            }
        });

        await page.goto('https://example-shop.com');

        // Load more products by scrolling
        let previousHeight = 0;
        while (true) {
            await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));

            // Wait for new content to load
            await sleep(2000);

            // An unchanged page height means the last scroll loaded nothing new.
            const currentHeight = await page.evaluate(() => document.body.scrollHeight);
            if (currentHeight === previousHeight) break;

            previousHeight = currentHeight;
        }

        console.log(`Total products captured: ${products.length}`);
    } finally {
        // Always release the browser process, even when a step above throws.
        await browser.close();
    }
})();

Best Practices

  1. Always handle errors when parsing AJAX responses
  2. Set appropriate timeouts to avoid hanging scripts
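  3. Use networkidle0 or networkidle2 to wait for a page's initial burst of AJAX activity to settle; prefer networkidle2 or an explicit waitForResponse when the page keeps connections open (polling, analytics beacons), since networkidle0 may then never be reached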
  4. Monitor both XHR and Fetch requests for complete coverage
  5. Be respectful of rate limits when intercepting requests
  6. Clean up resources by closing browsers properly

This comprehensive approach to AJAX handling will help you effectively scrape dynamic websites and capture asynchronous data loading patterns.

Related Questions

Get Started Now

WebScraping.AI provides rotating proxies, Chromium rendering and built-in HTML parser for web scraping
Icon