How to Handle AJAX Requests in Puppeteer
Puppeteer provides powerful capabilities for handling AJAX requests, which is essential for scraping dynamic websites that load content asynchronously. This guide covers monitoring, intercepting, and waiting for AJAX requests.
Basic AJAX Request Monitoring
1. Setup and Installation
npm install puppeteer
2. Monitor All AJAX Requests
const puppeteer = require('puppeteer');

// Logs every XHR/fetch request and response made while loading a page.
(async () => {
  // Puppeteer reports XMLHttpRequest calls as 'xhr' and fetch() calls as 'fetch'.
  const isAjax = (request) => {
    const resourceType = request.resourceType();
    return resourceType === 'xhr' || resourceType === 'fetch';
  };

  // page.waitForTimeout() no longer exists in current Puppeteer releases,
  // so the pause is implemented with a plain timer.
  const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();

    // Enable request interception. From here on every request is paused
    // until the handler below calls continue() on it.
    await page.setRequestInterception(true);

    // Monitor AJAX requests
    page.on('request', (request) => {
      if (isAjax(request)) {
        console.log('AJAX Request:', {
          url: request.url(),
          method: request.method(),
          headers: request.headers(),
          postData: request.postData(),
        });
      }
      // Every intercepted request must be released, AJAX or not.
      request.continue();
    });

    // Monitor responses
    page.on('response', (response) => {
      if (isAjax(response.request())) {
        console.log('AJAX Response:', {
          url: response.url(),
          status: response.status(),
          headers: response.headers(),
        });
      }
    });

    await page.goto('https://example.com');
    await delay(5000); // Wait for AJAX requests
  } finally {
    // Close the browser even when navigation fails, so no Chromium process is left behind.
    await browser.close();
  }
})().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Intercepting and Modifying AJAX Requests
Block Specific AJAX Requests
// Blocks AJAX calls whose URL contains 'analytics' and lets everything else through.
// Requires page.setRequestInterception(true) to have been called beforehand.
page.on('request', (request) => {
  // Treat fetch() calls like XMLHttpRequest; checking only 'xhr' would let
  // fetch-based analytics calls slip through.
  const resourceType = request.resourceType();
  const isAjax = resourceType === 'xhr' || resourceType === 'fetch';

  if (isAjax && request.url().includes('analytics')) {
    // Block analytics requests
    request.abort();
    return;
  }

  if (isAjax) {
    console.log('Allowing AJAX request:', request.url());
  }
  request.continue();
});
Modify Request Headers
// Adds an Authorization header and a custom header to every AJAX request.
// Requires page.setRequestInterception(true) to have been called beforehand.
page.on('request', (request) => {
  // fetch() calls are reported as 'fetch', not 'xhr'; handle both so the
  // headers are applied to every AJAX call.
  const resourceType = request.resourceType();
  if (resourceType !== 'xhr' && resourceType !== 'fetch') {
    request.continue();
    return;
  }

  request.continue({
    headers: {
      // Keep the headers the page already set; the entries below override
      // any existing header with the same name.
      ...request.headers(),
      'Authorization': 'Bearer your-token',
      'Custom-Header': 'custom-value',
    },
  });
});
Waiting for AJAX Requests
Wait for Specific Network Activity
const puppeteer = require('puppeteer');

// Clicks a button and reads the JSON body of the AJAX response it triggers.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://example.com');

    // Start waiting and trigger the request together. Arming the waiter only
    // after navigation keeps its timeout from running during page load and
    // keeps it from matching a request made by the page load itself.
    // Promise.all also ensures a failed click cannot leave the waiter
    // rejecting unobserved.
    const [response] = await Promise.all([
      // Wait for specific AJAX endpoint
      page.waitForResponse(
        (candidate) => candidate.url().includes('/api/data') && candidate.status() === 200,
      ),
      // Trigger AJAX request
      page.click('#load-data-button'),
    ]);

    const data = await response.json();
    console.log('AJAX data:', data);
  } finally {
    // Close the browser even when navigation, the click or the wait fails.
    await browser.close();
  }
})().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Wait for Network Idle
// 'networkidle0': navigation counts as finished once there have been no
// open network connections for at least 500 ms.
const fullyIdle = { waitUntil: 'networkidle0' };
await page.goto('https://example.com', fullyIdle);

// 'networkidle2': the more lenient variant, which tolerates up to two
// connections that are still open (for example long-polling requests).
const nearlyIdle = { waitUntil: 'networkidle2' };
await page.goto('https://example.com', nearlyIdle);
Advanced AJAX Handling Techniques
Capture AJAX Response Data
// Parsed JSON bodies of captured AJAX responses, keyed by response URL.
// A later response for the same URL replaces the earlier entry.
const capturedData = {};

page.on('response', async (response) => {
  // fetch() calls are reported as 'fetch', not 'xhr'; accept both so that
  // API calls made with either mechanism are captured.
  const resourceType = response.request().resourceType();
  const isAjax = resourceType === 'xhr' || resourceType === 'fetch';
  if (!isAjax || !response.url().includes('/api/')) {
    return;
  }

  try {
    const data = await response.json();
    capturedData[response.url()] = data;
    console.log('Captured AJAX data:', data);
  } catch (error) {
    // response.json() rejects when the body cannot be read as JSON.
    console.log('Non-JSON response from:', response.url());
  }
});
Wait for Multiple AJAX Requests
/**
 * Waits until a 200 response has been seen for every URL fragment.
 *
 * @param {object} page - Object exposing `waitForResponse(predicate, options)`.
 * @param {string[]} urls - Substrings; each must appear in the URL of one awaited response.
 * @param {number} [timeout=30000] - Timeout in milliseconds passed to each individual wait.
 * @returns {Promise<Array>} The matching responses in the order of `urls`,
 *   or an empty array if any of the waits rejects.
 */
async function waitForMultipleRequests(page, urls, timeout = 30000) {
  // Register every waiter up front so they all run concurrently.
  const pending = [];
  for (const fragment of urls) {
    const matchesFragment = (response) =>
      response.url().includes(fragment) && response.status() === 200;
    pending.push(page.waitForResponse(matchesFragment, { timeout }));
  }

  try {
    return await Promise.all(pending);
  } catch (error) {
    // A single failed wait discards the whole batch.
    console.log('Some requests timed out:', error.message);
    return [];
  }
}
// Usage: wait for 200 responses whose URLs contain each of the three
// fragments below. `responses` is an empty array if any of the waits fails.
// NOTE(review): the waiters presumably only see responses that arrive after
// this call starts, so whatever triggers these requests has to happen while
// the call is pending — confirm against the surrounding code.
const responses = await waitForMultipleRequests(page, [
'/api/user',
'/api/posts',
'/api/comments'
]);
Handle Dynamic Content Loading
/**
 * Polls the page until the element matching `selector` shows real content.
 *
 * The element counts as loaded once its text content is non-empty and, after
 * trimming, is not the literal placeholder 'Loading...'.
 *
 * @param {object} page - Puppeteer page (only `page.$` is used).
 * @param {string} selector - CSS selector of the element to watch.
 * @param {number} [maxWait=10000] - Maximum time to keep polling, in milliseconds.
 * @param {number} [pollInterval=100] - Pause between two polls, in milliseconds.
 * @returns {Promise<string>} The element's text content, untrimmed.
 * @throws {Error} If the content has not loaded within `maxWait` milliseconds.
 */
async function waitForDynamicContent(page, selector, maxWait = 10000, pollInterval = 100) {
  const startTime = Date.now();

  while (Date.now() - startTime < maxWait) {
    const element = await page.$(selector);
    if (element) {
      try {
        const text = await element.evaluate((el) => el.textContent);
        if (text && text.trim() !== 'Loading...') {
          return text;
        }
      } finally {
        // Release the handle on every poll so repeated polling does not
        // accumulate element handles.
        await element.dispose();
      }
    }

    // page.waitForTimeout() no longer exists in current Puppeteer releases,
    // so the pause is implemented with a plain timer.
    await new Promise((resolve) => setTimeout(resolve, pollInterval));
  }

  throw new Error(`Content not loaded within ${maxWait}ms`);
}
// Usage: open the page, click the control that starts loading, then poll
// until #dynamic-content shows something other than the 'Loading...'
// placeholder. waitForDynamicContent throws if that does not happen within
// its default maximum wait.
await page.goto('https://example.com');
await page.click('#load-content');
const content = await waitForDynamicContent(page, '#dynamic-content');
Complete Example: E-commerce Product Scraper
const puppeteer = require('puppeteer');

// Scrolls a shop page to the bottom and collects the products delivered by
// its /api/products AJAX responses.
(async () => {
  // Upper bound on scroll steps so an endless feed cannot loop forever.
  const MAX_SCROLLS = 50;
  // Time given to the page to fetch and render the next batch after a scroll.
  const SCROLL_SETTLE_MS = 2000;

  // page.waitForTimeout() no longer exists in current Puppeteer releases,
  // so the pause is implemented with a plain timer.
  const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  const browser = await puppeteer.launch({ headless: false });
  try {
    const page = await browser.newPage();
    const products = [];

    // Intercept product data AJAX requests
    page.on('response', async (response) => {
      if (!response.url().includes('/api/products') || response.status() !== 200) {
        return;
      }
      try {
        const data = await response.json();
        // Report a payload without a products array as such, rather than
        // letting the spread below fail and be logged as a parse error.
        if (!data || !Array.isArray(data.products)) {
          console.log('Unexpected product payload from:', response.url());
          return;
        }
        products.push(...data.products);
        console.log(`Captured ${data.products.length} products`);
      } catch (error) {
        console.log('Error parsing product data:', error);
      }
    });

    await page.goto('https://example-shop.com');

    // Load more products by scrolling until the page stops growing.
    let previousHeight = 0;
    for (let scrolls = 0; scrolls < MAX_SCROLLS; scrolls += 1) {
      await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));

      // Wait for new content to load
      await delay(SCROLL_SETTLE_MS);

      const currentHeight = await page.evaluate(() => document.body.scrollHeight);
      if (currentHeight === previousHeight) break;
      previousHeight = currentHeight;
    }

    console.log(`Total products captured: ${products.length}`);
  } finally {
    // Close the browser even when navigation or scrolling fails.
    await browser.close();
  }
})().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Best Practices
- Always handle errors when parsing AJAX responses
- Set appropriate timeouts to avoid hanging scripts
- Use networkidle0 or networkidle2 for pages with heavy AJAX activity
- Monitor both XHR and Fetch requests for complete coverage
- Be respectful of rate limits when intercepting requests
- Clean up resources by closing browsers properly
This comprehensive approach to AJAX handling will help you effectively scrape dynamic websites and capture asynchronous data loading patterns.