Running multiple pages in parallel with Puppeteer dramatically improves performance when scraping many URLs or performing bulk automation tasks. There are several approaches, depending on your programming language and use case.
JavaScript: Multiple Pages in One Browser
The most efficient approach is to open multiple pages within a single browser instance:
const puppeteer = require('puppeteer');

async function scrapeMultiplePages() {
  const browser = await puppeteer.launch();
  const urls = [
    'https://example.com',
    'https://example.org',
    'https://example.net'
  ];

  // Create multiple pages
  const pages = await Promise.all(
    urls.map(() => browser.newPage())
  );

  // Navigate all pages in parallel
  await Promise.all(
    pages.map((page, index) => page.goto(urls[index]))
  );

  // Extract data from all pages in parallel
  const results = await Promise.all(
    pages.map(async (page) => {
      const title = await page.title();
      const content = await page.$eval('body', el => el.textContent);
      return { title, url: page.url(), content: content.slice(0, 100) };
    })
  );

  await browser.close();
  return results;
}

scrapeMultiplePages()
  .then(results => console.log(results))
  .catch(console.error);
JavaScript: Pool of Browser Pages
For better resource management with many URLs, use a page pool:
const puppeteer = require('puppeteer');

class PagePool {
  constructor(browser, poolSize = 5) {
    this.browser = browser;
    this.poolSize = poolSize;
    this.pages = [];
    this.busy = new Set();
  }

  async init() {
    for (let i = 0; i < this.poolSize; i++) {
      const page = await this.browser.newPage();
      this.pages.push(page);
    }
  }

  async getPage() {
    const availablePage = this.pages.find(page => !this.busy.has(page));
    if (availablePage) {
      this.busy.add(availablePage);
      return availablePage;
    }
    // Wait briefly for a page to become available, then try again
    await new Promise(resolve => setTimeout(resolve, 100));
    return this.getPage();
  }

  releasePage(page) {
    this.busy.delete(page);
  }

  async close() {
    await Promise.all(this.pages.map(page => page.close()));
  }
}

async function scrapeWithPool(urls) {
  const browser = await puppeteer.launch();
  const pool = new PagePool(browser, 5);
  await pool.init();

  const results = await Promise.all(
    urls.map(async (url) => {
      const page = await pool.getPage();
      try {
        await page.goto(url);
        const title = await page.title();
        return { url, title };
      } finally {
        pool.releasePage(page);
      }
    })
  );

  await pool.close();
  await browser.close();
  return results;
}
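A quick usage sketch for the pool-based scraper; the URL list here is just a placeholder:

const urls = Array.from({ length: 20 }, (_, i) => `https://example.com/page${i}`);

scrapeWithPool(urls)
  .then(results => console.log(results))
  .catch(console.error);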
JavaScript: Concurrent Processing with Limits
For processing hundreds of URLs without overwhelming the system:
const puppeteer = require('puppeteer');

async function processBatch(urls, concurrency = 10) {
  const browser = await puppeteer.launch();
  const results = [];

  for (let i = 0; i < urls.length; i += concurrency) {
    const batch = urls.slice(i, i + concurrency);
    const batchResults = await Promise.all(
      batch.map(async (url) => {
        const page = await browser.newPage();
        try {
          await page.goto(url, { waitUntil: 'networkidle2' });
          const title = await page.title();
          const screenshot = await page.screenshot({ type: 'png' });
          return { url, title, screenshot };
        } catch (error) {
          console.error(`Error processing ${url}:`, error.message);
          return { url, error: error.message };
        } finally {
          await page.close();
        }
      })
    );
    results.push(...batchResults);
    console.log(`Processed batch ${Math.floor(i / concurrency) + 1}`);
  }

  await browser.close();
  return results;
}

// Usage
const urls = Array.from({ length: 50 }, (_, i) => `https://example.com/page${i}`);
processBatch(urls, 5).then(console.log);
Python with Pyppeteer
For Python users, Pyppeteer (an unofficial Python port of Puppeteer) provides similar functionality:
import asyncio
from pyppeteer import launch

async def scrape_page(browser, url):
    """Scrape a single page using a shared browser instance."""
    page = await browser.newPage()
    try:
        await page.goto(url)
        title = await page.title()
        content = await page.content()
        return {'url': url, 'title': title, 'content_length': len(content)}
    except Exception as e:
        return {'url': url, 'error': str(e)}
    finally:
        await page.close()

async def scrape_multiple_pages(urls):
    """Scrape multiple pages in parallel."""
    browser = await launch(headless=True)
    try:
        # Create tasks for all URLs
        tasks = [scrape_page(browser, url) for url in urls]
        # Execute all tasks in parallel
        results = await asyncio.gather(*tasks, return_exceptions=True)
        return results
    finally:
        await browser.close()

async def main():
    urls = [
        'https://example.com',
        'https://example.org',
        'https://example.net',
        'https://httpbin.org/delay/1',
        'https://httpbin.org/delay/2'
    ]
    results = await scrape_multiple_pages(urls)
    for result in results:
        if isinstance(result, dict) and 'error' not in result:
            print(f"✓ {result['title']} - {result['url']}")
        else:
            print(f"✗ Error: {result}")

# Run the scraper
asyncio.run(main())
Best Practices and Performance Tips
Resource Management
- Use page pools for large-scale scraping (5-10 concurrent pages)
- Limit concurrent pages to avoid memory issues (see the rolling-limit sketch after this list)
- Reuse browser instances instead of creating new ones for each page
- Close pages when done to free memory
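As an alternative to the fixed batches shown earlier (where every batch waits for its slowest page), a rolling limit keeps a constant number of pages busy. A minimal sketch, with the helper names and the limit of 5 chosen only for illustration:

const puppeteer = require('puppeteer');

// Run task(url) for every URL, with at most `limit` tasks in flight at once.
// Results are collected in completion order, not input order.
async function runWithLimit(urls, limit, task) {
  const results = [];
  let next = 0;

  // Each worker repeatedly pulls the next URL until the list is exhausted
  const workers = Array.from({ length: limit }, async () => {
    while (next < urls.length) {
      const url = urls[next++];
      results.push(await task(url));
    }
  });

  await Promise.all(workers);
  return results;
}

async function scrapeAll(urls) {
  const browser = await puppeteer.launch();
  try {
    return await runWithLimit(urls, 5, async (url) => {
      const page = await browser.newPage();
      try {
        await page.goto(url, { waitUntil: 'networkidle2' });
        return { url, title: await page.title() };
      } finally {
        await page.close();
      }
    });
  } finally {
    await browser.close();
  }
}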
Error Handling
async function robustScraping(urls) {
  const browser = await puppeteer.launch();
  const results = [];

  for (const url of urls) {
    const page = await browser.newPage();
    try {
      await page.goto(url, {
        waitUntil: 'networkidle2',
        timeout: 30000
      });
      const data = await page.evaluate(() => ({
        title: document.title,
        links: Array.from(document.links).length
      }));
      results.push({ url, success: true, data });
    } catch (error) {
      results.push({ url, success: false, error: error.message });
    } finally {
      await page.close();
    }
    // Small delay to avoid overwhelming the server
    await new Promise(resolve => setTimeout(resolve, 100));
  }

  await browser.close();
  return results;
}
Performance Optimization
- Disable images and CSS for faster loading: enable request interception with page.setRequestInterception(true) and abort those requests (see the sketch after this list)
- Use networkidle2 instead of networkidle0 so pages with long-polling or analytics connections don't stall the wait
- Set appropriate timeouts to handle slow pages
- Implement retry logic for failed requests
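A minimal sketch combining these tips: aborting images, stylesheets, and fonts via request interception, waiting on networkidle2 with an explicit timeout, and retrying failed navigations. The helper names, blocked resource types, and retry count are illustrative assumptions, not fixed recommendations:

const puppeteer = require('puppeteer');

// Open a page that skips heavy resources (images, CSS, fonts)
async function openLightweightPage(browser) {
  const page = await browser.newPage();
  await page.setRequestInterception(true);
  page.on('request', (request) => {
    const type = request.resourceType();
    if (type === 'image' || type === 'stylesheet' || type === 'font') {
      request.abort();
    } else {
      request.continue();
    }
  });
  return page;
}

// Navigate with a timeout and a simple retry loop
async function gotoWithRetries(page, url, retries = 2) {
  for (let attempt = 0; attempt <= retries; attempt++) {
    try {
      await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
      return;
    } catch (error) {
      if (attempt === retries) throw error;
      console.warn(`Retrying ${url} after error: ${error.message}`);
    }
  }
}

These helpers slot into any of the parallel patterns above: create pages with openLightweightPage and replace the bare page.goto call with gotoWithRetries.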
Running pages in parallel significantly reduces scraping time, but always consider the target server's capacity and implement proper rate limiting to avoid being blocked.