How to run multiple pages in parallel with Puppeteer?

Running multiple pages in parallel with Puppeteer dramatically reduces total run time when scraping many URLs or performing bulk automation tasks. There are several approaches, depending on your programming language and use case.

JavaScript: Multiple Pages in One Browser

The most efficient approach is to open multiple pages within a single browser instance:

const puppeteer = require('puppeteer');

async function scrapeMultiplePages() {
    const browser = await puppeteer.launch();

    const urls = [
        'https://example.com',
        'https://example.org',
        'https://example.net'
    ];

    // Create multiple pages
    const pages = await Promise.all(
        urls.map(() => browser.newPage())
    );

    // Navigate all pages in parallel
    await Promise.all(
        pages.map((page, index) => page.goto(urls[index]))
    );

    // Extract data from all pages in parallel
    const results = await Promise.all(
        pages.map(async (page) => {
            const title = await page.title();
            const content = await page.$eval('body', el => el.textContent);
            return { title, url: page.url(), content: content.slice(0, 100) };
        })
    );

    await browser.close();
    return results;
}

scrapeMultiplePages()
    .then(results => console.log(results))
    .catch(console.error);

JavaScript: Pool of Browser Pages

For better resource management with many URLs, use a page pool:

const puppeteer = require('puppeteer');

class PagePool {
    constructor(browser, poolSize = 5) {
        this.browser = browser;
        this.poolSize = poolSize;
        this.pages = [];
        this.busy = new Set();
    }

    async init() {
        for (let i = 0; i < this.poolSize; i++) {
            const page = await this.browser.newPage();
            this.pages.push(page);
        }
    }

    async getPage() {
        const availablePage = this.pages.find(page => !this.busy.has(page));
        if (availablePage) {
            this.busy.add(availablePage);
            return availablePage;
        }
        // Wait for a page to become available
        await new Promise(resolve => setTimeout(resolve, 100));
        return this.getPage();
    }

    releasePage(page) {
        this.busy.delete(page);
    }

    async close() {
        await Promise.all(this.pages.map(page => page.close()));
    }
}

async function scrapeWithPool(urls) {
    const browser = await puppeteer.launch();
    const pool = new PagePool(browser, 5);
    await pool.init();

    const results = await Promise.all(
        urls.map(async (url) => {
            const page = await pool.getPage();
            try {
                await page.goto(url);
                const title = await page.title();
                return { url, title };
            } finally {
                pool.releasePage(page);
            }
        })
    );

    await pool.close();
    await browser.close();
    return results;
}

JavaScript: Concurrent Processing with Limits

For processing hundreds of URLs without overwhelming the system:

const puppeteer = require('puppeteer');

async function processBatch(urls, concurrency = 10) {
    const browser = await puppeteer.launch();
    const results = [];

    for (let i = 0; i < urls.length; i += concurrency) {
        const batch = urls.slice(i, i + concurrency);

        const batchResults = await Promise.all(
            batch.map(async (url) => {
                const page = await browser.newPage();
                try {
                    await page.goto(url, { waitUntil: 'networkidle2' });
                    const title = await page.title();
                    const screenshot = await page.screenshot({ type: 'png' });
                    return { url, title, screenshot };
                } catch (error) {
                    console.error(`Error processing ${url}:`, error.message);
                    return { url, error: error.message };
                } finally {
                    await page.close();
                }
            })
        );

        results.push(...batchResults);
        console.log(`Processed batch ${Math.floor(i/concurrency) + 1}`);
    }

    await browser.close();
    return results;
}

// Usage
const urls = Array.from({length: 50}, (_, i) => `https://example.com/page${i}`);
processBatch(urls, 5).then(console.log);

Python with Pyppeteer

For Python users, Pyppeteer, an unofficial Python port of Puppeteer, provides similar functionality:

import asyncio
from pyppeteer import launch

async def scrape_page(browser, url):
    """Scrape a single page using a shared browser instance"""
    page = await browser.newPage()
    try:
        await page.goto(url)
        title = await page.title()
        content = await page.content()
        return {'url': url, 'title': title, 'content_length': len(content)}
    except Exception as e:
        return {'url': url, 'error': str(e)}
    finally:
        await page.close()

async def scrape_multiple_pages(urls):
    """Scrape multiple pages in parallel"""
    browser = await launch(headless=True)

    try:
        # Create tasks for all URLs
        tasks = [scrape_page(browser, url) for url in urls]

        # Execute all tasks in parallel
        results = await asyncio.gather(*tasks, return_exceptions=True)

        return results
    finally:
        await browser.close()

async def main():
    urls = [
        'https://example.com',
        'https://example.org',
        'https://example.net',
        'https://httpbin.org/delay/1',
        'https://httpbin.org/delay/2'
    ]

    results = await scrape_multiple_pages(urls)

    for result in results:
        if isinstance(result, dict) and 'error' not in result:
            print(f"✓ {result['title']} - {result['url']}")
        else:
            print(f"✗ Error: {result}")

# Run the scraper
asyncio.run(main())

Best Practices and Performance Tips

Resource Management

  • Use page pools for large-scale scraping (5-10 concurrent pages)
  • Limit concurrent pages to avoid memory issues (see the sketch after this list)
  • Reuse browser instances instead of creating new ones for each page
  • Close pages when done to free memory

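As a sketch of these points, the helper below keeps at most a fixed number of pages in flight inside one shared browser and closes each page as soon as its work is done. The mapWithLimit and scrapeWithLimit names are illustrative, not part of Puppeteer's API; unlike the fixed batches shown earlier, a new page starts as soon as a previous one finishes.

const puppeteer = require('puppeteer');

// Run `worker` over `items`, keeping at most `limit` tasks in flight at once.
async function mapWithLimit(items, limit, worker) {
    const results = new Array(items.length);
    let next = 0;
    const runner = async () => {
        while (next < items.length) {
            const index = next++; // claim the next item; no await between check and claim
            results[index] = await worker(items[index], index);
        }
    };
    await Promise.all(Array.from({ length: Math.min(limit, items.length) }, runner));
    return results;
}

async function scrapeWithLimit(urls, limit = 5) {
    const browser = await puppeteer.launch();
    try {
        return await mapWithLimit(urls, limit, async (url) => {
            const page = await browser.newPage();
            try {
                await page.goto(url, { waitUntil: 'networkidle2' });
                return { url, title: await page.title() };
            } catch (error) {
                return { url, error: error.message };
            } finally {
                await page.close(); // free memory as soon as the page is done
            }
        });
    } finally {
        await browser.close();
    }
}
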
Error Handling

Wrap each navigation in try/catch and close the page in a finally block so a single failed URL does not abort the run or leak pages:
async function robustScraping(urls) {
    const browser = await puppeteer.launch();
    const results = [];

    for (const url of urls) {
        const page = await browser.newPage();
        try {
            await page.goto(url, { 
                waitUntil: 'networkidle2',
                timeout: 30000 
            });

            const data = await page.evaluate(() => ({
                title: document.title,
                links: Array.from(document.links).length
            }));

            results.push({ url, success: true, data });
        } catch (error) {
            results.push({ url, success: false, error: error.message });
        } finally {
            await page.close();
        }

        // Small delay to avoid overwhelming the server
        await new Promise(resolve => setTimeout(resolve, 100));
    }

    await browser.close();
    return results;
}

Performance Optimization

  • Block images, stylesheets, and fonts for faster loading with request interception (see the sketch after this list)
  • Use networkidle2 instead of networkidle0; networkidle0 can stall on pages that keep long-lived connections open
  • Set appropriate timeouts to handle slow pages
  • Implement retry logic for failed requests (a sketch follows the paragraph below)

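As a minimal sketch of the first two points (the openLightweightPage name is illustrative), request interception can abort image, stylesheet, and font requests before they load, while networkidle2 keeps navigation from hanging on pages that hold a couple of connections open:

const puppeteer = require('puppeteer');

// Open a page that skips heavy resources to speed up parallel scraping.
async function openLightweightPage(browser, url) {
    const page = await browser.newPage();

    // Abort images, stylesheets, and fonts; let every other request through.
    await page.setRequestInterception(true);
    page.on('request', (request) => {
        const type = request.resourceType();
        if (type === 'image' || type === 'stylesheet' || type === 'font') {
            request.abort();
        } else {
            request.continue();
        }
    });

    await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
    return page;
}
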
Running pages in parallel significantly reduces scraping time, but always consider the target server's capacity and implement proper rate limiting to avoid being blocked.
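
For the retry and rate-limiting advice, a minimal sketch might look like the helper below; the gotoWithRetry name and the back-off values are placeholders, not part of Puppeteer's API.

// Retry a navigation a few times, pausing longer after each failure
// so the target server is not hammered.
async function gotoWithRetry(page, url, attempts = 3, baseDelayMs = 1000) {
    for (let attempt = 1; attempt <= attempts; attempt++) {
        try {
            await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
            return;
        } catch (error) {
            if (attempt === attempts) throw error;
            console.warn(`Attempt ${attempt} failed for ${url}: ${error.message}`);
            await new Promise(resolve => setTimeout(resolve, baseDelayMs * attempt));
        }
    }
}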

Try WebScraping.AI for Your Web Scraping Needs

Looking for a powerful web scraping solution? WebScraping.AI provides an LLM-powered API that combines Chromium JavaScript rendering with rotating proxies for reliable data extraction.

Key Features:

  • AI-powered extraction: Ask questions about web pages or extract structured data fields
  • JavaScript rendering: Full Chromium browser support for dynamic content
  • Rotating proxies: Datacenter and residential proxies from multiple countries
  • Easy integration: Simple REST API with SDKs for Python, Ruby, PHP, and more
  • Reliable & scalable: Built for developers who need consistent results

Getting Started:

Get page content with AI analysis:

curl "https://api.webscraping.ai/ai/question?url=https://example.com&question=What is the main topic?&api_key=YOUR_API_KEY"

Extract structured data:

curl "https://api.webscraping.ai/ai/fields?url=https://example.com&fields[title]=Page title&fields[price]=Product price&api_key=YOUR_API_KEY"
