How to handle file downloads in Puppeteer?

File downloads in Puppeteer require special handling because Puppeteer has no built-in, high-level API for saving files, and a headless browser does not always save downloads unless download behavior is configured explicitly. This guide covers several approaches to downloading files successfully.

Method 1: Direct Browser Downloads (Non-headless)

The most straightforward approach uses Chrome's built-in download functionality:

const puppeteer = require('puppeteer');
const path = require('path');

/**
 * Downloads a file by clicking the download button in a visible (non-headless)
 * browser window and letting Chrome save it into `./downloads`.
 *
 * @param {number} [waitMs=3000] - How long to keep the browser open after the
 *   click so the download can finish. This is a fixed grace period, not a
 *   completion check; increase it for large files.
 * @returns {Promise<void>} Resolves once the browser has been closed.
 */
async function downloadWithBrowser(waitMs = 3000) {
  const downloadPath = path.resolve('./downloads');

  const browser = await puppeteer.launch({
    headless: false, // This method demonstrates downloads in a visible browser
    defaultViewport: null,
  });

  try {
    const page = await browser.newPage();

    // Use a public CDP session: `page._client` is a private API that newer
    // Puppeteer releases no longer expose as a property.
    const client = await page.target().createCDPSession();
    await client.send('Page.setDownloadBehavior', {
      behavior: 'allow',
      downloadPath: downloadPath,
    });

    await page.goto('https://example.com');
    await page.click('#download-button');

    // `page.waitForTimeout` was removed from Puppeteer; use a plain timer.
    await new Promise((resolve) => setTimeout(resolve, waitMs));
  } finally {
    // Always release the browser, even if navigation or the click failed.
    await browser.close();
  }
}

Method 2: HTTP Request Approach (Recommended)

Extract download URLs and use HTTP libraries for better control:

const puppeteer = require('puppeteer');
const axios = require('axios');
const fs = require('fs');
const path = require('path');

/**
 * Reads the download link from the page with Puppeteer, then fetches the file
 * over HTTP with axios, forwarding the browser session's cookies.
 *
 * @returns {Promise<string>} Path of the saved file inside `./downloads`.
 * @throws {Error} If the download link is not found, the HTTP request fails,
 *   or the file cannot be written.
 */
async function downloadWithHttp() {
  const browser = await puppeteer.launch({ headless: true });

  let downloadUrl;
  let cookieString;
  try {
    const page = await browser.newPage();

    await page.goto('https://example.com');

    // Extract download URL
    downloadUrl = await page.evaluate(() => {
      const link = document.querySelector('#download-button');
      return link ? link.href : null;
    });

    if (!downloadUrl) {
      throw new Error('Download link not found');
    }

    // Get cookies for authenticated downloads
    const cookies = await page.cookies();
    cookieString = cookies
      .map((cookie) => `${cookie.name}=${cookie.value}`)
      .join('; ');
  } finally {
    // Close the browser on every path, including the "link not found" error.
    await browser.close();
  }

  // Download file with proper headers
  const response = await axios({
    method: 'GET',
    url: downloadUrl,
    responseType: 'stream',
    headers: {
      'Cookie': cookieString,
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    },
  });

  // The filename comes from a server-controlled header: keep only its final
  // path component so it cannot escape the downloads directory.
  const suggested = path.basename(getFilenameFromResponse(response) || '');
  const filename =
    suggested && suggested !== '.' && suggested !== '..'
      ? suggested
      : 'download.file';

  const downloadDir = './downloads';
  await fs.promises.mkdir(downloadDir, { recursive: true });
  const filepath = path.join(downloadDir, filename);

  return new Promise((resolve, reject) => {
    const writer = fs.createWriteStream(filepath);

    // `pipe` does not forward source errors; without this handler a dropped
    // connection would leave the promise pending forever.
    response.data.on('error', (err) => {
      writer.destroy();
      reject(err);
    });
    writer.on('error', reject);
    writer.on('finish', () => resolve(filepath));

    response.data.pipe(writer);
  });
}

/**
 * Extracts the suggested filename from a response's Content-Disposition header.
 *
 * Prefers the RFC 6266 extended form (`filename*=UTF-8''...`) when it is
 * present and UTF-8 encoded, otherwise falls back to the plain `filename=`
 * parameter (quoted or unquoted). Trailing parameters such as `; size=123`
 * are not included in the result.
 *
 * @param {{ headers: Object<string, string> }} response - Response object whose
 *   `headers` map uses lower-case header names.
 * @returns {string|null} The filename, or null when the header is missing or
 *   carries no usable filename.
 */
function getFilenameFromResponse(response) {
  const disposition = response.headers['content-disposition'];
  if (typeof disposition !== 'string') {
    return null;
  }

  // Extended form: filename*=<charset>'<language>'<percent-encoded value>
  const extended = /(?:^|;)\s*filename\*\s*=\s*([^']*)'[^']*'([^;]*)/i.exec(disposition);
  if (extended && /^utf-?8$/i.test(extended[1].trim())) {
    try {
      const decoded = decodeURIComponent(extended[2].trim());
      if (decoded) {
        return decoded;
      }
    } catch {
      // Malformed percent-encoding: fall through to the plain parameter.
    }
  }

  // Plain form: filename="quoted value" or filename=token
  const plain = /(?:^|;)\s*filename\s*=\s*(?:"((?:\\.|[^"\\])*)"|([^;]*))/i.exec(disposition);
  if (!plain) {
    return null;
  }

  const name =
    plain[1] !== undefined
      ? plain[1].replace(/\\(.)/g, '$1') // unescape quoted-pairs
      : plain[2].trim();

  return name || null;
}

Method 3: Modern CDP Approach

Use the Chrome DevTools Protocol (CDP) directly to configure downloads and observe their progress:

const puppeteer = require('puppeteer');
const fs = require('fs');

/**
 * Triggers a download through the page and uses Chrome DevTools Protocol
 * download events to wait until the file has actually finished downloading.
 *
 * @param {number} [timeoutMs=30000] - Maximum time to wait, after the click,
 *   for the download to report completion.
 * @returns {Promise<void>} Resolves after the download completed and the
 *   browser has been closed.
 * @throws {Error} If the download is canceled or does not complete in time.
 */
async function downloadWithCDP(timeoutMs = 30000) {
  const browser = await puppeteer.launch();

  try {
    const page = await browser.newPage();

    // Enable download events
    const client = await page.target().createCDPSession();
    await client.send('Page.enable');

    // Make sure the target directory exists and hand the browser an absolute
    // path so the location does not depend on the browser's working directory.
    fs.mkdirSync('./downloads', { recursive: true });
    const downloadPath = fs.realpathSync('./downloads');

    // Set download behavior
    await client.send('Page.setDownloadBehavior', {
      behavior: 'allow',
      downloadPath: downloadPath,
    });

    // Listen for download events
    client.on('Page.downloadWillBegin', (event) => {
      console.log('Download started:', event.suggestedFilename);
    });

    const downloadFinished = new Promise((resolve, reject) => {
      client.on('Page.downloadProgress', (event) => {
        if (event.state === 'completed') {
          console.log('Download completed:', event.guid);
          resolve();
        } else if (event.state === 'canceled') {
          reject(new Error(`Download canceled: ${event.guid}`));
        }
      });
    });
    // Mark the promise as handled so an early cancel (before we await it
    // below) does not surface as an unhandled rejection.
    downloadFinished.catch(() => {});

    await page.goto('https://example.com');
    await page.click('#download-button');

    // Wait for the real completion event instead of sleeping a fixed time.
    let timer;
    const timedOut = new Promise((_, reject) => {
      timer = setTimeout(
        () => reject(new Error(`Download did not complete within ${timeoutMs} ms`)),
        timeoutMs,
      );
    });
    try {
      await Promise.race([downloadFinished, timedOut]);
    } finally {
      clearTimeout(timer);
    }
  } finally {
    // Always release the browser, even on failure.
    await browser.close();
  }
}

Handling Dynamic Downloads

For downloads triggered by JavaScript or requiring form submissions:

/**
 * Submits a form that triggers a download, captures the URL of the resulting
 * download response, and then fetches that URL over HTTP.
 *
 * A response is treated as the download when it carries a
 * `Content-Disposition: attachment` header or its URL contains "download".
 *
 * @returns {Promise<void>} Resolves once the file has been downloaded.
 * @throws {Error} If no matching response arrives within 10 seconds, or if
 *   navigation, form interaction, or the HTTP download fails.
 */
async function handleDynamicDownload() {
  const browser = await puppeteer.launch();

  let downloadUrl;
  try {
    const page = await browser.newPage();

    await page.goto('https://example.com');

    // Content-Disposition is a *response* header, so inspect responses rather
    // than outgoing requests.
    const isDownloadResponse = (response) => {
      const disposition = response.headers()['content-disposition'] ?? '';
      return /attachment/i.test(disposition) || response.url().includes('download');
    };

    // Start waiting before triggering the download so the response cannot be
    // missed. Promise.all also ensures a failure on either side is observed.
    // (The predicate runs in Node; a `page.waitForFunction` callback would run
    // inside the page and could not see Node-side variables.)
    const [response] = await Promise.all([
      page.waitForResponse(isDownloadResponse, { timeout: 10000 }),
      (async () => {
        // Fill form and trigger download
        await page.type('#email', 'user@example.com');
        await page.click('#submit-for-download');
      })(),
    ]);

    downloadUrl = response.url();
  } finally {
    // Always release the browser, even when the download was never detected.
    await browser.close();
  }

  // Download using HTTP method
  await downloadFileFromUrl(downloadUrl);
}

Best Practices

  1. Use the HTTP approach for production: it is more reliable and works in headless mode
  2. Handle authentication: Transfer cookies and headers from Puppeteer session
  3. Validate file types: Check content-type headers before downloading
  4. Implement proper error handling: Network failures, file system errors
  5. Monitor download progress: For large files, implement progress tracking
/**
 * Complete example with error handling.
 *
 * Opens `url` in headless Chromium, reads the `href` (or, failing that, the
 * `src`) of the element matching `selector`, and passes that URL together
 * with the page's cookies to `downloadFileWithAuth`.
 *
 * @param {string} url - Page to open.
 * @param {string} selector - CSS selector of the element holding the download URL.
 * @returns {Promise<string>} The value returned by `downloadFileWithAuth`,
 *   logged here as the downloaded file's path.
 * @throws {Error} Re-throws any failure after logging its message; the
 *   browser is closed either way.
 */
async function robustDownload(url, selector) {
  // Runs inside the page: resolve the element's download URL, or null.
  const readTargetUrl = (sel) => {
    const node = document.querySelector(sel);
    if (!node) {
      return null;
    }
    return node.href || node.src;
  };

  let browser;
  try {
    browser = await puppeteer.launch({ headless: true });

    const tab = await browser.newPage();
    await tab.goto(url, { waitUntil: 'networkidle2' });

    const targetUrl = await tab.evaluate(readTargetUrl, selector);
    if (!targetUrl) {
      throw new Error(`Download element not found: ${selector}`);
    }

    const sessionCookies = await tab.cookies();
    const savedPath = await downloadFileWithAuth(targetUrl, sessionCookies);

    console.log(`File downloaded successfully: ${savedPath}`);
    return savedPath;
  } catch (err) {
    console.error('Download failed:', err.message);
    throw err;
  } finally {
    // `browser` stays undefined when the launch itself failed.
    await browser?.close();
  }
}

The HTTP request approach is generally recommended for production applications as it provides better control, works in headless mode, and handles authentication more reliably than browser-based downloads.

Related Questions

Get Started Now

WebScraping.AI provides rotating proxies, Chromium rendering and built-in HTML parser for web scraping
Icon