File downloads in Puppeteer require special handling because Puppeteer has no dedicated high-level download API, and headless Chrome has historically not saved downloaded files unless that behavior is enabled explicitly. This guide covers multiple approaches to successfully download files.
Method 1: Direct Browser Downloads (Non-headless)
The most straightforward approach uses Chrome's built-in download functionality:
const puppeteer = require('puppeteer');
const path = require('path');
/**
 * Downloads a file by letting a visible (non-headless) Chrome instance save it
 * into `./downloads`, resolved against the current working directory.
 *
 * Opens https://example.com, clicks `#download-button`, then waits a fixed
 * three seconds before closing the browser.
 *
 * @returns {Promise<void>} Resolves after the browser has been closed.
 * @throws Propagates any error raised by Puppeteer while launching, navigating
 *   or clicking; the browser is still closed in that case.
 */
async function downloadWithBrowser() {
  const downloadPath = path.resolve('./downloads');
  const browser = await puppeteer.launch({
    // This method demonstrates the headed flow.
    // NOTE(review): Page.setDownloadBehavior may also work headless — confirm.
    headless: false,
    defaultViewport: null,
  });

  try {
    const page = await browser.newPage();

    // Use a public CDP session instead of the private `page._client`, which is
    // not part of Puppeteer's supported API surface.
    const client = await page.target().createCDPSession();
    await client.send('Page.setDownloadBehavior', {
      behavior: 'allow',
      downloadPath,
    });

    await page.goto('https://example.com');
    await page.click('#download-button');

    // Fixed grace period for the download to finish. A plain timer replaces
    // `page.waitForTimeout`, which newer Puppeteer releases no longer provide.
    await new Promise((resolve) => setTimeout(resolve, 3000));
  } finally {
    // Always release the browser, even when navigation or the click fails.
    await browser.close();
  }
}
Method 2: HTTP Request Approach (Recommended)
Extract download URLs and use HTTP libraries for better control:
const puppeteer = require('puppeteer');
const axios = require('axios');
const fs = require('fs');
const path = require('path');
/**
 * Reads the download link and session cookies from a page with Puppeteer, then
 * fetches the file with axios and streams it into `./downloads`.
 *
 * @returns {Promise<string>} Path of the written file (`downloads/<filename>`).
 * @throws {Error} 'Download link not found' when `#download-button` is missing
 *   or has no `href`; also propagates Puppeteer, HTTP and file-system errors.
 */
async function downloadWithHttp() {
  const downloadDir = './downloads';
  const browser = await puppeteer.launch({ headless: true });

  let downloadUrl;
  let cookieString;
  try {
    const page = await browser.newPage();
    await page.goto('https://example.com');

    // Extract download URL
    downloadUrl = await page.evaluate(() => {
      const link = document.querySelector('#download-button');
      return link ? link.href : null;
    });
    if (!downloadUrl) {
      throw new Error('Download link not found');
    }

    // Get cookies for authenticated downloads
    const cookies = await page.cookies();
    cookieString = cookies
      .map((cookie) => `${cookie.name}=${cookie.value}`)
      .join('; ');
  } finally {
    // Close the browser on every path, including the "link not found" throw.
    await browser.close();
  }

  // The write stream fails with ENOENT if the target directory is missing.
  await fs.promises.mkdir(downloadDir, { recursive: true });

  // Download file with proper headers
  const response = await axios({
    method: 'GET',
    url: downloadUrl,
    responseType: 'stream',
    headers: {
      'Cookie': cookieString,
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    },
  });

  // The filename comes from a server-controlled header: keep only its final
  // path component so it cannot escape the download directory.
  const suggested = path.basename(getFilenameFromResponse(response) || 'download.file');
  const filename =
    suggested === '' || suggested === '.' || suggested === '..'
      ? 'download.file'
      : suggested;
  const filepath = path.join(downloadDir, filename);

  const writer = fs.createWriteStream(filepath);
  return new Promise((resolve, reject) => {
    writer.on('finish', () => resolve(filepath));
    writer.on('error', reject);
    // `pipe` does not forward source errors; without this handler a dropped
    // connection would leave the promise pending forever.
    response.data.on('error', (err) => {
      writer.destroy();
      reject(err);
    });
    response.data.pipe(writer);
  });
}
/**
 * Extracts the suggested filename from a response's Content-Disposition header.
 *
 * Prefers the extended `filename*=UTF-8''...` parameter when it is present and
 * decodable, otherwise falls back to the plain `filename=` parameter (quoted or
 * unquoted). Other parameters after the filename are ignored, and any directory
 * components are stripped from the result.
 *
 * @param {{ headers: Object<string, string> }} response - Response object whose
 *   `headers` map is keyed by lower-case header name.
 * @returns {string|null} The bare filename, or null when the header is absent,
 *   carries no filename, or the filename is empty / `.` / `..`.
 */
function getFilenameFromResponse(response) {
  const disposition = response.headers['content-disposition'];
  if (typeof disposition !== 'string') {
    return null;
  }

  let filename = null;

  // Extended form: filename*=charset'language'percent-encoded-value
  const extended = /(?:^|[;\s])filename\*\s*=\s*([^'\s;]*)'[^']*'([^;\s]+)/i.exec(disposition);
  if (extended && extended[1].toLowerCase() === 'utf-8') {
    try {
      filename = decodeURIComponent(extended[2]);
    } catch {
      // Malformed percent-encoding: fall back to the plain parameter below.
      filename = null;
    }
  }

  if (filename === null) {
    // Plain form: either a quoted-string (with backslash escapes) or a bare
    // token that ends at the next ';'.
    const plain = /(?:^|[;\s])filename\s*=\s*(?:"((?:\\.|[^"\\])*)"|([^;]*))/i.exec(disposition);
    if (plain) {
      filename =
        plain[1] !== undefined
          ? plain[1].replace(/\\(.)/g, '$1')
          : plain[2].replace(/"/g, '').trim();
    }
  }

  if (filename === null) {
    return null;
  }

  // Drop any directory part so a hostile header cannot point outside the
  // caller's download directory.
  const baseName = filename.split(/[\\/]/).pop().trim();
  if (baseName === '' || baseName === '.' || baseName === '..') {
    return null;
  }
  return baseName;
}
Method 3: Modern CDP Approach
Use the Chrome DevTools Protocol directly for better reliability:
const puppeteer = require('puppeteer');
const fs = require('fs');
/**
 * Downloads a file through a Chrome DevTools Protocol session and waits for
 * the browser to report that the download has finished.
 *
 * Opens https://example.com, clicks `#download-button`, and saves the file
 * into `./downloads` (created if missing).
 *
 * @returns {Promise<void>} Resolves once the download completed and the
 *   browser has been closed.
 * @throws {Error} When the download is canceled or does not finish within the
 *   timeout; also propagates Puppeteer and file-system errors. The browser is
 *   closed in every case.
 */
async function downloadWithCDP() {
  const DOWNLOAD_TIMEOUT_MS = 30000;
  const downloadDir = './downloads';

  // Make sure the directory exists and hand Chrome an absolute path; a
  // relative path would be resolved against the browser's working directory.
  fs.mkdirSync(downloadDir, { recursive: true });
  const downloadPath = fs.realpathSync(downloadDir);

  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();

    // Enable download events
    const client = await page.target().createCDPSession();
    await client.send('Page.enable');

    // Set download behavior
    await client.send('Page.setDownloadBehavior', {
      behavior: 'allow',
      downloadPath,
    });

    // Listen for download events
    client.on('Page.downloadWillBegin', (event) => {
      console.log('Download started:', event.suggestedFilename);
    });

    // Resolves (never rejects) with the terminal progress event, so nothing
    // can become an unhandled rejection while the page is still being driven.
    const downloadSettled = new Promise((resolve) => {
      client.on('Page.downloadProgress', (event) => {
        if (event.state === 'completed') {
          console.log('Download completed:', event.guid);
          resolve(event);
        } else if (event.state === 'canceled') {
          resolve(event);
        }
      });
    });

    await page.goto('https://example.com');
    await page.click('#download-button');

    // Wait for the actual completion event instead of sleeping a fixed time,
    // bounded by a timeout so a download that never starts cannot hang.
    let timer;
    const timedOut = new Promise((_, reject) => {
      timer = setTimeout(
        () => reject(new Error(`Download did not finish within ${DOWNLOAD_TIMEOUT_MS} ms`)),
        DOWNLOAD_TIMEOUT_MS
      );
    });
    let outcome;
    try {
      outcome = await Promise.race([downloadSettled, timedOut]);
    } finally {
      clearTimeout(timer);
    }

    if (outcome.state !== 'completed') {
      throw new Error(`Download was canceled: ${outcome.guid}`);
    }
  } finally {
    await browser.close();
  }
}
Handling Dynamic Downloads
For downloads triggered by JavaScript or requiring form submissions:
/**
 * Captures the URL of a download that is triggered by submitting a form, then
 * fetches it with `downloadFileFromUrl`.
 *
 * Watches network responses while filling `#email` and clicking
 * `#submit-for-download` on https://example.com. A response counts as a
 * download when it is served as an attachment, has an
 * `application/octet-stream` content type, or its URL contains "download".
 *
 * @returns {Promise<void>} Resolves after `downloadFileFromUrl` has finished.
 * @throws {Error} When no download response is seen within the timeout; also
 *   propagates Puppeteer errors. The browser is closed in every case.
 */
async function handleDynamicDownload() {
  const CAPTURE_TIMEOUT_MS = 10000;
  const browser = await puppeteer.launch();

  let downloadUrl = null;
  try {
    const page = await browser.newPage();

    // Inspect responses rather than requests: Content-Disposition is a
    // response header, and a request's Content-Type describes the request
    // body (it would match the form POST itself).
    const urlCaptured = new Promise((resolve) => {
      page.on('response', (response) => {
        const url = response.url();
        const headers = response.headers();
        const disposition = headers['content-disposition'] ?? '';
        const contentType = headers['content-type'] ?? '';

        // Detect download responses
        if (
          disposition.toLowerCase().includes('attachment') ||
          contentType.includes('application/octet-stream') ||
          url.includes('download')
        ) {
          resolve(url);
        }
      });
    });

    await page.goto('https://example.com');

    // Fill form and trigger download
    await page.type('#email', 'user@example.com');
    await page.click('#submit-for-download');

    // Wait in Node for the URL to be captured. `page.waitForFunction` would
    // evaluate its predicate inside the page, where this function's local
    // variables do not exist.
    let timer;
    const timedOut = new Promise((_, reject) => {
      timer = setTimeout(
        () => reject(new Error(`No download request detected within ${CAPTURE_TIMEOUT_MS} ms`)),
        CAPTURE_TIMEOUT_MS
      );
    });
    try {
      downloadUrl = await Promise.race([urlCaptured, timedOut]);
    } finally {
      clearTimeout(timer);
    }
  } finally {
    await browser.close();
  }

  // Download using HTTP method
  await downloadFileFromUrl(downloadUrl);
}
Best Practices
- Use the HTTP approach for production: it is more reliable and works in headless mode
- Handle authentication: Transfer cookies and headers from Puppeteer session
- Validate file types: Check content-type headers before downloading
- Implement proper error handling: Network failures, file system errors
- Monitor download progress: For large files, implement progress tracking
// Complete example with error handling
/**
 * Loads a page, resolves the download target of the element matching
 * `selector`, and delegates the transfer to `downloadFileWithAuth` together
 * with the page's cookies.
 *
 * @param {string} url - Page to open (waits for `networkidle2`).
 * @param {string} selector - CSS selector of the element carrying the
 *   download location in its `href` or `src`.
 * @returns {Promise<string>} Whatever path `downloadFileWithAuth` reports.
 * @throws {Error} When the element is missing or has no `href`/`src`; any
 *   other failure is logged and rethrown unchanged.
 */
async function robustDownload(url, selector) {
  let browser;

  try {
    browser = await puppeteer.launch({ headless: true });

    const tab = await browser.newPage();
    await tab.goto(url, { waitUntil: 'networkidle2' });

    // Runs inside the page: prefer `href`, fall back to `src`.
    const resolvedUrl = await tab.evaluate((cssSelector) => {
      const node = document.querySelector(cssSelector);
      if (!node) {
        return null;
      }
      return node.href || node.src;
    }, selector);

    if (!resolvedUrl) {
      throw new Error(`Download element not found: ${selector}`);
    }

    const sessionCookies = await tab.cookies();
    const savedPath = await downloadFileWithAuth(resolvedUrl, sessionCookies);

    console.log(`File downloaded successfully: ${savedPath}`);
    return savedPath;
  } catch (failure) {
    console.error('Download failed:', failure.message);
    throw failure;
  } finally {
    // `browser` stays undefined when the launch itself failed.
    await browser?.close();
  }
}
The HTTP request approach is generally recommended for production applications as it provides better control, works in headless mode, and handles authentication more reliably than browser-based downloads.