How do I handle file downloads with Headless Chromium?

Handling file downloads with headless Chromium requires special configuration since headless browsers lack a user interface for download dialogs. This guide shows you how to automate file downloads using popular tools like Puppeteer, Selenium, and Playwright.

Using Puppeteer (JavaScript)

Puppeteer provides the most straightforward approach for handling downloads in headless Chrome:

Basic Download Setup

const puppeteer = require('puppeteer');
const path = require('path');
const fs = require('fs');

(async () => {
  const browser = await puppeteer.launch({
    headless: true
  });

  const page = await browser.newPage();
  const downloadPath = path.resolve('./downloads');

  // Ensure download directory exists
  if (!fs.existsSync(downloadPath)) {
    fs.mkdirSync(downloadPath, { recursive: true });
  }

  // Configure download behavior via the Chrome DevTools Protocol
  // (page._client is a private API that newer Puppeteer versions no longer
  // expose; create an explicit CDP session instead)
  const client = await page.createCDPSession();
  await client.send('Page.setDownloadBehavior', {
    behavior: 'allow',
    downloadPath: downloadPath
  });

  await page.goto('https://example.com/download-page');

  // Trigger download
  await page.click('a[href$=".pdf"]'); // Example: PDF download link

  await browser.close();
})();

Advanced Download Monitoring

For production use, implement proper download completion detection:

const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');

async function downloadFile(url, selector, filename) {
  const browser = await puppeteer.launch({ headless: true });
  const page = await browser.newPage();
  const downloadPath = path.resolve('./downloads');

  // Setup download directory
  if (!fs.existsSync(downloadPath)) {
    fs.mkdirSync(downloadPath, { recursive: true });
  }

  // Use a browser-level CDP session so that download events are emitted
  const client = await browser.target().createCDPSession();
  await client.send('Browser.setDownloadBehavior', {
    behavior: 'allow',
    downloadPath: downloadPath,
    eventsEnabled: true // required to receive Browser.downloadProgress events
  });

  await page.goto(url, { waitUntil: 'networkidle2' });

  // Resolve once a download reports completion
  const downloadPromise = new Promise((resolve) => {
    client.on('Browser.downloadProgress', (event) => {
      if (event.state === 'completed') {
        resolve(event);
      }
    });
  });

  // Trigger download
  await page.click(selector);

  // Wait for download completion
  const download = await downloadPromise;
  // downloadProgress events carry a guid, byte counts, and state, not a URL
  console.log('Download completed, guid:', download.guid);

  await browser.close();
  return path.join(downloadPath, filename);
}

// Usage
downloadFile('https://example.com', 'a[download]', 'myfile.pdf')
  .then(filePath => console.log('File saved:', filePath))
  .catch(err => console.error('Download failed:', err));

Using Selenium with Python

Selenium requires Chrome preferences configuration for headless downloads:

Basic Setup

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os
import time

def setup_chrome_driver(download_dir):
    """Configure Chrome driver for headless downloads"""
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    # Create download directory if it doesn't exist
    os.makedirs(download_dir, exist_ok=True)

    # Configure download preferences
    prefs = {
        "download.default_directory": os.path.abspath(download_dir),
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True,
        "safebrowsing.disable_download_protection": True
    }
    chrome_options.add_experimental_option("prefs", prefs)

    return webdriver.Chrome(options=chrome_options)

# Usage example
download_directory = "./downloads"
driver = setup_chrome_driver(download_directory)

try:
    driver.get('https://example.com/download-page')

    # Wait for download link to be clickable
    download_link = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, 'a[href$=".pdf"]'))
    )
    download_link.click()

    # Wait for download to complete (basic approach)
    time.sleep(5)

finally:
    driver.quit()

Advanced Download Monitoring with Selenium

import os
import time
import glob
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

def wait_for_download_completion(download_dir, timeout=30):
    """Wait for download to complete by monitoring .crdownload files"""
    end_time = time.time() + timeout

    while time.time() < end_time:
        # Check for Chrome's temporary download files
        downloading_files = glob.glob(os.path.join(download_dir, "*.crdownload"))
        if not downloading_files:
            # No active downloads; os.listdir order is arbitrary, so pick the
            # most recently modified file rather than the last list entry
            files = [os.path.join(download_dir, f) for f in os.listdir(download_dir)]
            if files:
                return max(files, key=os.path.getmtime)
        time.sleep(1)

    raise TimeoutError(f"Download did not complete within {timeout} seconds")

def download_file_selenium(url, selector, download_dir="./downloads"):
    """Download file using Selenium with completion monitoring"""
    chrome_options = Options()
    chrome_options.add_argument("--headless")

    # Ensure download directory exists
    os.makedirs(download_dir, exist_ok=True)

    prefs = {
        "download.default_directory": os.path.abspath(download_dir),
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": False
    }
    chrome_options.add_experimental_option("prefs", prefs)

    driver = webdriver.Chrome(options=chrome_options)

    try:
        # Clear download directory
        for file in os.listdir(download_dir):
            os.remove(os.path.join(download_dir, file))

        driver.get(url)
        driver.find_element(By.CSS_SELECTOR, selector).click()

        # Wait for download completion
        downloaded_file = wait_for_download_completion(download_dir)
        print(f"Downloaded: {downloaded_file}")
        return downloaded_file

    finally:
        driver.quit()

# Usage
download_file_selenium('https://example.com', 'a[download]')

Using Playwright (Alternative)

Playwright offers a more modern approach with built-in download handling:

const { chromium } = require('playwright');

(async () => {
  const browser = await chromium.launch({ headless: true });
  const page = await browser.newPage();

  // Start waiting for download before clicking
  const downloadPromise = page.waitForEvent('download');

  await page.goto('https://example.com/download-page');
  await page.click('text=Download');

  const download = await downloadPromise;

  // Save to specific location
  await download.saveAs('./downloads/' + download.suggestedFilename());
  console.log('Download completed:', download.suggestedFilename());

  await browser.close();
})();
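
Playwright's Python API follows the same pattern: expect_download() starts waiting before the click that triggers the download. A minimal sketch (the URL and "Download" selector are placeholders):

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto('https://example.com/download-page')

    # Register the download expectation before clicking
    with page.expect_download() as download_info:
        page.click('text=Download')

    download = download_info.value
    download.save_as('./downloads/' + download.suggested_filename)
    print('Download completed:', download.suggested_filename)

    browser.close()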

Best Practices

1. Directory Management

Always ensure your download directory exists and has proper permissions:

const fs = require('fs');
const downloadDir = './downloads';

if (!fs.existsSync(downloadDir)) {
  fs.mkdirSync(downloadDir, { recursive: true });
}

2. Download Completion Detection

Instead of arbitrary timeouts, monitor file system changes:

import os
import time

def wait_for_file(filepath, timeout=30):
    """Wait for file to appear and stop growing"""
    end_time = time.time() + timeout

    while time.time() < end_time:
        if os.path.exists(filepath):
            # File exists, wait for it to stop growing
            initial_size = os.path.getsize(filepath)
            time.sleep(2)
            if os.path.getsize(filepath) == initial_size:
                return True
        time.sleep(1)

    return False
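
For example, after triggering a download you might wait on the expected path (the filename here is hypothetical):

import os

file_path = os.path.join('./downloads', 'report.pdf')  # hypothetical expected name
if wait_for_file(file_path):
    print('Download finished:', file_path)
else:
    print('Timed out waiting for:', file_path)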

3. Error Handling

Implement robust error handling for network issues and failed downloads:

try {
  // downloadPromise must be created before the click (see the examples above)
  await page.click(downloadSelector);
  await downloadPromise;
} catch (error) {
  console.error('Download failed:', error.message);
  // Implement retry logic or alternative download method
}
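
The same retry idea sketched in Python, wrapping the download_file_selenium helper from the Selenium section (the attempt count and backoff are arbitrary choices):

import time

def download_with_retries(url, selector, attempts=3, backoff=5):
    """Retry download_file_selenium with a growing delay between attempts."""
    for attempt in range(1, attempts + 1):
        try:
            return download_file_selenium(url, selector)
        except Exception as err:
            print(f"Attempt {attempt} failed: {err}")
            if attempt < attempts:
                time.sleep(backoff * attempt)  # back off a little more each time
    raise RuntimeError(f"Download failed after {attempts} attempts")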

4. File Validation

Verify downloaded files are complete and valid:

import os

def validate_download(filepath, expected_min_size=1024):
    """Validate downloaded file"""
    if not os.path.exists(filepath):
        return False, "File not found"

    file_size = os.path.getsize(filepath)
    if file_size < expected_min_size:
        return False, f"File too small: {file_size} bytes"

    # Additional validation based on file type (see the sketch below)
    return True, "Valid"
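
Many formats begin with a known signature, so a cheap type check is to compare the file's first bytes against the expected magic number. A minimal sketch covering PDF and ZIP (extend the mapping for other types):

import os

def validate_file_type(filepath):
    """Compare the file's magic bytes against its extension."""
    signatures = {
        '.pdf': b'%PDF',        # PDF files begin with "%PDF"
        '.zip': b'PK\x03\x04',  # ZIP archives begin with "PK\x03\x04"
    }
    ext = os.path.splitext(filepath)[1].lower()
    expected = signatures.get(ext)
    if expected is None:
        return True, 'No signature check for this extension'
    with open(filepath, 'rb') as f:
        header = f.read(len(expected))
    if header != expected:
        return False, f'Unexpected header for {ext}: {header!r}'
    return True, 'Valid'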

Common Issues and Solutions

  • Permission Errors: Ensure the download directory has write permissions
  • Network Timeouts: Increase timeout values for large files
  • Blocked Downloads: Some sites block automated downloads; consider user-agent spoofing (see the sketch after this list)
  • File Corruption: Always validate downloaded files before processing
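
On the user-agent point: headless Chrome identifies itself as "HeadlessChrome" in its default user agent, which some sites use to refuse automated downloads. A minimal Selenium sketch that overrides it (the UA string shown is just an example of a regular desktop Chrome value):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--headless")
# Replace the default "HeadlessChrome" user agent with a desktop Chrome one
chrome_options.add_argument(
    "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
driver = webdriver.Chrome(options=chrome_options)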

This comprehensive approach ensures reliable file downloads in headless Chromium environments while handling edge cases and providing proper monitoring capabilities.

Try WebScraping.AI for Your Web Scraping Needs

Looking for a powerful web scraping solution? WebScraping.AI provides an LLM-powered API that combines Chromium JavaScript rendering with rotating proxies for reliable data extraction.

Key Features:

  • AI-powered extraction: Ask questions about web pages or extract structured data fields
  • JavaScript rendering: Full Chromium browser support for dynamic content
  • Rotating proxies: Datacenter and residential proxies from multiple countries
  • Easy integration: Simple REST API with SDKs for Python, Ruby, PHP, and more
  • Reliable & scalable: Built for developers who need consistent results

Getting Started:

Get page content with AI analysis:

curl "https://api.webscraping.ai/ai/question?url=https://example.com&question=What is the main topic?&api_key=YOUR_API_KEY"

Extract structured data:

curl "https://api.webscraping.ai/ai/fields?url=https://example.com&fields[title]=Page title&fields[price]=Product price&api_key=YOUR_API_KEY"
