How do I handle file downloads with Puppeteer-Sharp?

Handling file downloads with Puppeteer-Sharp requires configuring the browser's download behavior and implementing proper download monitoring. This guide covers everything from basic setup to production-ready download handling.

Installation

First, install Puppeteer-Sharp in your .NET project:

dotnet add package PuppeteerSharp

Basic Download Setup

1. Configure Browser for Downloads

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Threading.Tasks;
using PuppeteerSharp;

/// <summary>
/// Downloads a single file by driving a headless Chromium instance through Puppeteer-Sharp.
/// </summary>
public class DownloadHandler
{
    /// <summary>
    /// Opens <paramref name="url"/>, clicks the element matched by <paramref name="selector"/>,
    /// and waits for the resulting file to show up in <paramref name="downloadDirectory"/>.
    /// </summary>
    /// <param name="url">Page that contains the download trigger.</param>
    /// <param name="selector">CSS selector of the element to click.</param>
    /// <param name="downloadDirectory">Directory the browser saves into; created when missing.</param>
    /// <returns>The file path reported by <c>WaitForDownloadAsync</c>.</returns>
    /// <exception cref="ArgumentException">One of the arguments is null or empty.</exception>
    public static async Task<string> DownloadFileAsync(string url, string selector, string downloadDirectory)
    {
        if (string.IsNullOrEmpty(url))
        {
            throw new ArgumentException("A URL is required.", nameof(url));
        }

        if (string.IsNullOrEmpty(selector))
        {
            throw new ArgumentException("A selector is required.", nameof(selector));
        }

        if (string.IsNullOrEmpty(downloadDirectory))
        {
            throw new ArgumentException("A download directory is required.", nameof(downloadDirectory));
        }

        // Ensure download directory exists
        Directory.CreateDirectory(downloadDirectory);

        // Download Chromium if needed
        await new BrowserFetcher().DownloadAsync(BrowserFetcher.DefaultRevision);

        // NOTE(review): --disable-web-security switches off the same-origin policy and does not
        // look necessary for downloads; consider dropping it unless a target site needs it.
        var browser = await Puppeteer.LaunchAsync(new LaunchOptions
        {
            Headless = true,
            Args = new[] { "--disable-web-security", "--disable-features=VizDisplayCompositor" }
        });

        // Everything after the launch runs inside try/finally so that a failure while creating
        // or configuring the page cannot leave the browser process running.
        try
        {
            var page = await browser.NewPageAsync();

            // Configure download behavior. The browser is a separate process, so it is handed an
            // absolute path instead of relying on how it would resolve a relative one.
            await page.Client.SendAsync("Page.setDownloadBehavior", new
            {
                behavior = "allow",
                downloadPath = Path.GetFullPath(downloadDirectory)
            });

            // Navigate and trigger download
            await page.GoToAsync(url);
            await page.ClickAsync(selector);

            // Wait for download to complete
            return await WaitForDownloadAsync(downloadDirectory);
        }
        finally
        {
            await browser.CloseAsync();
        }
    }
}

2. Monitor Download Completion

Instead of waiting a fixed amount of time with a single Task.Delay() call, poll the download directory until a completed file appears or a timeout expires:

/// <summary>
/// Polls <paramref name="downloadPath"/> until it contains a file that is not a partial download
/// (<c>.crdownload</c> / <c>.tmp</c>) and returns the most recently created such file.
/// </summary>
/// <remarks>
/// Any finished file that is already in the directory satisfies the check, so a caller that needs
/// to identify one specific download should use a directory that is empty beforehand.
/// The directory is always inspected at least once, and once more when the timeout expires.
/// </remarks>
/// <param name="downloadPath">Directory to watch.</param>
/// <param name="timeoutMs">Maximum time to wait, in milliseconds.</param>
/// <returns>Path of the newest completed file, in the form returned by <c>Directory.GetFiles</c>.</returns>
/// <exception cref="TimeoutException">No completed file appeared within <paramref name="timeoutMs"/>.</exception>
private static async Task<string> WaitForDownloadAsync(string downloadPath, int timeoutMs = 30000)
{
    const int pollIntervalMs = 500;

    // A stopwatch is monotonic; a DateTime-based deadline shifts when the system clock is adjusted.
    var elapsed = System.Diagnostics.Stopwatch.StartNew();

    while (true)
    {
        // Materialize once: the list is both tested for emptiness and sorted below.
        var completedFiles = Directory.GetFiles(downloadPath)
            .Where(f => !f.EndsWith(".crdownload", StringComparison.OrdinalIgnoreCase)
                     && !f.EndsWith(".tmp", StringComparison.OrdinalIgnoreCase))
            .ToList();

        if (completedFiles.Count > 0)
        {
            // Return the most recently created file
            return completedFiles.OrderByDescending(f => File.GetCreationTimeUtc(f)).First();
        }

        var remainingMs = timeoutMs - elapsed.ElapsedMilliseconds;
        if (remainingMs <= 0)
        {
            break;
        }

        // Never sleep past the deadline, so short timeouts are honored.
        await Task.Delay((int)Math.Min(pollIntervalMs, remainingMs));
    }

    throw new TimeoutException($"Download did not complete within {timeoutMs}ms");
}

Advanced Download Scenarios

Multiple File Downloads

/// <summary>
/// Triggers one download per entry of <paramref name="urlSelectorPairs"/>, each in its own page of
/// a shared headless browser, then waits for the downloads to finish.
/// </summary>
/// <param name="urlSelectorPairs">Maps a page URL to the CSS selector of its download trigger.</param>
/// <param name="downloadDirectory">Directory all files are saved into; created when missing.</param>
/// <exception cref="ArgumentNullException"><paramref name="urlSelectorPairs"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="downloadDirectory"/> is null or empty.</exception>
public static async Task DownloadMultipleFilesAsync(Dictionary<string, string> urlSelectorPairs, string downloadDirectory)
{
    if (urlSelectorPairs == null)
    {
        throw new ArgumentNullException(nameof(urlSelectorPairs));
    }

    if (string.IsNullOrEmpty(downloadDirectory))
    {
        throw new ArgumentException("A download directory is required.", nameof(downloadDirectory));
    }

    // Nothing to download: avoid launching a browser at all.
    if (urlSelectorPairs.Count == 0)
    {
        return;
    }

    // Same preparation as the single-file path: the target directory must exist.
    Directory.CreateDirectory(downloadDirectory);
    var absoluteDownloadPath = Path.GetFullPath(downloadDirectory);

    var browser = await Puppeteer.LaunchAsync(new LaunchOptions { Headless = true });

    try
    {
        // All pages are opened concurrently; with many entries, consider batching.
        var downloadTasks = urlSelectorPairs.Select(async pair =>
        {
            var page = await browser.NewPageAsync();

            await page.Client.SendAsync("Page.setDownloadBehavior", new
            {
                behavior = "allow",
                downloadPath = absoluteDownloadPath
            });

            await page.GoToAsync(pair.Key);
            await page.ClickAsync(pair.Value);

            // Each page can be closed after triggering download
            await page.CloseAsync();
        });

        await Task.WhenAll(downloadTasks);

        // Wait for all downloads to complete.
        // NOTE(review): WaitForMultipleDownloadsAsync is not defined in this section; it is
        // presumably a directory poll that waits for the given number of finished files.
        await WaitForMultipleDownloadsAsync(downloadDirectory, urlSelectorPairs.Count);
    }
    finally
    {
        await browser.CloseAsync();
    }
}

Form-Based Downloads

/// <summary>
/// Opens <paramref name="url"/>, types each value of <paramref name="formData"/> into the
/// <c>input</c> whose <c>name</c> equals the key, clicks the submit input, and waits for the
/// download that the submission triggers.
/// </summary>
/// <param name="url">Page that hosts the form.</param>
/// <param name="formData">Maps an input's <c>name</c> attribute to the text to type into it.</param>
/// <param name="downloadDirectory">Directory the file is saved into; created when missing.</param>
/// <exception cref="ArgumentNullException"><paramref name="formData"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="url"/> or <paramref name="downloadDirectory"/> is null or empty.</exception>
public static async Task DownloadFromFormAsync(string url, Dictionary<string, string> formData, string downloadDirectory)
{
    if (string.IsNullOrEmpty(url))
    {
        throw new ArgumentException("A URL is required.", nameof(url));
    }

    if (formData == null)
    {
        throw new ArgumentNullException(nameof(formData));
    }

    if (string.IsNullOrEmpty(downloadDirectory))
    {
        throw new ArgumentException("A download directory is required.", nameof(downloadDirectory));
    }

    Directory.CreateDirectory(downloadDirectory);

    var browser = await Puppeteer.LaunchAsync(new LaunchOptions { Headless = true });

    // Page creation and configuration happen inside the try so that a failure there still
    // closes the browser.
    try
    {
        var page = await browser.NewPageAsync();

        await page.Client.SendAsync("Page.setDownloadBehavior", new
        {
            behavior = "allow",
            downloadPath = Path.GetFullPath(downloadDirectory)
        });

        await page.GoToAsync(url);

        // Fill form fields
        foreach (var field in formData)
        {
            // Escape backslashes and single quotes so a field name cannot terminate the quoted
            // attribute value and change the meaning of the selector.
            var escapedName = field.Key.Replace("\\", "\\\\").Replace("'", "\\'");
            await page.TypeAsync($"input[name='{escapedName}']", field.Value);
        }

        // Submit form and trigger download
        await page.ClickAsync("input[type='submit']");

        var downloadedFile = await WaitForDownloadAsync(downloadDirectory);
        Console.WriteLine($"Downloaded: {downloadedFile}");
    }
    finally
    {
        await browser.CloseAsync();
    }
}

Download Event Monitoring

For more precise control, you can monitor browser events:

/// <summary>
/// Clicks the element matched by <paramref name="selector"/> on <paramref name="url"/> and waits
/// for the browser's own download events to report that the download finished.
/// </summary>
/// <param name="url">Page that contains the download trigger.</param>
/// <param name="selector">CSS selector of the element to click.</param>
/// <param name="downloadDirectory">Directory the file is saved into; created when missing.</param>
/// <param name="timeoutMs">Maximum time to wait for the "completed" event, in milliseconds.</param>
/// <exception cref="TimeoutException">No "completed" event arrived within <paramref name="timeoutMs"/>.</exception>
/// <exception cref="IOException">The browser reported the download as canceled.</exception>
public static async Task DownloadWithEventMonitoringAsync(string url, string selector, string downloadDirectory, int timeoutMs = 30000)
{
    Directory.CreateDirectory(downloadDirectory);

    var browser = await Puppeteer.LaunchAsync(new LaunchOptions { Headless = true });

    try
    {
        var page = await browser.NewPageAsync();

        // Completed by the event handler below; continuations run asynchronously so the
        // handler's thread is not hijacked by the code awaiting the result.
        var completion = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);

        // Listen for download progress. Subscribed before the behavior is configured so no
        // event can be missed.
        page.Client.MessageReceived += (sender, e) =>
        {
            if (e.MessageID == "Browser.downloadWillBegin")
            {
                Console.WriteLine("Download started");
            }
            else if (e.MessageID == "Browser.downloadProgress")
            {
                // Handle download progress updates
                var data = e.MessageData.ToObject<dynamic>();
                if (data.state == "completed")
                {
                    Console.WriteLine("Download completed");
                    completion.TrySetResult(true);
                }
                else if (data.state == "canceled")
                {
                    // Without this branch a canceled download would only surface as a timeout.
                    completion.TrySetException(new IOException("Download was canceled"));
                }
            }
        };

        // eventsEnabled defaults to false in the DevTools protocol; without it the
        // Browser.downloadWillBegin / Browser.downloadProgress events are never emitted.
        await page.Client.SendAsync("Browser.setDownloadBehavior", new
        {
            behavior = "allow",
            downloadPath = Path.GetFullPath(downloadDirectory),
            eventsEnabled = true
        });

        await page.GoToAsync(url);
        await page.ClickAsync(selector);

        // Bounded wait: whichever finishes first, the download or the timer.
        var finished = await Task.WhenAny(completion.Task, Task.Delay(timeoutMs));
        if (finished != completion.Task)
        {
            throw new TimeoutException($"Download did not complete within {timeoutMs}ms");
        }

        // Re-await to surface the cancellation exception, if any.
        await completion.Task;
    }
    finally
    {
        await browser.CloseAsync();
    }
}

Error Handling and Best Practices

/// <summary>
/// Downloads a file like the basic handler does, but reports every failure through the returned
/// <see cref="DownloadResult"/> instead of throwing.
/// </summary>
/// <param name="url">Page that contains the download trigger.</param>
/// <param name="selector">CSS selector of the element to click.</param>
/// <param name="downloadDirectory">Directory the file is saved into; created when missing.</param>
/// <returns>
/// A result whose <c>Success</c> is true and <c>FilePath</c> is set when the download finished;
/// otherwise <c>Error</c> holds the reason.
/// </returns>
public static async Task<DownloadResult> SafeDownloadAsync(string url, string selector, string downloadDirectory)
{
    var result = new DownloadResult();

    try
    {
        // Validate inputs
        if (string.IsNullOrEmpty(url) || string.IsNullOrEmpty(selector))
        {
            throw new ArgumentException("URL and selector are required");
        }

        Directory.CreateDirectory(downloadDirectory);

        var browser = await Puppeteer.LaunchAsync(new LaunchOptions
        {
            Headless = true,
            Timeout = 30000
        });

        // The inner try/finally closes the browser on every exit: success, the early return
        // for a missing element, and any exception.
        try
        {
            var page = await browser.NewPageAsync();

            // Set timeouts
            page.DefaultTimeout = 30000;
            page.DefaultNavigationTimeout = 30000;

            await page.Client.SendAsync("Page.setDownloadBehavior", new
            {
                behavior = "allow",
                downloadPath = Path.GetFullPath(downloadDirectory)
            });

            await page.GoToAsync(url, new NavigationOptions { WaitUntil = new[] { WaitUntilNavigation.Networkidle0 } });

            // Check if download element exists
            var element = await page.QuerySelectorAsync(selector);
            if (element == null)
            {
                result.Error = $"Download element not found: {selector}";
                return result;
            }

            // Click the handle that was just found rather than querying the selector again.
            await element.ClickAsync();

            result.FilePath = await WaitForDownloadAsync(downloadDirectory);
            result.Success = true;
        }
        finally
        {
            await browser.CloseAsync();
        }
    }
    catch (Exception ex)
    {
        result.Error = ex.Message;
        result.Success = false;
    }

    return result;
}

/// <summary>
/// Outcome of a download attempt, as filled in by <c>SafeDownloadAsync</c>.
/// </summary>
public class DownloadResult
{
    /// <summary>True once the download has completed; false when any step failed.</summary>
    public bool Success { get; set; }

    /// <summary>Path of the downloaded file; left unset when the download did not complete.</summary>
    public string FilePath { get; set; }

    /// <summary>Failure description (missing element or exception message); unset on success.</summary>
    public string Error { get; set; }
}

Key Points

  • Always configure download behavior before navigating to the download page
  • Use proper download monitoring instead of arbitrary delays
  • Handle timeouts gracefully for large files or slow networks
  • Validate download completion by checking file existence and size
  • Clean up resources by properly closing browser instances
  • Consider concurrent downloads for better performance with multiple files

This approach provides robust file download handling suitable for production environments while maintaining good performance and error handling.

Related Questions

Get Started Now

WebScraping.AI provides rotating proxies, Chromium rendering and built-in HTML parser for web scraping
Icon