Effective error handling is crucial when building reliable web scraping applications with Puppeteer-Sharp. This guide covers comprehensive best practices to handle exceptions, timeouts, and edge cases that commonly occur during browser automation tasks.
1. Structured Exception Handling with Try-Catch Blocks
Always wrap Puppeteer-Sharp operations in try-catch blocks to handle exceptions gracefully. Order catch blocks from most specific to least specific:
// Launches a headless browser, opens a page, and prints its title.
// Catch clauses run from the most specific exception type to the most general.
try
{
    var launchOptions = new LaunchOptions
    {
        Headless = true,
        Args = new[] { "--no-sandbox", "--disable-setuid-sandbox" }
    };

    // Both the browser and the page are disposed asynchronously when the try scope ends.
    await using var browser = await Puppeteer.LaunchAsync(launchOptions);
    await using var page = await browser.NewPageAsync();

    await page.GoToAsync("https://example.com");

    var pageTitle = await page.GetTitleAsync();
    Console.WriteLine($"Page title: {pageTitle}");
}
catch (NavigationException navigationError)
{
    // Navigation-specific failures
    Console.WriteLine($"Navigation failed: {navigationError.Message}");
}
catch (TimeoutException timeoutError)
{
    // Timeouts get their own message
    Console.WriteLine($"Operation timed out: {timeoutError.Message}");
}
catch (PuppeteerException puppeteerError)
{
    // Any remaining Puppeteer-specific failure
    Console.WriteLine($"Puppeteer error: {puppeteerError.Message}");
}
catch (Exception unexpectedError)
{
    // Unknown failure: report it, then let it propagate because it cannot be handled here
    Console.WriteLine($"Unexpected error: {unexpectedError.Message}");
    throw;
}
2. Specific Exception Types and Handling
Puppeteer-Sharp throws various exception types. Handle each appropriately:
// Demonstrates catching individual Puppeteer-Sharp exception types.
// NOTE(review): the exception types below are the ones exposed by the PuppeteerSharp
// package (all derive from PuppeteerException) - confirm against the installed version.
try
{
    // querySelector returns null for a missing element, so the click() call fails in the page
    await page.EvaluateExpressionAsync("document.querySelector('.missing-element').click()");
}
catch (EvaluationFailedException ex)
{
    // Handle JavaScript evaluation errors
    Console.WriteLine($"JavaScript evaluation failed: {ex.Message}");
    // Return default value or take alternative action
}
catch (SelectorException ex)
{
    // Handle element interaction errors (e.g. a selector-based action found no matching node)
    Console.WriteLine($"Element interaction failed: {ex.Message}");
}
catch (MessageException ex)
{
    // Handle Chrome DevTools Protocol errors
    Console.WriteLine($"Protocol error: {ex.Message}");
    // May need to restart browser
}
3. Robust Timeout Handling
Configure appropriate timeouts and handle timeout scenarios:
/// <summary>
/// Navigates <paramref name="page"/> to <paramref name="url"/>, retrying when the
/// navigation times out.
/// </summary>
/// <param name="page">Page to navigate.</param>
/// <param name="url">Destination URL.</param>
/// <param name="maxRetries">Maximum number of navigation attempts. Values below 1 make no attempt.</param>
/// <returns>
/// <c>true</c> as soon as one attempt succeeds; <c>false</c> when every attempt timed out
/// (or no attempt was made).
/// </returns>
/// <remarks>Exceptions other than timeouts are not handled here and propagate to the caller.</remarks>
public async Task<bool> NavigateWithRetryAsync(IPage page, string url, int maxRetries = 3)
{
    for (int attempt = 1; attempt <= maxRetries; attempt++)
    {
        try
        {
            await page.GoToAsync(url, new NavigationOptions
            {
                Timeout = 30000, // 30 seconds
                WaitUntil = new[] { WaitUntilNavigation.Networkidle0 }
            });
            return true;
        }
        // GoToAsync can report a navigation timeout as a NavigationException whose
        // InnerException is the TimeoutException, so both shapes are treated as a timeout.
        catch (Exception ex) when (ex is TimeoutException
            || ex is NavigationException { InnerException: TimeoutException })
        {
            Console.WriteLine($"Attempt {attempt}: Navigation timeout - {ex.Message}");
            if (attempt == maxRetries)
            {
                Console.WriteLine("Max retries reached. Navigation failed.");
                return false;
            }

            await Task.Delay(2000 * attempt); // Progressive delay: 2s, 4s, 6s, ...
        }
    }

    // Only reached when maxRetries < 1 and the loop body never ran.
    return false;
}
4. Element Waiting and Interaction Safety
Implement safe element interactions with proper waiting strategies:
/// <summary>
/// Waits for a visible element matching <paramref name="selector"/> and returns its
/// trimmed text content.
/// </summary>
/// <param name="page">Page to search.</param>
/// <param name="selector">CSS selector of the element to read.</param>
/// <param name="timeoutMs">Maximum time to wait for the element, in milliseconds.</param>
/// <returns>
/// The trimmed <c>textContent</c> of the element, or an empty string when the element
/// does not appear in time or any other error occurs. This method does not throw.
/// </returns>
public async Task<string> SafelyExtractTextAsync(IPage page, string selector, int timeoutMs = 10000)
{
    try
    {
        // WaitForSelectorAsync already returns the matched handle, so use it directly
        // instead of querying the DOM a second time (which could race with page changes).
        // 'await using' releases the handle once the text has been read.
        await using var element = await page.WaitForSelectorAsync(selector, new WaitForSelectorOptions
        {
            Timeout = timeoutMs,
            Visible = true
        });

        if (element == null)
        {
            Console.WriteLine($"Error extracting text from '{selector}': Element with selector '{selector}' not found");
            return string.Empty;
        }

        return await element.EvaluateFunctionAsync<string>("el => el.textContent?.trim() || ''");
    }
    catch (WaitTaskTimeoutException ex)
    {
        Console.WriteLine($"Element '{selector}' not found within {timeoutMs}ms: {ex.Message}");
        return string.Empty; // Return safe default
    }
    catch (Exception ex)
    {
        Console.WriteLine($"Error extracting text from '{selector}': {ex.Message}");
        return string.Empty;
    }
}
5. Comprehensive Resource Management
Ensure proper cleanup of browser resources, even when exceptions occur:
/// <summary>
/// Owns a single lazily launched browser instance and closes it on disposal.
/// </summary>
public class PuppeteerManager : IAsyncDisposable
{
    // Null until the first successful launch, and again after disposal.
    private IBrowser? _browser;

    /// <summary>
    /// Returns the managed browser, launching it on first use.
    /// </summary>
    /// <returns>The launched browser instance.</returns>
    /// <remarks>A launch failure is written to the console and then rethrown.</remarks>
    public async Task<IBrowser> GetBrowserAsync()
    {
        if (_browser is not null)
        {
            return _browser;
        }

        try
        {
            var launchOptions = new LaunchOptions
            {
                Headless = true,
                Args = new[] { "--no-sandbox", "--disable-dev-shm-usage" }
            };
            _browser = await Puppeteer.LaunchAsync(launchOptions);
        }
        catch (Exception launchError)
        {
            Console.WriteLine($"Failed to launch browser: {launchError.Message}");
            throw;
        }

        return _browser;
    }

    /// <summary>
    /// Closes and disposes the browser if one was launched. Errors raised while closing
    /// are written to the console and not rethrown.
    /// </summary>
    public async ValueTask DisposeAsync()
    {
        if (_browser is null)
        {
            return;
        }

        try
        {
            await _browser.CloseAsync();
        }
        catch (Exception closeError)
        {
            Console.WriteLine($"Error closing browser: {closeError.Message}");
        }
        finally
        {
            // Runs whether or not CloseAsync succeeded.
            _browser?.Dispose();
            _browser = null;
        }
    }
}
6. Advanced Retry Strategies with Exponential Backoff
Implement sophisticated retry logic for transient failures:
/// <summary>
/// Runs <paramref name="operation"/>, retrying with an exponentially growing delay
/// while the failure is considered transient.
/// </summary>
/// <typeparam name="T">Result type of the operation.</typeparam>
/// <param name="operation">The asynchronous operation to execute.</param>
/// <param name="maxRetries">
/// Number of retries after the first attempt; the operation runs at most
/// <c>maxRetries + 1</c> times.
/// </param>
/// <param name="baseDelay">Delay before the first retry (default 1 second); doubled for each later retry.</param>
/// <param name="shouldRetry">
/// Decides whether an exception is retryable. Defaults to retrying
/// <see cref="TimeoutException"/> and <c>NavigationException</c>.
/// </param>
/// <returns>The result of the first successful attempt.</returns>
/// <exception cref="ArgumentNullException"><paramref name="operation"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="maxRetries"/> is negative.</exception>
/// <remarks>
/// When an exception is not retryable, or the retries are exhausted, that exception
/// propagates to the caller unchanged (it is not wrapped).
/// </remarks>
public static async Task<T> RetryWithExponentialBackoffAsync<T>(
    Func<Task<T>> operation,
    int maxRetries = 3,
    TimeSpan? baseDelay = null,
    Func<Exception, bool>? shouldRetry = null)
{
    if (operation is null)
    {
        throw new ArgumentNullException(nameof(operation));
    }

    if (maxRetries < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(maxRetries), maxRetries, "maxRetries cannot be negative.");
    }

    var delay = baseDelay ?? TimeSpan.FromSeconds(1);
    shouldRetry ??= ex => ex is TimeoutException || ex is NavigationException;

    // The loop has no exit condition of its own: it ends either by returning a result
    // or by letting an exception escape once the filter below stops matching.
    for (int attempt = 0; ; attempt++)
    {
        try
        {
            return await operation();
        }
        catch (Exception ex) when (attempt < maxRetries && shouldRetry(ex))
        {
            // Delay doubles per attempt: base, 2x base, 4x base, ...
            var currentDelay = TimeSpan.FromMilliseconds(delay.TotalMilliseconds * Math.Pow(2, attempt));
            Console.WriteLine($"Attempt {attempt + 1} failed: {ex.Message}. Retrying in {currentDelay.TotalSeconds}s...");
            await Task.Delay(currentDelay);
        }
    }
}
7. Structured Logging and Error Monitoring
Implement comprehensive logging for debugging and monitoring:
/// <summary>
/// Writes Puppeteer failures to an <see cref="ILogger"/> together with structured context.
/// </summary>
public class PuppeteerLogger
{
    private readonly ILogger _logger;

    /// <summary>Creates a logger wrapper around <paramref name="logger"/>.</summary>
    /// <param name="logger">Destination logger.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is null.</exception>
    public PuppeteerLogger(ILogger logger)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Logs <paramref name="ex"/> as an error for the named operation.
    /// </summary>
    /// <param name="ex">The exception to log.</param>
    /// <param name="operation">Name of the operation that failed.</param>
    /// <param name="context">
    /// Optional extra key/value pairs; a key that matches a built-in entry
    /// (Operation, ExceptionType, Message, StackTrace) overrides it.
    /// </param>
    public void LogException(Exception ex, string operation, Dictionary<string, object>? context = null)
    {
        var logData = new Dictionary<string, object>
        {
            ["Operation"] = operation,
            ["ExceptionType"] = ex.GetType().Name,
            ["Message"] = ex.Message,
            // StackTrace is null for exceptions that were never thrown
            ["StackTrace"] = ex.StackTrace ?? string.Empty
        };

        if (context != null)
        {
            foreach (var kvp in context)
            {
                logData[kvp.Key] = kvp.Value;
            }
        }

        // Attach the collected data as a logging scope so it actually reaches the log
        // entry; without the scope the dictionary would be built and then discarded.
        using (_logger.BeginScope(logData))
        {
            _logger.LogError(ex, "Puppeteer operation failed: {Operation}", operation);
        }
    }
}
// Usage example: log a failed navigation with page context, then rethrow.
try
{
    await page.GoToAsync(url);
}
catch (Exception ex)
{
    // Reading the user agent talks to the page again, which can fail when the page or
    // browser is already broken. Guard it so a diagnostics failure never replaces the
    // original exception or prevents it from being logged.
    string userAgent;
    try
    {
        userAgent = await page.EvaluateExpressionAsync<string>("navigator.userAgent");
    }
    catch (Exception diagnosticsError)
    {
        userAgent = $"unavailable ({diagnosticsError.GetType().Name})";
    }

    logger.LogException(ex, "PageNavigation", new Dictionary<string, object>
    {
        ["Url"] = url,
        ["UserAgent"] = userAgent,
        ["Viewport"] = page.Viewport
    });
    throw; // Preserve the original exception and stack trace for the caller
}
8. Data Validation and Sanitization
Validate extracted data to prevent downstream errors:
/// <summary>
/// Validation helpers for values extracted from scraped pages.
/// </summary>
public static class DataValidator
{
    /// <summary>
    /// Trims <paramref name="input"/> and cuts it down to at most <paramref name="maxLength"/> characters.
    /// </summary>
    /// <param name="input">Raw text to validate.</param>
    /// <param name="fieldName">Field name used in error and warning messages.</param>
    /// <param name="maxLength">Maximum length of the returned text.</param>
    /// <returns>The trimmed text, truncated when it exceeds <paramref name="maxLength"/>.</returns>
    /// <exception cref="ArgumentException"><paramref name="input"/> is null, empty, or whitespace only.</exception>
    public static string ValidateAndSanitizeText(string input, string fieldName, int maxLength = 1000)
    {
        if (string.IsNullOrWhiteSpace(input))
        {
            throw new ArgumentException($"{fieldName} cannot be null or empty");
        }

        var trimmed = input.Trim();
        if (trimmed.Length <= maxLength)
        {
            return trimmed;
        }

        // Too long: warn on the console, then keep only the leading maxLength characters.
        Console.WriteLine($"Warning: {fieldName} truncated from {trimmed.Length} to {maxLength} characters");
        return trimmed.Substring(0, maxLength);
    }

    /// <summary>
    /// Parses <paramref name="url"/> as an absolute HTTP or HTTPS URL.
    /// </summary>
    /// <param name="url">Text to parse.</param>
    /// <param name="fieldName">Field name used in error messages.</param>
    /// <returns>The parsed <see cref="Uri"/>.</returns>
    /// <exception cref="ArgumentException">
    /// <paramref name="url"/> is not an absolute URL, or its scheme is neither http nor https.
    /// </exception>
    public static Uri ValidateUrl(string url, string fieldName)
    {
        if (!Uri.TryCreate(url, UriKind.Absolute, out var validUri))
        {
            throw new ArgumentException($"{fieldName} is not a valid URL: {url}");
        }

        switch (validUri.Scheme)
        {
            case "http":
            case "https":
                return validUri;
            default:
                throw new ArgumentException($"{fieldName} must use HTTP or HTTPS: {url}");
        }
    }
}
9. Circuit Breaker Pattern for Failing Domains
Implement the circuit breaker pattern to stop sending requests to websites that fail consistently:
/// <summary>
/// Tracks failures per domain and rejects calls to a domain whose circuit is open.
/// </summary>
public class DomainCircuitBreaker
{
    // One circuit per domain, created on first use.
    private readonly Dictionary<string, CircuitState> _circuits = new();
    // How long an open circuit rejects calls after its last failure.
    private readonly TimeSpan _timeout = TimeSpan.FromMinutes(5);
    // Failure count at which a circuit is opened.
    private readonly int _failureThreshold = 5;

    /// <summary>
    /// Runs <paramref name="operation"/> under the circuit for <paramref name="domain"/>.
    /// </summary>
    /// <typeparam name="T">Result type of the operation.</typeparam>
    /// <param name="domain">Key identifying the circuit to use.</param>
    /// <param name="operation">The operation to execute.</param>
    /// <returns>The result of <paramref name="operation"/>.</returns>
    /// <exception cref="InvalidOperationException">
    /// The circuit is open and the timeout since its last failure has not yet elapsed.
    /// </exception>
    /// <remarks>Any exception thrown by <paramref name="operation"/> is recorded and rethrown.</remarks>
    public async Task<T> ExecuteAsync<T>(string domain, Func<Task<T>> operation)
    {
        var state = GetOrCreateCircuit(domain);

        if (state.State == CircuitBreakerState.Open)
        {
            var sinceLastFailure = DateTime.UtcNow - state.LastFailure;
            if (sinceLastFailure < _timeout)
            {
                throw new InvalidOperationException($"Circuit breaker is open for domain: {domain}");
            }

            // Timeout elapsed: let this call through as a trial.
            state.State = CircuitBreakerState.HalfOpen;
        }

        try
        {
            var outcome = await operation();
            state.Reset();
            return outcome;
        }
        catch (Exception)
        {
            state.RecordFailure();

            var thresholdReached = state.FailureCount >= _failureThreshold;
            if (thresholdReached)
            {
                state.State = CircuitBreakerState.Open;
            }

            throw;
        }
    }

    /// <summary>Returns the circuit stored for <paramref name="domain"/>, creating it when absent.</summary>
    private CircuitState GetOrCreateCircuit(string domain)
    {
        if (_circuits.TryGetValue(domain, out var existing))
        {
            return existing;
        }

        var created = new CircuitState();
        _circuits[domain] = created;
        return created;
    }
}
10. Health Checks and Browser State Monitoring
Monitor browser health and implement recovery mechanisms:
/// <summary>
/// Checks whether <paramref name="browser"/> still responds by listing its pages and
/// opening a temporary page.
/// </summary>
/// <param name="browser">Browser to probe.</param>
/// <returns>
/// <c>true</c> when the probe completes; <c>false</c> when any step throws (the error
/// message is written to the console).
/// </returns>
public async Task<bool> IsBrowserHealthyAsync(IBrowser browser)
{
    try
    {
        var openPages = await browser.PagesAsync();

        // A temporary page is opened in every case; it is disposed when this scope ends.
        await using var probePage = await browser.NewPageAsync();

        if (openPages.Length == 0)
        {
            // No pages were open: additionally verify the browser can navigate.
            var blankNavigation = new NavigationOptions { Timeout = 5000 };
            await probePage.GoToAsync("about:blank", blankNavigation);
        }

        return true;
    }
    catch (Exception probeError)
    {
        Console.WriteLine($"Browser health check failed: {probeError.Message}");
        return false;
    }
}
By implementing these comprehensive error handling practices, your Puppeteer-Sharp applications will be more resilient, maintainable, and capable of handling the unpredictable nature of web scraping environments. Remember to always test your error handling logic with various failure scenarios to ensure robust operation in production.