Web scraping images with C# involves downloading HTML content, extracting image URLs, and processing the images using powerful libraries. This guide covers the complete workflow from setup to advanced techniques.
Required Dependencies
First, install the necessary NuGet packages:
dotnet add package HtmlAgilityPack
dotnet add package SixLabors.ImageSharp
dotnet add package System.Threading.Tasks.Extensions   (only needed when targeting .NET Framework or netstandard2.0; modern .NET includes it)
Basic Image Scraping Example
Here's a comprehensive example that demonstrates the core concepts:
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net.Http;
using System.Threading;
using System.Threading.Tasks;
using HtmlAgilityPack;
using SixLabors.ImageSharp;
using SixLabors.ImageSharp.Processing;
using SixLabors.ImageSharp.Formats.Jpeg;
/// <summary>
/// Downloads a web page, extracts the images it references, and saves
/// the originals plus resized variants to a local directory.
/// Owns an <see cref="HttpClient"/>; dispose the scraper when done.
/// </summary>
public class ImageScraper : IDisposable
{
    private readonly HttpClient _httpClient;
    private bool _disposed;

    public ImageScraper()
    {
        _httpClient = new HttpClient();
        // Identify the scraper; many servers reject requests without a User-Agent.
        _httpClient.DefaultRequestHeaders.Add("User-Agent",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36");
    }

    /// <summary>
    /// Scrapes every &lt;img src&gt; on <paramref name="url"/> and downloads
    /// the images into <paramref name="outputDirectory"/>.
    /// </summary>
    /// <param name="url">Page to scrape.</param>
    /// <param name="outputDirectory">Directory for saved images; created if absent.</param>
    /// <returns>Filenames of the images that downloaded successfully.</returns>
    public async Task<List<string>> ScrapeImagesAsync(string url, string outputDirectory = "images")
    {
        var downloadedImages = new List<string>();
        try
        {
            // Create output directory if it doesn't exist (no-op when it does).
            Directory.CreateDirectory(outputDirectory);

            // Download the web page.
            string html = await _httpClient.GetStringAsync(url);

            // Parse HTML content.
            var htmlDoc = new HtmlDocument();
            htmlDoc.LoadHtml(html);

            // SelectNodes returns null (not an empty list) when nothing matches.
            var imageNodes = htmlDoc.DocumentNode.SelectNodes("//img[@src]");
            if (imageNodes == null)
            {
                Console.WriteLine("No images found on the page.");
                return downloadedImages;
            }

            Console.WriteLine($"Found {imageNodes.Count} images to process.");

            foreach (var img in imageNodes)
            {
                string imageUrl = img.GetAttributeValue("src", "");
                string altText = img.GetAttributeValue("alt", "");

                if (!string.IsNullOrEmpty(imageUrl))
                {
                    string filename = await DownloadAndProcessImageAsync(url, imageUrl, outputDirectory, altText);
                    if (!string.IsNullOrEmpty(filename))
                    {
                        downloadedImages.Add(filename);
                    }
                }
            }
        }
        catch (Exception ex)
        {
            // Best-effort batch: report and return whatever was downloaded so far.
            Console.WriteLine($"Error scraping images: {ex.Message}");
        }
        return downloadedImages;
    }

    /// <summary>
    /// Downloads one image (resolving relative URLs against the page URL),
    /// saves the original, and writes resized variants next to it.
    /// </summary>
    /// <returns>The saved filename, or null when the download or save failed.</returns>
    private async Task<string> DownloadAndProcessImageAsync(string baseUrl, string imageUrl,
        string outputDirectory, string altText)
    {
        try
        {
            // Convert relative URLs (e.g. "/img/a.png") to absolute ones.
            Uri absoluteUri = new Uri(new Uri(baseUrl), imageUrl);

            byte[] imageBytes = await _httpClient.GetByteArrayAsync(absoluteUri);

            string filename = GenerateFilename(absoluteUri, altText);
            string filePath = Path.Combine(outputDirectory, filename);

            // Save the untouched original before producing derivatives.
            await File.WriteAllBytesAsync(filePath, imageBytes);

            // Process image (thumbnail + medium size).
            await ProcessImageAsync(imageBytes, filePath, outputDirectory);

            Console.WriteLine($"Downloaded and processed: {filename}");
            return filename;
        }
        catch (Exception ex)
        {
            // One bad image must not abort the whole scrape.
            Console.WriteLine($"Error downloading image {imageUrl}: {ex.Message}");
            return null;
        }
    }

    /// <summary>
    /// Writes a 300px-max thumbnail and an 800px-max medium JPEG variant
    /// of <paramref name="imageBytes"/> into <paramref name="outputDirectory"/>.
    /// </summary>
    private async Task ProcessImageAsync(byte[] imageBytes, string originalPath, string outputDirectory)
    {
        try
        {
            using var image = Image.Load(imageBytes);

            // The non-generic Image has no parameterless Clone(); pass the
            // mutation as a delegate so the source image stays untouched.
            var thumbnailPath = Path.Combine(outputDirectory, $"thumb_{Path.GetFileName(originalPath)}");
            using (var thumbnail = image.Clone(ctx => ctx.Resize(new ResizeOptions
            {
                Size = new Size(300, 300),
                Mode = ResizeMode.Max // preserve aspect ratio, fit inside 300x300
            })))
            {
                await thumbnail.SaveAsJpegAsync(thumbnailPath, new JpegEncoder { Quality = 85 });
            }

            // Medium size: 800px max width, height follows the aspect ratio.
            var mediumPath = Path.Combine(outputDirectory, $"medium_{Path.GetFileName(originalPath)}");
            using (var medium = image.Clone(ctx => ctx.Resize(new ResizeOptions
            {
                Size = new Size(800, 0),
                Mode = ResizeMode.Max
            })))
            {
                await medium.SaveAsJpegAsync(mediumPath, new JpegEncoder { Quality = 90 });
            }
        }
        catch (Exception ex)
        {
            // Corrupt or unsupported formats land here; the original file is kept.
            Console.WriteLine($"Error processing image: {ex.Message}");
        }
    }

    /// <summary>
    /// Derives a safe local filename from the image URL, falling back to the
    /// alt text and then to a timestamp when the URL has no usable name.
    /// </summary>
    private string GenerateFilename(Uri imageUri, string altText)
    {
        // LocalPath excludes the query string, so "a.jpg?v=2" yields "a.jpg".
        string filename = Path.GetFileName(imageUri.LocalPath);

        if (string.IsNullOrEmpty(filename) || filename == "/")
        {
            if (!string.IsNullOrEmpty(altText))
            {
                filename = $"{altText.Replace(" ", "_")}.jpg";
            }
            else
            {
                filename = $"image_{DateTime.Now:yyyyMMdd_HHmmss}.jpg";
            }
        }

        // Alt text (or odd URLs) may contain characters illegal in file names.
        foreach (char invalid in Path.GetInvalidFileNameChars())
        {
            filename = filename.Replace(invalid, '_');
        }

        return filename;
    }

    public void Dispose()
    {
        if (_disposed)
        {
            return;
        }
        _disposed = true;
        _httpClient?.Dispose();
    }
}
Advanced Image Processing Examples
1. Batch Processing with Different Operations
/// <summary>
/// Applies a fixed chain of transformations (resize, grayscale, blur,
/// contrast, brightness) to every image in <paramref name="inputDirectory"/>
/// and saves each result as "processed_&lt;name&gt;" beside the original.
/// </summary>
public async Task ProcessImagesWithVariousOperationsAsync(string inputDirectory)
{
    // Directory.GetFiles does NOT support brace patterns like "*.{jpg,png}" —
    // that pattern matches nothing. Enumerate all files and filter by extension.
    var allowedExtensions = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
    {
        ".jpg", ".jpeg", ".png", ".gif"
    };

    // Materialize the list up front: we write new files into the same
    // directory, and skip our own "processed_" output on re-runs.
    var imageFiles = Directory.EnumerateFiles(inputDirectory, "*", SearchOption.TopDirectoryOnly)
        .Where(path => allowedExtensions.Contains(Path.GetExtension(path)))
        .Where(path => !Path.GetFileName(path).StartsWith("processed_", StringComparison.Ordinal))
        .ToList();

    foreach (string imagePath in imageFiles)
    {
        try
        {
            using var image = await Image.LoadAsync(imagePath);

            // Apply the transformation pipeline in one pass.
            image.Mutate(x => x
                .Resize(800, 600)
                .Grayscale()
                .GaussianBlur(1.5f)
                .Contrast(1.2f)
                .Brightness(1.1f)
            );

            string outputPath = Path.Combine(inputDirectory, $"processed_{Path.GetFileName(imagePath)}");
            await image.SaveAsJpegAsync(outputPath, new JpegEncoder { Quality = 90 });
        }
        catch (Exception ex)
        {
            // Keep going: one unreadable file must not stop the batch.
            Console.WriteLine($"Error processing {imagePath}: {ex.Message}");
        }
    }
}
2. Format Conversion and Optimization
/// <summary>
/// Saves the image at <paramref name="imagePath"/> into
/// <paramref name="outputDirectory"/> in three formats: WebP (web-optimized),
/// PNG (lossless, transparency preserved), and high-quality JPEG.
/// </summary>
public async Task ConvertAndOptimizeAsync(string imagePath, string outputDirectory)
{
    using var source = await Image.LoadAsync(imagePath);

    string baseName = Path.GetFileName(imagePath);
    // Build the destination path for a given target extension.
    string TargetPath(string extension) =>
        Path.Combine(outputDirectory, Path.ChangeExtension(baseName, extension));

    // WebP for web optimization.
    await source.SaveAsWebpAsync(TargetPath(".webp"));

    // PNG keeps any alpha channel intact.
    await source.SaveAsPngAsync(TargetPath(".png"));

    // High-quality JPEG fallback.
    await source.SaveAsJpegAsync(TargetPath(".jpg"), new JpegEncoder { Quality = 95 });
}
Advanced Scraping Techniques
1. Handling Different Image Sources
/// <summary>
/// Collects image URLs from three sources in the document: standard
/// &lt;img src&gt; tags, inline CSS background-image declarations, and
/// lazy-loading data-src attributes. Empty entries and duplicates are removed.
/// </summary>
/// <param name="document">Parsed HTML document.</param>
/// <param name="baseUrl">
/// NOTE(review): currently unused — URLs are returned as written in the page
/// (possibly relative); resolve them against baseUrl at download time.
/// </param>
public List<string> ExtractAllImageUrls(HtmlDocument document, string baseUrl)
{
    var imageUrls = new List<string>();

    // Standard <img src="..."> tags.
    var imgTags = document.DocumentNode.SelectNodes("//img[@src]");
    if (imgTags != null)
    {
        imageUrls.AddRange(imgTags.Select(img => img.GetAttributeValue("src", "")));
    }

    // Inline background images. A single style attribute can declare several,
    // so use Matches (not Match); excluding ')' from the URL class prevents
    // an unquoted url(...) from greedily swallowing later declarations.
    var elementsWithBackground = document.DocumentNode.SelectNodes("//*[@style]");
    if (elementsWithBackground != null)
    {
        foreach (var element in elementsWithBackground)
        {
            string style = element.GetAttributeValue("style", "");
            foreach (System.Text.RegularExpressions.Match match in
                System.Text.RegularExpressions.Regex.Matches(
                    style, @"background-image:\s*url\(['""]?([^'"")]+)['""]?\)"))
            {
                imageUrls.Add(match.Groups[1].Value);
            }
        }
    }

    // Lazy-loaded images keep the real URL in data-src.
    var lazyImages = document.DocumentNode.SelectNodes("//img[@data-src]");
    if (lazyImages != null)
    {
        imageUrls.AddRange(lazyImages.Select(img => img.GetAttributeValue("data-src", "")));
    }

    // Drop blanks (empty src/data-src attributes) and de-duplicate.
    return imageUrls
        .Where(url => !string.IsNullOrWhiteSpace(url))
        .Distinct()
        .ToList();
}
2. Parallel Download with Rate Limiting
/// <summary>
/// Downloads the given image URLs concurrently, with at most
/// <paramref name="maxConcurrency"/> requests in flight at once.
/// </summary>
/// <returns>Filenames of successful downloads; failures are filtered out.</returns>
public async Task<List<string>> DownloadImagesParallelAsync(List<string> imageUrls, string baseUrl, int maxConcurrency = 3)
{
    // SemaphoreSlim is IDisposable; dispose it when all downloads finish.
    // The second argument caps the count so a stray extra Release() throws
    // instead of silently raising the concurrency limit.
    using var semaphore = new SemaphoreSlim(maxConcurrency, maxConcurrency);

    // ToList materializes the lazy Select so every task is started exactly once.
    var downloadTasks = imageUrls.Select(async url =>
    {
        await semaphore.WaitAsync();
        try
        {
            return await DownloadSingleImageAsync(baseUrl, url);
        }
        finally
        {
            semaphore.Release();
        }
    }).ToList();

    var results = await Task.WhenAll(downloadTasks);
    return results.Where(r => !string.IsNullOrEmpty(r)).ToList();
}
Usage Example
class Program
{
    /// <summary>
    /// Entry point: scrapes a gallery page into a local directory,
    /// then runs the batch-processing pass over the downloads.
    /// </summary>
    static async Task Main(string[] args)
    {
        const string outputDirectory = "downloaded_images";

        var scraper = new ImageScraper();
        try
        {
            string targetUrl = "https://example.com/gallery";

            var downloadedImages = await scraper.ScrapeImagesAsync(targetUrl, outputDirectory);
            Console.WriteLine($"Successfully downloaded {downloadedImages.Count} images.");

            // Process images further if needed.
            await scraper.ProcessImagesWithVariousOperationsAsync(outputDirectory);
        }
        finally
        {
            // Release the scraper's HttpClient even if scraping threw.
            scraper.Dispose();
        }
    }
}
Best Practices and Considerations
Legal and Ethical Guidelines
- Always check robots.txt (e.g. https://website.com/robots.txt) before scraping
- Respect copyright and fair use policies — only download images you have permission to use
- Review terms of service of the target website
- Implement proper attribution when required
Technical Best Practices
- Use appropriate User-Agent headers to identify your scraper
- Implement rate limiting to avoid overwhelming servers (typically 1-2 requests per second)
- Handle different image formats (JPEG, PNG, WebP, SVG)
- Validate image content before processing to avoid corrupted files
- Use connection pooling with HttpClient for better performance
Error Handling and Resilience
- Implement retry logic for failed downloads
- Validate URLs before attempting downloads
- Handle HTTP status codes appropriately (404, 403, 429)
- Set reasonable timeouts for HTTP requests
- Log errors for debugging and monitoring
Performance Optimization
- Use async/await throughout for better scalability
- Implement concurrent downloads with proper throttling
- Cache frequently accessed images to avoid re-downloading
- Compress processed images for storage efficiency
- Consider using CDN for serving processed images
This comprehensive approach ensures your C# image scraping solution is robust, efficient, and respectful of web resources while providing powerful image processing capabilities.