How can I use C# to scrape and process images from the web?

Web scraping images with C# involves downloading HTML content, extracting image URLs, and processing the images with libraries such as HtmlAgilityPack (HTML parsing) and SixLabors.ImageSharp (image manipulation). This guide covers the complete workflow from setup to advanced techniques.

Required Dependencies

First, install the necessary NuGet packages (HtmlAgilityPack for HTML parsing, SixLabors.ImageSharp for image processing):

dotnet add package HtmlAgilityPack
dotnet add package SixLabors.ImageSharp

Basic Image Scraping Example

Here's a comprehensive example that demonstrates the core concepts:

using System;
using System.Collections.Generic;
using System.IO;
using System.Net.Http;
using System.Threading.Tasks;
using HtmlAgilityPack;
using SixLabors.ImageSharp;
using SixLabors.ImageSharp.Processing;
using SixLabors.ImageSharp.Formats.Jpeg;

public class ImageScraper : IDisposable
{
    private readonly HttpClient _httpClient;

    public ImageScraper()
    {
        _httpClient = new HttpClient();
        _httpClient.DefaultRequestHeaders.Add("User-Agent", 
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36");
    }

    public async Task<List<string>> ScrapeImagesAsync(string url, string outputDirectory = "images")
    {
        var downloadedImages = new List<string>();

        try
        {
            // Create output directory if it doesn't exist
            Directory.CreateDirectory(outputDirectory);

            // Download the web page
            string html = await _httpClient.GetStringAsync(url);

            // Parse HTML content
            var htmlDoc = new HtmlDocument();
            htmlDoc.LoadHtml(html);

            // Extract image URLs using XPath
            var imageNodes = htmlDoc.DocumentNode.SelectNodes("//img[@src]");

            if (imageNodes == null)
            {
                Console.WriteLine("No images found on the page.");
                return downloadedImages;
            }

            Console.WriteLine($"Found {imageNodes.Count} images to process.");

            foreach (var img in imageNodes)
            {
                string imageUrl = img.GetAttributeValue("src", "");
                string altText = img.GetAttributeValue("alt", "");

                if (!string.IsNullOrEmpty(imageUrl))
                {
                    string filename = await DownloadAndProcessImageAsync(url, imageUrl, outputDirectory, altText);
                    if (!string.IsNullOrEmpty(filename))
                    {
                        downloadedImages.Add(filename);
                    }
                }
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error scraping images: {ex.Message}");
        }

        return downloadedImages;
    }

    private async Task<string> DownloadAndProcessImageAsync(string baseUrl, string imageUrl, 
        string outputDirectory, string altText)
    {
        try
        {
            // Convert relative URLs to absolute
            Uri absoluteUri = new Uri(new Uri(baseUrl), imageUrl);

            // Download image
            byte[] imageBytes = await _httpClient.GetByteArrayAsync(absoluteUri);

            // Generate filename
            string filename = GenerateFilename(absoluteUri, altText);
            string filePath = Path.Combine(outputDirectory, filename);

            // Save original image
            await File.WriteAllBytesAsync(filePath, imageBytes);

            // Process image (resize and optimize)
            await ProcessImageAsync(imageBytes, filePath, outputDirectory);

            Console.WriteLine($"Downloaded and processed: {filename}");
            return filename;
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error downloading image {imageUrl}: {ex.Message}");
            return null;
        }
    }

    private async Task ProcessImageAsync(byte[] imageBytes, string originalPath, string outputDirectory)
    {
        try
        {
            // Image.Load returns a non-generic Image; Clone(operation) produces
            // a transformed copy without touching the original
            using var image = Image.Load(imageBytes);

            // Create thumbnail (300x300 max, aspect ratio preserved)
            var thumbnailPath = Path.Combine(outputDirectory, $"thumb_{Path.GetFileName(originalPath)}");
            using var thumbnail = image.Clone(x => x.Resize(new ResizeOptions
            {
                Size = new Size(300, 300),
                Mode = ResizeMode.Max
            }));
            await thumbnail.SaveAsJpegAsync(thumbnailPath, new JpegEncoder { Quality = 85 });

            // Create medium size (800px max width; height 0 keeps the aspect ratio)
            var mediumPath = Path.Combine(outputDirectory, $"medium_{Path.GetFileName(originalPath)}");
            using var medium = image.Clone(x => x.Resize(new ResizeOptions
            {
                Size = new Size(800, 0),
                Mode = ResizeMode.Max
            }));
            await medium.SaveAsJpegAsync(mediumPath, new JpegEncoder { Quality = 90 });
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error processing image: {ex.Message}");
        }
    }

    private string GenerateFilename(Uri imageUri, string altText)
    {
        string filename = Path.GetFileName(imageUri.LocalPath);

        // If no filename could be extracted, fall back to alt text or a timestamp
        if (string.IsNullOrEmpty(filename))
        {
            if (!string.IsNullOrEmpty(altText))
            {
                // Strip characters that are invalid in file names before using alt text
                string safeAlt = string.Concat(altText.Split(Path.GetInvalidFileNameChars()));
                filename = $"{safeAlt.Replace(" ", "_")}.jpg";
            }
            else
            {
                filename = $"image_{DateTime.Now:yyyyMMdd_HHmmss}.jpg";
            }
        }

        return filename;
    }

    public void Dispose()
    {
        _httpClient?.Dispose();
    }
}

Advanced Image Processing Examples

1. Batch Processing with Different Operations

public async Task ProcessImagesWithVariousOperationsAsync(string inputDirectory)
{
    // Directory.GetFiles does not support brace patterns like "*.{jpg,png}",
    // so enumerate everything and filter by extension (requires using System.Linq;)
    var extensions = new[] { ".jpg", ".jpeg", ".png", ".gif" };
    var imageFiles = Directory.EnumerateFiles(inputDirectory, "*.*", SearchOption.TopDirectoryOnly)
        .Where(f => extensions.Contains(Path.GetExtension(f), StringComparer.OrdinalIgnoreCase));

    foreach (string imagePath in imageFiles)
    {
        try
        {
            using var image = await Image.LoadAsync(imagePath);

            // Apply various transformations
            image.Mutate(x => x
                .Resize(800, 600)
                .Grayscale()
                .GaussianBlur(1.5f)
                .Contrast(1.2f)
                .Brightness(1.1f)
            );

            string outputPath = Path.Combine(inputDirectory, $"processed_{Path.GetFileName(imagePath)}");
            await image.SaveAsJpegAsync(outputPath, new JpegEncoder { Quality = 90 });
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error processing {imagePath}: {ex.Message}");
        }
    }
}

2. Format Conversion and Optimization

public async Task ConvertAndOptimizeAsync(string imagePath, string outputDirectory)
{
    using var image = await Image.LoadAsync(imagePath);

    // Convert to WebP for web optimization
    string webpPath = Path.Combine(outputDirectory, Path.ChangeExtension(Path.GetFileName(imagePath), ".webp"));
    await image.SaveAsWebpAsync(webpPath);

    // Convert to PNG with transparency preserved
    string pngPath = Path.Combine(outputDirectory, Path.ChangeExtension(Path.GetFileName(imagePath), ".png"));
    await image.SaveAsPngAsync(pngPath);

    // High-quality JPEG
    string jpegPath = Path.Combine(outputDirectory, Path.ChangeExtension(Path.GetFileName(imagePath), ".jpg"));
    await image.SaveAsJpegAsync(jpegPath, new JpegEncoder { Quality = 95 });
}
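
ImageSharp's WebpEncoder (in SixLabors.ImageSharp.Formats.Webp) exposes quality and lossless settings if you need finer control than the default encoder gives you. A minimal sketch, reusing the image and webpPath variables from the example above:

using SixLabors.ImageSharp.Formats.Webp;

// Lossy WebP at an explicit quality level instead of the encoder defaults
await image.SaveAsWebpAsync(webpPath, new WebpEncoder
{
    Quality = 80,                          // 0-100; higher means larger files
    FileFormat = WebpFileFormatType.Lossy  // use Lossless to preserve exact pixels
});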

Advanced Scraping Techniques

1. Handling Different Image Sources

public List<string> ExtractAllImageUrls(HtmlDocument document, string baseUrl)
{
    var imageUrls = new List<string>();

    // Standard img tags
    var imgTags = document.DocumentNode.SelectNodes("//img[@src]");
    if (imgTags != null)
    {
        imageUrls.AddRange(imgTags.Select(img => img.GetAttributeValue("src", "")));
    }

    // Background images in CSS
    var elementsWithBackground = document.DocumentNode.SelectNodes("//*[@style]");
    if (elementsWithBackground != null)
    {
        foreach (var element in elementsWithBackground)
        {
            string style = element.GetAttributeValue("style", "");
            var match = System.Text.RegularExpressions.Regex.Match(style, @"background-image:\s*url\(['""]?([^'""]+)['""]?\)");
            if (match.Success)
            {
                imageUrls.Add(match.Groups[1].Value);
            }
        }
    }

    // Data attributes (lazy loading)
    var lazyImages = document.DocumentNode.SelectNodes("//img[@data-src]");
    if (lazyImages != null)
    {
        imageUrls.AddRange(lazyImages.Select(img => img.GetAttributeValue("data-src", "")));
    }

    // Resolve relative URLs against the page URL, then de-duplicate
    return imageUrls
        .Where(u => !string.IsNullOrEmpty(u))
        .Select(u => new Uri(new Uri(baseUrl), u).ToString())
        .Distinct()
        .ToList();
}
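
A quick usage sketch for this extractor (the gallery URL is a placeholder):

// Fetch a page, parse it, and print every image URL found
using var httpClient = new HttpClient();
string html = await httpClient.GetStringAsync("https://example.com/gallery");

var doc = new HtmlDocument();
doc.LoadHtml(html);

foreach (string url in ExtractAllImageUrls(doc, "https://example.com/gallery"))
{
    Console.WriteLine(url);
}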

2. Parallel Downloads with Concurrency Throttling

public async Task<List<string>> DownloadImagesParallelAsync(List<string> imageUrls, string baseUrl, int maxConcurrency = 3)
{
    var semaphore = new SemaphoreSlim(maxConcurrency);
    var downloadTasks = imageUrls.Select(async url =>
    {
        await semaphore.WaitAsync();
        try
        {
            return await DownloadSingleImageAsync(baseUrl, url);
        }
        finally
        {
            semaphore.Release();
        }
    });

    var results = await Task.WhenAll(downloadTasks);
    return results.Where(r => !string.IsNullOrEmpty(r)).ToList();
}
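
DownloadSingleImageAsync is not defined above; a minimal sketch, assuming it should resolve the URL against the page, download the bytes via the scraper's _httpClient field, and return the saved file path (null on failure):

private async Task<string> DownloadSingleImageAsync(string baseUrl, string imageUrl)
{
    try
    {
        // Resolve relative URLs against the page URL
        var absoluteUri = new Uri(new Uri(baseUrl), imageUrl);
        byte[] bytes = await _httpClient.GetByteArrayAsync(absoluteUri);

        string filename = Path.GetFileName(absoluteUri.LocalPath);
        if (string.IsNullOrEmpty(filename))
        {
            filename = $"image_{Guid.NewGuid():N}.jpg";
        }

        string path = Path.Combine("images", filename);
        await File.WriteAllBytesAsync(path, bytes);
        return path;
    }
    catch (Exception ex)
    {
        Console.WriteLine($"Failed to download {imageUrl}: {ex.Message}");
        return null;
    }
}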

Usage Example

class Program
{
    static async Task Main(string[] args)
    {
        // ImageScraper implements IDisposable, so a using declaration handles cleanup
        using var scraper = new ImageScraper();

        string targetUrl = "https://example.com/gallery";
        var downloadedImages = await scraper.ScrapeImagesAsync(targetUrl, "downloaded_images");

        Console.WriteLine($"Successfully downloaded {downloadedImages.Count} images.");

        // Process images further if needed
        await scraper.ProcessImagesWithVariousOperationsAsync("downloaded_images");
    }
}

Best Practices and Considerations

Legal and Ethical Guidelines

  • Always check robots.txt (e.g., https://example.com/robots.txt) before scraping; a minimal checker is sketched after this list
  • Respect copyright and fair use policies - only download images you have permission to use
  • Review terms of service of the target website
  • Implement proper attribution when required
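
.NET has no built-in robots.txt parser. Below is a deliberately simplified check for the wildcard user agent; a production crawler should also honor agent-specific groups, Allow rules, and Crawl-delay:

public static async Task<bool> IsPathAllowedAsync(HttpClient client, string siteRoot, string path)
{
    try
    {
        // Fetch robots.txt from the site root, e.g. https://example.com/robots.txt
        string robots = await client.GetStringAsync(new Uri(new Uri(siteRoot), "/robots.txt"));

        bool appliesToUs = false;
        foreach (string rawLine in robots.Split('\n'))
        {
            string line = rawLine.Trim();
            if (line.StartsWith("User-agent:", StringComparison.OrdinalIgnoreCase))
            {
                appliesToUs = line.EndsWith("*");
            }
            else if (appliesToUs && line.StartsWith("Disallow:", StringComparison.OrdinalIgnoreCase))
            {
                string rule = line.Substring("Disallow:".Length).Trim();
                if (rule.Length > 0 && path.StartsWith(rule))
                {
                    return false; // path matches a Disallow rule for User-agent: *
                }
            }
        }
        return true;
    }
    catch (HttpRequestException)
    {
        return true; // no reachable robots.txt usually means no restrictions
    }
}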

Technical Best Practices

  • Use appropriate User-Agent headers to identify your scraper
  • Implement rate limiting to avoid overwhelming servers (typically 1-2 requests per second); a sketch combining throttling with validation follows this list
  • Handle different image formats (JPEG, PNG, WebP, SVG)
  • Validate image content before processing to avoid corrupted files
  • Use connection pooling with HttpClient for better performance
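
A sketch pulling several of these points together: an explicit timeout, a fixed delay between requests, and content validation with ImageSharp's Image.Identify before anything is written to disk. The 500 ms delay is an assumption; tune it per site:

private static readonly HttpClient Client = new HttpClient
{
    Timeout = TimeSpan.FromSeconds(30) // don't hang indefinitely on slow hosts
};

public async Task<byte[]> PoliteDownloadAsync(Uri imageUri)
{
    // Crude rate limiting: pause before each request (~2 requests per second)
    await Task.Delay(500);

    byte[] bytes = await Client.GetByteArrayAsync(imageUri);

    // Validate that the payload really is a decodable image before saving it
    try
    {
        var info = Image.Identify(bytes); // throws on unrecognized formats in ImageSharp 3.x
        Console.WriteLine($"Got {info.Width}x{info.Height} image from {imageUri}");
    }
    catch (Exception)
    {
        throw new InvalidDataException($"{imageUri} did not return a valid image.");
    }

    return bytes;
}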

Error Handling and Resilience

  • Implement retry logic for failed downloads (a backoff sketch follows this list)
  • Validate URLs before attempting downloads
  • Handle HTTP status codes appropriately (404, 403, 429)
  • Set reasonable timeouts for HTTP requests
  • Log errors for debugging and monitoring
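
A minimal retry sketch with exponential backoff, reusing the scraper's _httpClient field: permanent failures (404, 403) are surfaced immediately, while transient ones (network errors, 429, 5xx) are retried. The attempt count and base delay are illustrative:

// Requires: using System.Net;
public async Task<byte[]> DownloadWithRetryAsync(string url, int maxAttempts = 3)
{
    for (int attempt = 1; ; attempt++)
    {
        try
        {
            using var response = await _httpClient.GetAsync(url);

            // Retrying a 404 or 403 won't help, so fail fast
            if (response.StatusCode == HttpStatusCode.NotFound ||
                response.StatusCode == HttpStatusCode.Forbidden)
            {
                throw new InvalidOperationException($"{(int)response.StatusCode} for {url}");
            }

            response.EnsureSuccessStatusCode(); // throws HttpRequestException on 429, 5xx, ...
            return await response.Content.ReadAsByteArrayAsync();
        }
        catch (HttpRequestException) when (attempt < maxAttempts)
        {
            // Exponential backoff: 1s, 2s, 4s, ...
            var delay = TimeSpan.FromSeconds(Math.Pow(2, attempt - 1));
            Console.WriteLine($"Attempt {attempt} for {url} failed; retrying in {delay.TotalSeconds}s");
            await Task.Delay(delay);
        }
    }
}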

Performance Optimization

  • Use async/await throughout for better scalability
  • Implement concurrent downloads with proper throttling
  • Cache frequently accessed images to avoid re-downloading (see the sketch after this list)
  • Compress processed images for storage efficiency
  • Consider using CDN for serving processed images
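
A simple caching sketch: derive a deterministic file name from a SHA-256 hash of the URL and skip the network entirely on a cache hit. The cache directory name is illustrative; SHA256.HashData and Convert.ToHexString require .NET 5+:

public async Task<string> GetOrDownloadAsync(Uri imageUri, string cacheDirectory = "cache")
{
    Directory.CreateDirectory(cacheDirectory);

    // Deterministic cache key: hex-encoded hash of the absolute URL
    byte[] hash = System.Security.Cryptography.SHA256.HashData(
        System.Text.Encoding.UTF8.GetBytes(imageUri.AbsoluteUri));
    string cachePath = Path.Combine(cacheDirectory, Convert.ToHexString(hash) + ".bin");

    if (File.Exists(cachePath))
    {
        return cachePath; // cache hit: no request needed
    }

    byte[] bytes = await _httpClient.GetByteArrayAsync(imageUri);
    await File.WriteAllBytesAsync(cachePath, bytes);
    return cachePath;
}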

This comprehensive approach ensures your C# image scraping solution is robust, efficient, and respectful of web resources while providing powerful image processing capabilities.

Try WebScraping.AI for Your Web Scraping Needs

Looking for a powerful web scraping solution? WebScraping.AI provides an LLM-powered API that combines Chromium JavaScript rendering with rotating proxies for reliable data extraction.

Key Features:

  • AI-powered extraction: Ask questions about web pages or extract structured data fields
  • JavaScript rendering: Full Chromium browser support for dynamic content
  • Rotating proxies: Datacenter and residential proxies from multiple countries
  • Easy integration: Simple REST API with SDKs for Python, Ruby, PHP, and more
  • Reliable & scalable: Built for developers who need consistent results

Getting Started:

Get page content with AI analysis:

curl "https://api.webscraping.ai/ai/question?url=https://example.com&question=What is the main topic?&api_key=YOUR_API_KEY"

Extract structured data:

curl "https://api.webscraping.ai/ai/fields?url=https://example.com&fields[title]=Page title&fields[price]=Product price&api_key=YOUR_API_KEY"
