How do I implement rate limiting in my C# web scraping tool?

Rate limiting is crucial for responsible web scraping in C#. It prevents server overload, respects website policies, and reduces the risk of IP blocking. Here are several effective approaches to implementing rate limiting in your C# web scraping tool.

Method 1: Using SemaphoreSlim for Concurrent Request Control

A simple and effective approach for modern C# applications uses SemaphoreSlim to cap how many requests run concurrently and to enforce a delay between them:

using System;
using System.Collections.Generic;
using System.Net.Http;
using System.Threading;
using System.Threading.Tasks;

public class RateLimitedScraper : IDisposable
{
    private readonly HttpClient _httpClient;
    private readonly SemaphoreSlim _semaphore;
    private readonly int _delayBetweenRequests;

    public RateLimitedScraper(int maxConcurrentRequests = 1, int delayMs = 1000)
    {
        _httpClient = new HttpClient();
        _semaphore = new SemaphoreSlim(maxConcurrentRequests, maxConcurrentRequests);
        _delayBetweenRequests = delayMs;
    }

    public async Task<string> ScrapeUrlAsync(string url)
    {
        await _semaphore.WaitAsync(); // Wait for a free request slot
        try
        {
            var response = await _httpClient.GetAsync(url);
            response.EnsureSuccessStatusCode();

            // Keep the slot occupied during the delay so requests stay spaced out
            await Task.Delay(_delayBetweenRequests);
            return await response.Content.ReadAsStringAsync();
        }
        finally
        {
            _semaphore.Release();
        }
    }

    public async Task<List<string>> ScrapeMultipleUrlsAsync(IEnumerable<string> urls)
    {
        var tasks = new List<Task<string>>();

        foreach (var url in urls)
        {
            tasks.Add(ScrapeUrlAsync(url));
        }

        return new List<string>(await Task.WhenAll(tasks));
    }

    public void Dispose()
    {
        _httpClient?.Dispose();
        _semaphore?.Dispose();
    }
}

// Usage
class Program
{
    static async Task Main(string[] args)
    {
        using var scraper = new RateLimitedScraper(maxConcurrentRequests: 2, delayMs: 1500);

        var urls = new[]
        {
            "https://example.com/page1",
            "https://example.com/page2",
            "https://example.com/page3"
        };

        var results = await scraper.ScrapeMultipleUrlsAsync(urls);

        foreach (var content in results)
        {
            Console.WriteLine($"Content length: {content.Length}");
        }
    }
}

Method 2: Custom Token Bucket Rate Limiter

For more sophisticated rate limiting, implement a token bucket algorithm:

using System;
using System.Threading;
using System.Threading.Tasks;

public class TokenBucketRateLimiter : IDisposable
{
    private readonly int _maxTokens;
    private readonly TimeSpan _refillInterval;
    private readonly SemaphoreSlim _semaphore;
    private readonly Timer _refillTimer;
    private int _currentTokens;

    public TokenBucketRateLimiter(int maxTokens, TimeSpan refillInterval)
    {
        _maxTokens = maxTokens;
        _refillInterval = refillInterval;
        _currentTokens = maxTokens;
        _semaphore = new SemaphoreSlim(1, 1);

        _refillTimer = new Timer(RefillTokens, null, refillInterval, refillInterval);
    }

    public async Task<bool> TryConsumeTokenAsync()
    {
        await _semaphore.WaitAsync();
        try
        {
            if (_currentTokens > 0)
            {
                _currentTokens--;
                return true;
            }
            return false;
        }
        finally
        {
            _semaphore.Release();
        }
    }

    // Timer callbacks must return void; any exception thrown here is unobserved
    private async void RefillTokens(object state)
    {
        await _semaphore.WaitAsync();
        try
        {
            _currentTokens = Math.Min(_currentTokens + 1, _maxTokens);
        }
        finally
        {
            _semaphore.Release();
        }
    }

    public void Dispose()
    {
        _refillTimer?.Dispose();
        _semaphore?.Dispose();
    }
}

// Usage with web scraping
public class TokenBucketScraper
{
    private readonly HttpClient _httpClient;
    private readonly TokenBucketRateLimiter _rateLimiter;

    public TokenBucketScraper(int tokensPerSecond)
    {
        _httpClient = new HttpClient();
        _rateLimiter = new TokenBucketRateLimiter(
            maxTokens: tokensPerSecond,
            // One token is refilled per interval, so shorten the interval to
            // sustain roughly tokensPerSecond requests per second
            refillInterval: TimeSpan.FromSeconds(1.0 / tokensPerSecond)
        );
    }

    public async Task<string> ScrapeWithRateLimitAsync(string url)
    {
        while (!await _rateLimiter.TryConsumeTokenAsync())
        {
            await Task.Delay(100); // Wait before trying again
        }

        var response = await _httpClient.GetAsync(url);
        response.EnsureSuccessStatusCode();
        return await response.Content.ReadAsStringAsync();
    }
}
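
If you're targeting .NET 7 or later, the built-in System.Threading.RateLimiting APIs already ship a token bucket, so a hand-rolled limiter may be unnecessary. Below is a minimal sketch; note that this TokenBucketRateLimiter is the framework type, not the custom class above, and the option values are illustrative rather than recommendations:

using System;
using System.Net.Http;
using System.Threading.RateLimiting; // built into .NET 7+
using System.Threading.Tasks;

public class BuiltInTokenBucketScraper
{
    private readonly HttpClient _httpClient = new HttpClient();

    private readonly TokenBucketRateLimiter _limiter = new TokenBucketRateLimiter(
        new TokenBucketRateLimiterOptions
        {
            TokenLimit = 5,                                // bucket capacity
            TokensPerPeriod = 5,                           // tokens added per replenishment
            ReplenishmentPeriod = TimeSpan.FromSeconds(1), // how often tokens are added
            QueueLimit = 100,                              // how many callers may wait for a token
            QueueProcessingOrder = QueueProcessingOrder.OldestFirst,
            AutoReplenishment = true
        });

    public async Task<string> ScrapeAsync(string url)
    {
        // AcquireAsync waits until a token is available (or the queue limit is exceeded)
        using RateLimitLease lease = await _limiter.AcquireAsync();
        if (!lease.IsAcquired)
        {
            throw new InvalidOperationException($"Rate limiter rejected the request for {url}");
        }

        var response = await _httpClient.GetAsync(url);
        response.EnsureSuccessStatusCode();
        return await response.Content.ReadAsStringAsync();
    }
}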

Method 3: Using Polly for Advanced Rate Limiting

Install the Polly package for resilience patterns:

Install-Package Polly.Extensions.Http

using System;
using System.Net.Http;
using System.Threading;
using System.Threading.Tasks;
using Polly;
using Polly.Extensions.Http;
using Polly.CircuitBreaker;

public class PollyScraper
{
    private readonly HttpClient _httpClient;
    private readonly IAsyncPolicy<HttpResponseMessage> _combinedPolicy;

    public PollyScraper()
    {
        var retryPolicy = HttpPolicyExtensions
            .HandleTransientHttpError()
            .Or<TaskCanceledException>()
            .WaitAndRetryAsync(
                retryCount: 3,
                sleepDurationProvider: retryAttempt =>
                    TimeSpan.FromSeconds(Math.Pow(2, retryAttempt)) // Exponential backoff
            );

        var circuitBreakerPolicy = HttpPolicyExtensions
            .HandleTransientHttpError()
            .CircuitBreakerAsync(
                handledEventsAllowedBeforeBreaking: 3,
                durationOfBreak: TimeSpan.FromSeconds(30)
            );

        var timeoutPolicy = Policy.TimeoutAsync<HttpResponseMessage>(10);

        // Outermost to innermost: retry wraps the circuit breaker, which wraps the per-attempt timeout
        _combinedPolicy = Policy.WrapAsync(retryPolicy, circuitBreakerPolicy, timeoutPolicy);

        _httpClient = new HttpClient();
        _httpClient.Timeout = TimeSpan.FromSeconds(30);
    }

    public async Task<string> ScrapeWithPollyAsync(string url)
    {
        try
        {
            var response = await _combinedPolicy.ExecuteAsync(async cancellationToken =>
            {
                await Task.Delay(1000, cancellationToken); // Rate limiting delay before each attempt
                return await _httpClient.GetAsync(url, cancellationToken);
            }, CancellationToken.None);

            response.EnsureSuccessStatusCode();
            return await response.Content.ReadAsStringAsync();
        }
        catch (BrokenCircuitException)
        {
            Console.WriteLine("Circuit breaker is open. Requests are being blocked.");
            throw;
        }
    }
}
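
Polly 7.2.3 and later also include a dedicated rate-limit policy in the Polly.RateLimit namespace. Unlike the delay-based approach above, it rejects calls that exceed the configured rate, so the caller is expected to wait and retry. A minimal sketch, assuming your Polly version provides Policy.RateLimitAsync (the two-requests-per-second figure is arbitrary):

using System;
using System.Net.Http;
using System.Threading.Tasks;
using Polly;
using Polly.RateLimit;

public class PollyRateLimitedScraper
{
    private readonly HttpClient _httpClient = new HttpClient();

    // Allow roughly 2 executions per second; excess calls throw RateLimitRejectedException
    private readonly IAsyncPolicy _rateLimitPolicy =
        Policy.RateLimitAsync(2, TimeSpan.FromSeconds(1));

    public async Task<string> ScrapeAsync(string url)
    {
        while (true)
        {
            try
            {
                var response = await _rateLimitPolicy.ExecuteAsync(() => _httpClient.GetAsync(url));
                response.EnsureSuccessStatusCode();
                return await response.Content.ReadAsStringAsync();
            }
            catch (RateLimitRejectedException ex)
            {
                // The exception reports how long to wait before a token becomes available
                await Task.Delay(ex.RetryAfter);
            }
        }
    }
}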

Method 4: Adaptive Rate Limiting with Response Monitoring

Implement intelligent rate limiting that adjusts based on server responses:

public class AdaptiveRateLimitedScraper
{
    private readonly HttpClient _httpClient;
    private int _currentDelayMs = 1000;
    private readonly int _minDelayMs = 500;
    private readonly int _maxDelayMs = 10000;

    public AdaptiveRateLimitedScraper()
    {
        _httpClient = new HttpClient();
    }

    public async Task<string> ScrapeWithAdaptiveRateLimitAsync(string url)
    {
        while (true)
        {
            try
            {
                await Task.Delay(_currentDelayMs);

                var response = await _httpClient.GetAsync(url);

                if (response.IsSuccessStatusCode)
                {
                    // Success - gradually decrease delay
                    _currentDelayMs = Math.Max(_minDelayMs, _currentDelayMs - 100);
                    return await response.Content.ReadAsStringAsync();
                }
                else if (response.StatusCode == System.Net.HttpStatusCode.TooManyRequests)
                {
                    // Rate limited - increase delay
                    _currentDelayMs = Math.Min(_maxDelayMs, _currentDelayMs * 2);

                    // Check for Retry-After header
                    if (response.Headers.RetryAfter?.Delta.HasValue == true)
                    {
                        await Task.Delay(response.Headers.RetryAfter.Delta.Value);
                    }

                    continue; // Retry the request
                }
                else
                {
                    response.EnsureSuccessStatusCode();
                }
            }
            catch (HttpRequestException ex)
            {
                Console.WriteLine($"Request failed: {ex.Message}");
                _currentDelayMs = Math.Min(_maxDelayMs, _currentDelayMs * 2);
                await Task.Delay(_currentDelayMs);
            }
        }
    }
}

Best Practices for Rate Limiting

1. Randomized Delays

Add jitter to avoid predictable patterns:

private static readonly Random _random = new Random();

private async Task RandomDelayAsync(int baseDelayMs)
{
    var jitter = _random.Next(-200, 201); // ±200ms variation
    var delay = Math.Max(100, baseDelayMs + jitter);
    await Task.Delay(delay);
}

2. Respect robots.txt and Server Headers

public async Task<bool> CheckRobotsPermissionAsync(string baseUrl, string userAgent = "*")
{
    try
    {
        var robotsUrl = new Uri(new Uri(baseUrl), "/robots.txt").ToString();
        var robotsContent = await _httpClient.GetStringAsync(robotsUrl);

        // Parse robots.txt content (implement parsing logic)
        return ParseRobotsPermission(robotsContent, userAgent);
    }
    catch
    {
        return true; // Assume allowed if robots.txt is not accessible
    }
}
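
ParseRobotsPermission is a placeholder in the snippet above. A minimal, illustrative version of such a helper (hypothetical, and deliberately simplistic: it only honors site-wide Disallow rules for the matching user agent and ignores Allow directives, wildcards, and path-specific rules) could look like this:

// Returns false only if a "Disallow: /" rule applies to the given user agent (or "*")
private bool ParseRobotsPermission(string robotsContent, string userAgent)
{
    var appliesToUs = false;

    foreach (var rawLine in robotsContent.Split('\n'))
    {
        var line = rawLine.Trim();

        if (line.StartsWith("User-agent:", StringComparison.OrdinalIgnoreCase))
        {
            var agent = line.Substring("User-agent:".Length).Trim();
            appliesToUs = agent == "*" || agent.Equals(userAgent, StringComparison.OrdinalIgnoreCase);
        }
        else if (appliesToUs && line.StartsWith("Disallow:", StringComparison.OrdinalIgnoreCase))
        {
            var path = line.Substring("Disallow:".Length).Trim();
            if (path == "/")
            {
                return false; // The entire site is disallowed for this agent
            }
        }
    }

    return true;
}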

3. Monitor Response Times

// Assumes _currentDelayMs and _maxDelayMs fields like those in the adaptive scraper above
public async Task<string> ScrapeWithMonitoringAsync(string url)
{
    var stopwatch = System.Diagnostics.Stopwatch.StartNew();

    var response = await _httpClient.GetAsync(url);
    stopwatch.Stop();

    // Adjust delay based on response time
    if (stopwatch.ElapsedMilliseconds > 5000) // Slow response
    {
        _currentDelayMs = Math.Min(_maxDelayMs, _currentDelayMs * 2);
    }

    await Task.Delay(_currentDelayMs);
    return await response.Content.ReadAsStringAsync();
}

Key Considerations

  • Start conservative: Begin with longer delays (2-3 seconds) and adjust based on server response
  • Handle HTTP 429: Always implement proper handling for "Too Many Requests" responses
  • Use connection pooling: Reuse HttpClient instances to avoid socket exhaustion (see the sketch after this list)
  • Monitor server health: Watch for slow responses or error rates that indicate overloading
  • Implement exponential backoff: Increase delays progressively when encountering errors
  • Consider time-based limits: Some sites have hourly or daily request limits
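
For the connection-pooling point above, a minimal sketch is a single shared HttpClient built on SocketsHttpHandler (available since .NET Core 2.1); the lifetime and connection-count values below are illustrative assumptions, not tuned recommendations:

using System;
using System.Net.Http;

public static class ScraperHttpClient
{
    // One shared HttpClient for the whole scraper avoids socket exhaustion.
    // PooledConnectionLifetime lets the handler pick up DNS changes periodically.
    public static readonly HttpClient Instance = new HttpClient(new SocketsHttpHandler
    {
        PooledConnectionLifetime = TimeSpan.FromMinutes(5), // recycle connections periodically
        MaxConnectionsPerServer = 4                         // keep per-host concurrency modest
    })
    {
        Timeout = TimeSpan.FromSeconds(30)
    };
}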

Remember that responsible web scraping protects both your application and the target servers, ensuring sustainable access to web data.
