Rate limiting is crucial for responsible web scraping in C#. It prevents server overload, respects website policies, and reduces the risk of IP blocking. Here are several effective approaches to implement rate limiting in your C# web scraping tool.
Method 1: Using SemaphoreSlim for Concurrent Request Control
A straightforward and effective approach for modern C# applications uses SemaphoreSlim to cap concurrency and enforce a delay between requests:
using System;
using System.Collections.Generic;
using System.Net.Http;
using System.Threading;
using System.Threading.Tasks;

public class RateLimitedScraper : IDisposable
{
    private readonly HttpClient _httpClient;
    private readonly SemaphoreSlim _semaphore;
    private readonly int _delayBetweenRequests;

    public RateLimitedScraper(int maxConcurrentRequests = 1, int delayMs = 1000)
    {
        _httpClient = new HttpClient();
        _semaphore = new SemaphoreSlim(maxConcurrentRequests, maxConcurrentRequests);
        _delayBetweenRequests = delayMs;
    }

    public async Task<string> ScrapeUrlAsync(string url)
    {
        await _semaphore.WaitAsync();
        try
        {
            var response = await _httpClient.GetAsync(url);
            response.EnsureSuccessStatusCode();

            // The delay runs while the semaphore is still held, so each
            // concurrency slot is forced to pause between requests.
            await Task.Delay(_delayBetweenRequests);
            return await response.Content.ReadAsStringAsync();
        }
        finally
        {
            _semaphore.Release();
        }
    }

    public async Task<List<string>> ScrapeMultipleUrlsAsync(IEnumerable<string> urls)
    {
        var tasks = new List<Task<string>>();
        foreach (var url in urls)
        {
            tasks.Add(ScrapeUrlAsync(url));
        }
        return new List<string>(await Task.WhenAll(tasks));
    }

    public void Dispose()
    {
        _httpClient?.Dispose();
        _semaphore?.Dispose();
    }
}
// Usage
class Program
{
    static async Task Main(string[] args)
    {
        using var scraper = new RateLimitedScraper(maxConcurrentRequests: 2, delayMs: 1500);
        var urls = new[]
        {
            "https://example.com/page1",
            "https://example.com/page2",
            "https://example.com/page3"
        };

        var results = await scraper.ScrapeMultipleUrlsAsync(urls);
        foreach (var content in results)
        {
            Console.WriteLine($"Content length: {content.Length}");
        }
    }
}
Method 2: Custom Token Bucket Rate Limiter
For more sophisticated rate limiting, implement a token bucket algorithm:
using System;
using System.Threading;
using System.Threading.Tasks;

public class TokenBucketRateLimiter : IDisposable
{
    private readonly int _maxTokens;
    private readonly TimeSpan _refillInterval;
    private readonly SemaphoreSlim _semaphore;
    private readonly Timer _refillTimer;
    private int _currentTokens;

    public TokenBucketRateLimiter(int maxTokens, TimeSpan refillInterval)
    {
        _maxTokens = maxTokens;
        _refillInterval = refillInterval;
        _currentTokens = maxTokens;
        _semaphore = new SemaphoreSlim(1, 1);
        _refillTimer = new Timer(RefillTokens, null, refillInterval, refillInterval);
    }

    public async Task<bool> TryConsumeTokenAsync()
    {
        await _semaphore.WaitAsync();
        try
        {
            if (_currentTokens > 0)
            {
                _currentTokens--;
                return true;
            }
            return false;
        }
        finally
        {
            _semaphore.Release();
        }
    }

    private async void RefillTokens(object state)
    {
        await _semaphore.WaitAsync();
        try
        {
            // Restore the bucket to full capacity so up to _maxTokens
            // requests are allowed per refill interval.
            _currentTokens = _maxTokens;
        }
        finally
        {
            _semaphore.Release();
        }
    }

    public void Dispose()
    {
        _refillTimer?.Dispose();
        _semaphore?.Dispose();
    }
}
// Usage with web scraping
public class TokenBucketScraper
{
    private readonly HttpClient _httpClient;
    private readonly TokenBucketRateLimiter _rateLimiter;

    public TokenBucketScraper(int tokensPerSecond)
    {
        _httpClient = new HttpClient();
        _rateLimiter = new TokenBucketRateLimiter(
            maxTokens: tokensPerSecond,
            refillInterval: TimeSpan.FromSeconds(1)
        );
    }

    public async Task<string> ScrapeWithRateLimitAsync(string url)
    {
        while (!await _rateLimiter.TryConsumeTokenAsync())
        {
            await Task.Delay(100); // Wait before trying again
        }

        var response = await _httpClient.GetAsync(url);
        response.EnsureSuccessStatusCode();
        return await response.Content.ReadAsStringAsync();
    }
}
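A short usage sketch for the class above (the URLs are placeholders):

// Allows up to 2 requests per second; ScrapeWithRateLimitAsync waits when the bucket is empty
var scraper = new TokenBucketScraper(tokensPerSecond: 2);
foreach (var url in new[] { "https://example.com/page1", "https://example.com/page2" })
{
    var html = await scraper.ScrapeWithRateLimitAsync(url);
    Console.WriteLine($"Content length: {html.Length}");
}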
Method 3: Using Polly for Advanced Rate Limiting
Install the Polly package for resilience patterns:
Install-Package Polly.Extensions.Http
using System;
using System.Net.Http;
using System.Threading.Tasks;
using Polly;
using Polly.CircuitBreaker;
using Polly.Extensions.Http;

public class PollyScraper
{
    private readonly HttpClient _httpClient;
    private readonly IAsyncPolicy<HttpResponseMessage> _combinedPolicy;

    public PollyScraper()
    {
        // Retry transient failures (5xx, 408, HttpRequestException) with exponential backoff
        var retryPolicy = HttpPolicyExtensions
            .HandleTransientHttpError()
            .Or<TaskCanceledException>()
            .WaitAndRetryAsync(
                retryCount: 3,
                sleepDurationProvider: retryAttempt =>
                    TimeSpan.FromSeconds(Math.Pow(2, retryAttempt)) // Exponential backoff
            );

        // Stop sending requests for 30 seconds after 3 consecutive transient failures
        var circuitBreakerPolicy = HttpPolicyExtensions
            .HandleTransientHttpError()
            .CircuitBreakerAsync(
                handledEventsAllowedBeforeBreaking: 3,
                durationOfBreak: TimeSpan.FromSeconds(30)
            );

        // Cancel any single attempt that runs longer than 10 seconds
        var timeoutPolicy = Policy.TimeoutAsync<HttpResponseMessage>(10);

        // Outermost to innermost: retry -> circuit breaker -> timeout
        _combinedPolicy = Policy.WrapAsync(retryPolicy, circuitBreakerPolicy, timeoutPolicy);

        _httpClient = new HttpClient();
        _httpClient.Timeout = TimeSpan.FromSeconds(30);
    }

    public async Task<string> ScrapeWithPollyAsync(string url)
    {
        try
        {
            var response = await _combinedPolicy.ExecuteAsync(async () =>
            {
                await Task.Delay(1000); // Rate limiting delay before each attempt
                return await _httpClient.GetAsync(url);
            });

            response.EnsureSuccessStatusCode();
            return await response.Content.ReadAsStringAsync();
        }
        catch (BrokenCircuitException)
        {
            Console.WriteLine("Circuit breaker is open. Requests are being blocked.");
            throw;
        }
    }
}
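Polly 7.2.3 and later also include a dedicated rate-limit policy in the Polly.RateLimit namespace. Assuming that version is available, a minimal sketch that paces requests to roughly two per second could look like this:

using System;
using System.Net.Http;
using System.Threading.Tasks;
using Polly;
using Polly.RateLimit;

// Allow 2 executions per second; calls beyond that throw RateLimitRejectedException
var rateLimitPolicy = Policy.RateLimitAsync(2, TimeSpan.FromSeconds(1));
var httpClient = new HttpClient();

async Task<string> FetchAsync(string url)
{
    while (true)
    {
        try
        {
            var response = await rateLimitPolicy.ExecuteAsync(() => httpClient.GetAsync(url));
            response.EnsureSuccessStatusCode();
            return await response.Content.ReadAsStringAsync();
        }
        catch (RateLimitRejectedException ex)
        {
            // Wait for the interval suggested by the policy before trying again
            await Task.Delay(ex.RetryAfter);
        }
    }
}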
Method 4: Adaptive Rate Limiting with Response Monitoring
Implement intelligent rate limiting that adjusts based on server responses:
using System;
using System.Net;
using System.Net.Http;
using System.Threading.Tasks;

public class AdaptiveRateLimitedScraper
{
    private readonly HttpClient _httpClient;
    private int _currentDelayMs = 1000;
    private readonly int _minDelayMs = 500;
    private readonly int _maxDelayMs = 10000;
    private readonly int _maxAttempts = 5;

    public AdaptiveRateLimitedScraper()
    {
        _httpClient = new HttpClient();
    }

    public async Task<string> ScrapeWithAdaptiveRateLimitAsync(string url)
    {
        for (var attempt = 1; attempt <= _maxAttempts; attempt++)
        {
            try
            {
                await Task.Delay(_currentDelayMs);
                var response = await _httpClient.GetAsync(url);

                if (response.IsSuccessStatusCode)
                {
                    // Success - gradually decrease delay
                    _currentDelayMs = Math.Max(_minDelayMs, _currentDelayMs - 100);
                    return await response.Content.ReadAsStringAsync();
                }
                else if (response.StatusCode == HttpStatusCode.TooManyRequests)
                {
                    // Rate limited - increase delay
                    _currentDelayMs = Math.Min(_maxDelayMs, _currentDelayMs * 2);

                    // Honor the Retry-After header if the server sent one
                    if (response.Headers.RetryAfter?.Delta.HasValue == true)
                    {
                        await Task.Delay(response.Headers.RetryAfter.Delta.Value);
                    }
                    continue; // Retry the request
                }
                else
                {
                    // Other errors (e.g. 404) throw here and are retried a bounded number of times
                    response.EnsureSuccessStatusCode();
                }
            }
            catch (HttpRequestException ex)
            {
                Console.WriteLine($"Request failed: {ex.Message}");
                _currentDelayMs = Math.Min(_maxDelayMs, _currentDelayMs * 2);
            }
        }

        throw new HttpRequestException($"Giving up on {url} after {_maxAttempts} attempts.");
    }
}
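A brief usage sketch (the URL is a placeholder):

var scraper = new AdaptiveRateLimitedScraper();
var html = await scraper.ScrapeWithAdaptiveRateLimitAsync("https://example.com/products");
Console.WriteLine($"Content length: {html.Length}");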
Best Practices for Rate Limiting
1. Randomized Delays
Add jitter to avoid predictable patterns:
private static readonly Random _random = new Random(); // On .NET 6+, Random.Shared is a thread-safe alternative

private async Task RandomDelayAsync(int baseDelayMs)
{
    var jitter = _random.Next(-200, 201); // ±200ms variation
    var delay = Math.Max(100, baseDelayMs + jitter);
    await Task.Delay(delay);
}
2. Respect robots.txt and Server Headers
public async Task<bool> CheckRobotsPermissionAsync(string baseUrl, string userAgent = "*")
{
    try
    {
        var robotsUrl = new Uri(new Uri(baseUrl), "/robots.txt").ToString();
        var robotsContent = await _httpClient.GetStringAsync(robotsUrl);

        // Parse robots.txt content (implement parsing logic)
        return ParseRobotsPermission(robotsContent, userAgent);
    }
    catch
    {
        return true; // Assume allowed if robots.txt is not accessible
    }
}
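ParseRobotsPermission is left unimplemented above. As an illustration only, a minimal sketch that honors blanket "Disallow: /" rules for the matching user agent (a production scraper should use a dedicated robots.txt parser) might look like this:

private bool ParseRobotsPermission(string robotsContent, string userAgent)
{
    var appliesToUs = false;
    foreach (var rawLine in robotsContent.Split('\n'))
    {
        var line = rawLine.Trim();
        if (line.StartsWith("User-agent:", StringComparison.OrdinalIgnoreCase))
        {
            var agent = line.Substring("User-agent:".Length).Trim();
            appliesToUs = agent == "*" || agent.Equals(userAgent, StringComparison.OrdinalIgnoreCase);
        }
        else if (appliesToUs && line.StartsWith("Disallow:", StringComparison.OrdinalIgnoreCase))
        {
            var path = line.Substring("Disallow:".Length).Trim();
            if (path == "/")
            {
                return false; // The whole site is off-limits for this user agent
            }
        }
    }
    return true; // No blanket disallow rule found
}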
3. Monitor Response Times
// Assumes the _currentDelayMs / _maxDelayMs fields from the adaptive scraper in Method 4
public async Task<string> ScrapeWithMonitoringAsync(string url)
{
    var stopwatch = System.Diagnostics.Stopwatch.StartNew();
    var response = await _httpClient.GetAsync(url);
    stopwatch.Stop();

    // Adjust delay based on response time
    if (stopwatch.ElapsedMilliseconds > 5000) // Slow response
    {
        _currentDelayMs = Math.Min(_maxDelayMs, _currentDelayMs * 2);
    }

    await Task.Delay(_currentDelayMs);
    return await response.Content.ReadAsStringAsync();
}
Key Considerations
- Start conservative: Begin with longer delays (2-3 seconds) and adjust based on server response
- Handle HTTP 429: Always implement proper handling for "Too Many Requests" responses (see the sketch after this list)
- Use connection pooling: Reuse HttpClient instances to avoid socket exhaustion
- Monitor server health: Watch for slow responses or error rates that indicate overloading
- Implement exponential backoff: Increase delays progressively when encountering errors
- Consider time-based limits: Some sites have hourly or daily request limits
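The sketch below pulls several of these points together: a single shared HttpClient, a conservative initial delay, explicit HTTP 429 handling that prefers the server's Retry-After hint, and exponential backoff. The class and method names are illustrative, not from any particular library:

using System;
using System.Net;
using System.Net.Http;
using System.Threading.Tasks;

public static class PoliteFetcher
{
    // Reuse one HttpClient for the whole process to avoid socket exhaustion
    private static readonly HttpClient _httpClient = new HttpClient();

    public static async Task<string> FetchWithBackoffAsync(string url, int maxAttempts = 5)
    {
        var delay = TimeSpan.FromSeconds(2); // Start conservative

        for (var attempt = 1; attempt <= maxAttempts; attempt++)
        {
            var response = await _httpClient.GetAsync(url);

            if (response.StatusCode == HttpStatusCode.TooManyRequests)
            {
                // Prefer the server's Retry-After hint; otherwise back off exponentially
                var wait = response.Headers.RetryAfter?.Delta ?? delay;
                await Task.Delay(wait);
                delay = TimeSpan.FromSeconds(Math.Min(delay.TotalSeconds * 2, 60));
                continue;
            }

            response.EnsureSuccessStatusCode();
            return await response.Content.ReadAsStringAsync();
        }

        throw new HttpRequestException($"Still rate limited on {url} after {maxAttempts} attempts.");
    }
}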
Remember that responsible web scraping protects both your application and the target servers, ensuring sustainable access to web data.