How can I implement HTTP request batching for efficiency?
HTTP request batching is a powerful optimization technique that combines multiple individual requests into a single HTTP request, significantly improving performance and reducing network overhead. This approach is particularly valuable in web scraping, API interactions, and data-intensive applications where you need to make numerous HTTP calls.
Understanding HTTP Request Batching
Request batching reduces the number of round trips between client and server by grouping multiple operations into a single HTTP request. Instead of making 10 separate API calls, you can batch them into one request, reducing network latency, connection overhead, and server processing time.
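To make the contrast concrete, here is a minimal sketch assuming a hypothetical API that exposes a /batch endpoint accepting an array of operations (the base URL, endpoint, and payload shape are illustrative, not part of any specific API):

// Hypothetical base URL and /batch endpoint, for illustration only
const API = 'https://api.example.com';

// Without batching: ten separate round trips
async function fetchIndividually(ids) {
  return Promise.all(
    ids.map(id => fetch(`${API}/users/${id}`).then(res => res.json()))
  );
}

// With batching: one round trip carrying all ten operations,
// assuming the server accepts { requests: [...] } and returns an array of results
async function fetchBatched(ids) {
  const response = await fetch(`${API}/batch`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      requests: ids.map(id => ({ method: 'GET', path: `/users/${id}` }))
    })
  });
  return response.json();
}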
Benefits of Request Batching
- Reduced Network Latency: Fewer round trips mean faster overall execution
- Lower Connection Overhead: Less TCP connection establishment and teardown
- Improved Server Efficiency: Servers can process multiple operations in one context
- Better Resource Utilization: More efficient use of bandwidth and connection pools
- Rate Limit Optimization: Fewer requests help stay within API rate limits
Implementation Strategies
1. Promise-Based Batching in JavaScript
Here's a JavaScript implementation that queues individual requests and dispatches each accumulated group concurrently:
class RequestBatcher {
  constructor(options = {}) {
    this.batchSize = options.batchSize || 5;
    this.delay = options.delay || 100;
    this.pendingRequests = [];
    this.timeoutId = null;
  }

  async batchRequest(url, options = {}) {
    return new Promise((resolve, reject) => {
      this.pendingRequests.push({
        url,
        options,
        resolve,
        reject
      });

      // Clear existing timeout and set a new one
      if (this.timeoutId) {
        clearTimeout(this.timeoutId);
      }

      // Process batch when size limit is reached or after delay
      if (this.pendingRequests.length >= this.batchSize) {
        this.processBatch();
      } else {
        this.timeoutId = setTimeout(() => {
          this.processBatch();
        }, this.delay);
      }
    });
  }

  async processBatch() {
    if (this.pendingRequests.length === 0) return;

    const batch = this.pendingRequests.splice(0);
    this.timeoutId = null;

    try {
      // Execute all requests concurrently
      const promises = batch.map(({ url, options }) =>
        fetch(url, options).then(response => response.json())
      );
      const results = await Promise.allSettled(promises);

      // Resolve individual promises
      results.forEach((result, index) => {
        const { resolve, reject } = batch[index];
        if (result.status === 'fulfilled') {
          resolve(result.value);
        } else {
          reject(result.reason);
        }
      });
    } catch (error) {
      // Reject all pending requests if batch fails
      batch.forEach(({ reject }) => reject(error));
    }
  }
}
// Usage example
const batcher = new RequestBatcher({ batchSize: 3, delay: 50 });

async function fetchMultipleUrls() {
  const urls = [
    'https://api.example.com/users/1',
    'https://api.example.com/users/2',
    'https://api.example.com/users/3',
    'https://api.example.com/posts/1',
    'https://api.example.com/posts/2'
  ];

  try {
    const results = await Promise.all(
      urls.map(url => batcher.batchRequest(url))
    );
    console.log('Batched results:', results);
  } catch (error) {
    console.error('Batch request failed:', error);
  }
}
2. Python Implementation with asyncio
Python's asyncio provides excellent support for request batching:
import asyncio
import aiohttp
import time
from typing import Any, Dict, Optional
from dataclasses import dataclass

@dataclass
class BatchRequest:
    url: str
    method: str = 'GET'
    headers: Optional[Dict[str, str]] = None
    data: Any = None
    params: Optional[Dict[str, str]] = None

class HTTPRequestBatcher:
    def __init__(self, max_batch_size: int = 10, batch_timeout: float = 0.1):
        self.max_batch_size = max_batch_size
        self.batch_timeout = batch_timeout
        self.pending_requests = []
        self.batch_task = None
        self.session = None

    async def __aenter__(self):
        self.session = aiohttp.ClientSession()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.session:
            await self.session.close()

    async def add_request(self, request: BatchRequest) -> Any:
        """Add a request to the batch and wait for its result"""
        future = asyncio.get_running_loop().create_future()
        self.pending_requests.append((request, future))

        # Start batch processing if not already running
        if self.batch_task is None or self.batch_task.done():
            self.batch_task = asyncio.create_task(self._process_batch_with_timeout())

        # Process immediately if batch is full
        if len(self.pending_requests) >= self.max_batch_size:
            if not self.batch_task.done():
                self.batch_task.cancel()
            await self._process_current_batch()

        return await future

    async def _process_batch_with_timeout(self):
        """Process batch after timeout or when full"""
        try:
            await asyncio.sleep(self.batch_timeout)
            await self._process_current_batch()
        except asyncio.CancelledError:
            pass

    async def _process_current_batch(self):
        """Process all pending requests in the current batch"""
        if not self.pending_requests:
            return

        batch = self.pending_requests.copy()
        self.pending_requests.clear()

        # Execute all requests concurrently
        tasks = []
        for request, future in batch:
            task = asyncio.create_task(self._execute_request(request))
            tasks.append((task, future))

        # Wait for all requests to complete
        for task, future in tasks:
            try:
                result = await task
                future.set_result(result)
            except Exception as e:
                future.set_exception(e)

    async def _execute_request(self, request: BatchRequest) -> Dict[str, Any]:
        """Execute a single HTTP request"""
        async with self.session.request(
            method=request.method,
            url=request.url,
            headers=request.headers,
            data=request.data,
            params=request.params
        ) as response:
            if response.content_type == 'application/json':
                body = await response.json()
            else:
                body = await response.text()
            return {
                'status': response.status,
                'headers': dict(response.headers),
                'data': body
            }
# Usage example
async def fetch_multiple_resources():
    urls = [
        'https://jsonplaceholder.typicode.com/posts/1',
        'https://jsonplaceholder.typicode.com/posts/2',
        'https://jsonplaceholder.typicode.com/users/1',
        'https://jsonplaceholder.typicode.com/users/2',
        'https://jsonplaceholder.typicode.com/comments/1'
    ]

    async with HTTPRequestBatcher(max_batch_size=3, batch_timeout=0.05) as batcher:
        # Create batch requests
        requests = [BatchRequest(url=url) for url in urls]

        # Execute all requests
        start_time = time.time()
        results = await asyncio.gather(*[
            batcher.add_request(req) for req in requests
        ])
        end_time = time.time()

        print(f"Processed {len(results)} requests in {end_time - start_time:.2f} seconds")
        for i, result in enumerate(results):
            print(f"Request {i+1}: Status {result['status']}")

# Run the example
asyncio.run(fetch_multiple_resources())
3. GraphQL-Style Batching
For APIs that support it, you can implement GraphQL-style batching:
class GraphQLBatcher {
  constructor(endpoint, options = {}) {
    this.endpoint = endpoint;
    this.batchSize = options.batchSize || 10;
    this.delay = options.delay || 50;
    this.pendingQueries = [];
    this.timeoutId = null;
  }

  async query(query, variables = {}) {
    return new Promise((resolve, reject) => {
      const id = Date.now() + Math.random();
      this.pendingQueries.push({
        id,
        query,
        variables,
        resolve,
        reject
      });
      this.scheduleExecution();
    });
  }

  scheduleExecution() {
    if (this.timeoutId) {
      clearTimeout(this.timeoutId);
    }
    if (this.pendingQueries.length >= this.batchSize) {
      this.executeBatch();
    } else {
      this.timeoutId = setTimeout(() => {
        this.executeBatch();
      }, this.delay);
    }
  }

  async executeBatch() {
    if (this.pendingQueries.length === 0) return;

    const batch = this.pendingQueries.splice(0);
    this.timeoutId = null;

    try {
      // Create batched query
      const batchedQuery = batch.map(({ id, query, variables }) => ({
        id,
        query,
        variables
      }));

      const response = await fetch(this.endpoint, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({ batch: batchedQuery })
      });
      const results = await response.json();

      // Resolve individual queries
      batch.forEach(({ id, resolve, reject }) => {
        const result = results.find(r => r.id === id);
        if (result && result.errors) {
          reject(new Error(result.errors[0].message));
        } else if (result) {
          resolve(result.data);
        } else {
          reject(new Error('Query result not found'));
        }
      });
    } catch (error) {
      batch.forEach(({ reject }) => reject(error));
    }
  }
}
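As a usage sketch (the endpoint URL and queries are placeholders, and the server is assumed to echo each query's id in its response, matching the { batch: [...] } payload shape used above):

const gqlBatcher = new GraphQLBatcher('https://api.example.com/graphql-batch', {
  batchSize: 5,
  delay: 25
});

async function loadUserAndPosts(userId) {
  // Both queries are queued and sent to the server in a single POST
  const [user, posts] = await Promise.all([
    gqlBatcher.query('query ($id: ID!) { user(id: $id) { name email } }', { id: userId }),
    gqlBatcher.query('query ($id: ID!) { posts(authorId: $id) { title } }', { id: userId })
  ]);
  console.log(user, posts);
}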
Advanced Batching Patterns
Connection Pooling Integration
import asyncio

import aiohttp
from aiohttp import TCPConnector

class AdvancedBatcher:
    # Note: instantiate this class from within a running event loop so the
    # ClientSession binds to that loop.
    def __init__(self, max_connections=100, max_batch_size=20):
        # Configure connection pooling
        connector = TCPConnector(
            limit=max_connections,
            limit_per_host=20,
            keepalive_timeout=30,
            enable_cleanup_closed=True
        )
        self.session = aiohttp.ClientSession(
            connector=connector,
            timeout=aiohttp.ClientTimeout(total=30)
        )
        self.max_batch_size = max_batch_size
        self.semaphore = asyncio.Semaphore(max_connections)

    async def batch_with_semaphore(self, urls):
        """Batch requests with connection limiting"""
        async def fetch_with_semaphore(url):
            async with self.semaphore:
                async with self.session.get(url) as response:
                    return await response.json()

        # Process in batches
        results = []
        for i in range(0, len(urls), self.max_batch_size):
            batch = urls[i:i + self.max_batch_size]
            batch_results = await asyncio.gather(*[
                fetch_with_semaphore(url) for url in batch
            ], return_exceptions=True)
            results.extend(batch_results)
        return results
Smart Retry Logic
class ResilientBatcher {
  constructor(options = {}) {
    this.maxRetries = options.maxRetries || 3;
    this.retryDelay = options.retryDelay || 1000;
    this.batchSize = options.batchSize || 5;
  }

  async batchWithRetry(requests) {
    let attempts = 0;
    let lastError;

    while (attempts < this.maxRetries) {
      try {
        return await this.executeBatch(requests);
      } catch (error) {
        lastError = error;
        attempts++;
        if (attempts < this.maxRetries) {
          const delay = this.retryDelay * Math.pow(2, attempts - 1);
          await new Promise(resolve => setTimeout(resolve, delay));
        }
      }
    }
    throw new Error(`Batch failed after ${this.maxRetries} attempts: ${lastError.message}`);
  }

  async executeBatch(requests) {
    const promises = requests.map(req =>
      fetch(req.url, req.options)
        .then(response => {
          if (!response.ok) {
            throw new Error(`HTTP ${response.status}: ${response.statusText}`);
          }
          return response.json();
        })
    );
    return await Promise.all(promises);
  }
}
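A usage sketch for the retrying batcher follows (the URLs are placeholders; each request object just needs the url and options fields that executeBatch expects):

const resilientBatcher = new ResilientBatcher({ maxRetries: 3, retryDelay: 500, batchSize: 5 });

const requests = [
  { url: 'https://api.example.com/items/1', options: { method: 'GET' } },
  { url: 'https://api.example.com/items/2', options: { method: 'GET' } }
];

// The whole batch is retried with exponential backoff if any request fails
resilientBatcher.batchWithRetry(requests)
  .then(results => console.log('Batch succeeded:', results))
  .catch(error => console.error('Batch gave up:', error.message));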
Performance Optimization Tips
1. Optimal Batch Sizing
The ideal batch size depends on several factors:
def calculate_optimal_batch_size(avg_response_time, avg_response_size, memory_limit):
    """Estimate an optimal batch size from basic system characteristics"""
    # Rule of thumb: the batch size should minimize total execution time
    # while staying within memory constraints
    optimal_size = min(
        int(memory_limit / avg_response_size),  # Memory constraint
        int(10000 / avg_response_time),         # Time constraint
        50                                      # Maximum reasonable batch size
    )
    return max(optimal_size, 2)  # Minimum batch size of 2
2. Dynamic Batch Adjustment
class AdaptiveBatcher {
  constructor() {
    this.batchSize = 5;
    this.performanceHistory = [];
    this.adjustmentInterval = 100; // requests
    this.requestCount = 0;
  }

  recordRequestTime(duration) {
    // Track per-request timings and periodically re-evaluate the batch size
    this.performanceHistory.push(duration);
    this.requestCount++;
    if (this.requestCount % this.adjustmentInterval === 0) {
      this.adjustBatchSize();
    }
  }

  adjustBatchSize() {
    if (this.performanceHistory.length < 10) return;

    const avgTime = this.performanceHistory.reduce((a, b) => a + b) / this.performanceHistory.length;
    const recentAvg = this.performanceHistory.slice(-5).reduce((a, b) => a + b) / 5;

    if (recentAvg > avgTime * 1.2) {
      // Performance degraded, reduce batch size
      this.batchSize = Math.max(2, this.batchSize - 1);
    } else if (recentAvg < avgTime * 0.8) {
      // Performance improved, increase batch size
      this.batchSize = Math.min(20, this.batchSize + 1);
    }
  }
}
Integration with Web Scraping
When implementing request batching for web scraping scenarios, you can combine it with browser automation techniques to maximize efficiency. For complex scenarios involving AJAX request handling, batching becomes even more critical.
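For example, a scraper that discovers a list of product IDs on a category page could queue the per-product JSON endpoints through the RequestBatcher defined earlier rather than fetching them one at a time (the shop URL and endpoint path below are placeholders):

// Reuse the RequestBatcher from the first example to coalesce the JSON
// endpoints a scraped page would otherwise hit one by one via AJAX
const scrapeBatcher = new RequestBatcher({ batchSize: 5, delay: 100 });

async function scrapeProductDetails(productIds) {
  const detailRequests = productIds.map(id =>
    scrapeBatcher.batchRequest(`https://shop.example.com/api/products/${id}`)
  );
  // allSettled keeps one failed product from aborting the whole scrape
  return Promise.allSettled(detailRequests);
}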
Error Handling and Monitoring
import logging
import time
from dataclasses import dataclass
from typing import List

@dataclass
class BatchMetrics:
    total_requests: int
    successful_requests: int
    failed_requests: int
    avg_response_time: float
    batch_size: int

class MonitoredBatcher:
    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.metrics = []

    async def execute_monitored_batch(self, requests: List[BatchRequest]) -> BatchMetrics:
        # process_batch is assumed to be provided by the underlying batching
        # implementation (for example, the HTTPRequestBatcher logic shown earlier).
        start_time = time.time()
        successful = 0
        failed = 0

        try:
            results = await self.process_batch(requests)
            successful = len([r for r in results if r.get('status', 0) < 400])
            failed = len(requests) - successful
        except Exception as e:
            self.logger.error(f"Batch execution failed: {e}")
            failed = len(requests)

        metrics = BatchMetrics(
            total_requests=len(requests),
            successful_requests=successful,
            failed_requests=failed,
            avg_response_time=(time.time() - start_time) / len(requests),
            batch_size=len(requests)
        )
        self.metrics.append(metrics)
        return metrics
Console Commands for Testing
Test your batching implementation with these commands:
# Test concurrent requests with curl (simulate batching)
for i in {1..10}; do
curl -s "https://api.example.com/data/$i" &
done
wait
# Monitor network connections during batching
netstat -an | grep :80 | wc -l
# Benchmark batch vs individual requests
time python batch_test.py --batch-size=10
time python individual_test.py --requests=10
Conclusion
HTTP request batching is essential for building efficient, scalable applications. By implementing proper batching strategies, you can significantly reduce network overhead, improve response times, and better utilize system resources. The key is finding the right balance between batch size, timing, and error handling for your specific use case.
Remember to monitor your batching performance and adjust parameters based on real-world usage patterns. Consider factors like API rate limits, network conditions, and server capabilities when designing your batching strategy.