How do I handle connection pooling in Requests?
Connection pooling is a critical optimization technique when using the Python Requests library for web scraping or API interactions. It allows you to reuse TCP connections instead of establishing new ones for each request, significantly improving performance and reducing server load.
Understanding Connection Pooling in Requests
By default, the Requests library already implements basic connection pooling through HTTP adapters. When you use a Session object, it automatically reuses connections to the same host. However, you can fine-tune this behavior for better performance.
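You can observe this reuse directly by turning on urllib3's debug logging. A minimal check, using httpbin.org purely as an illustrative endpoint: the loop below logs "Starting new HTTPS connection" only once, whereas three separate requests.get() calls would log it three times.

import logging
import requests

# urllib3 logs "Starting new HTTPS connection" whenever a fresh socket is opened
logging.basicConfig(level=logging.DEBUG)

session = requests.Session()
for _ in range(3):
    session.get('https://httpbin.org/get')
session.close()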
Basic Connection Pooling with Sessions
The simplest way to enable connection pooling is by using a Session object:
import requests

# Create a session object for connection pooling
session = requests.Session()

# Make multiple requests to the same host
for i in range(10):
    response = session.get('https://api.example.com/data')
    print(f"Request {i}: {response.status_code}")

# Close the session when done
session.close()
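Session objects also work as context managers, which closes the session (and its pooled connections) automatically. The same example, with api.example.com still standing in for a real host:

import requests

with requests.Session() as session:
    for i in range(10):
        response = session.get('https://api.example.com/data')
        print(f"Request {i}: {response.status_code}")
# The session is closed automatically when the with-block exits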
Custom Connection Pool Configuration
For more control over connection pooling, you can configure custom HTTP adapters:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

class PooledSession:
    def __init__(self, pool_connections=10, pool_maxsize=20, max_retries=3):
        """
        Initialize a session with custom connection pooling.

        Args:
            pool_connections: Number of connection pools (one per host) to cache
            pool_maxsize: Maximum number of connections kept in each pool
            max_retries: Number of retry attempts
        """
        self.session = requests.Session()

        # Configure retry strategy
        # ('allowed_methods' replaces the deprecated 'method_whitelist'
        # in urllib3 >= 1.26)
        retry_strategy = Retry(
            total=max_retries,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"],
            backoff_factor=1
        )

        # Create HTTP adapter with custom pool settings
        adapter = HTTPAdapter(
            pool_connections=pool_connections,
            pool_maxsize=pool_maxsize,
            max_retries=retry_strategy
        )

        # Mount adapter for both HTTP and HTTPS
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)

    def get(self, url, **kwargs):
        return self.session.get(url, **kwargs)

    def post(self, url, **kwargs):
        return self.session.post(url, **kwargs)

    def close(self):
        self.session.close()

# Usage example
pooled_session = PooledSession(pool_connections=5, pool_maxsize=15)

try:
    # Make multiple requests efficiently
    urls = [
        'https://api.example.com/users/1',
        'https://api.example.com/users/2',
        'https://api.example.com/posts/1',
        'https://api.example.com/posts/2'
    ]

    for url in urls:
        response = pooled_session.get(url)
        print(f"Status: {response.status_code}, URL: {url}")
finally:
    pooled_session.close()
Advanced Pool Configuration
For high-performance web scraping scenarios, you might need more sophisticated pool management:
import requests
from requests.adapters import HTTPAdapter

class AdvancedPooledSession:
    def __init__(self, pool_connections=20, pool_maxsize=50,
                 max_retries=3, timeout=30):
        """
        Advanced session with optimized connection pooling.
        """
        self.session = requests.Session()
        self.timeout = timeout

        # Custom adapter: block when the pool is full instead of opening
        # (and then discarding) extra connections. Pool sizes are already
        # forwarded by HTTPAdapter, so only 'block' needs to be overridden.
        class CustomHTTPAdapter(HTTPAdapter):
            def init_poolmanager(self, *args, **kwargs):
                kwargs['block'] = True
                return super().init_poolmanager(*args, **kwargs)

        adapter = CustomHTTPAdapter(
            pool_connections=pool_connections,
            pool_maxsize=pool_maxsize,
            max_retries=max_retries
        )

        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)

        # Set common headers to avoid connection resets
        self.session.headers.update({
            'User-Agent': 'Python-Requests-Pool/1.0',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate'
        })

    def request(self, method, url, **kwargs):
        """Make a request with a default timeout"""
        kwargs.setdefault('timeout', self.timeout)
        return self.session.request(method, url, **kwargs)

    def get(self, url, **kwargs):
        return self.request('GET', url, **kwargs)

    def post(self, url, **kwargs):
        return self.request('POST', url, **kwargs)

    def close(self):
        self.session.close()

# Example: Concurrent requests with connection pooling
import concurrent.futures

def fetch_url(session, url):
    """Fetch a single URL using the pooled session"""
    try:
        response = session.get(url)
        return {
            'url': url,
            'status_code': response.status_code,
            'response_time': response.elapsed.total_seconds()
        }
    except requests.RequestException as e:
        return {'url': url, 'error': str(e)}

# Initialize advanced pooled session
advanced_session = AdvancedPooledSession(
    pool_connections=10,
    pool_maxsize=25,
    timeout=15
)

# URLs to scrape
urls = [f'https://httpbin.org/delay/{i % 3}' for i in range(20)]

try:
    # Use ThreadPoolExecutor for concurrent requests
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(fetch_url, advanced_session, url) for url in urls]

        for future in concurrent.futures.as_completed(futures):
            result = future.result()
            if 'error' in result:
                print(f"Error fetching {result['url']}: {result['error']}")
            else:
                print(f"Success: {result['url']} - {result['status_code']} "
                      f"({result['response_time']:.2f}s)")
finally:
    advanced_session.close()
Monitoring Connection Pool Usage
You can monitor your connection pool usage to optimize performance:
import requests
from requests.adapters import HTTPAdapter

class MonitoredSession:
    def __init__(self):
        self.session = requests.Session()
        self.adapter = HTTPAdapter(pool_connections=5, pool_maxsize=10)
        self.session.mount("http://", self.adapter)
        self.session.mount("https://", self.adapter)

    def get_pool_stats(self):
        """Get connection pool statistics"""
        stats = {}
        for prefix, adapter in self.session.adapters.items():
            if hasattr(adapter, 'poolmanager'):
                pm = adapter.poolmanager
                stats[prefix] = {
                    'num_pools': getattr(pm, 'num_pools', 'N/A'),
                    'pools': len(pm.pools) if hasattr(pm, 'pools') else 'N/A'
                }
        return stats

    def request(self, method, url, **kwargs):
        response = self.session.request(method, url, **kwargs)
        # Print pool stats after each request
        stats = self.get_pool_stats()
        print(f"Pool stats after {method} {url}: {stats}")
        return response

# Usage
monitored_session = MonitoredSession()
response = monitored_session.request('GET', 'https://httpbin.org/json')
Best Practices for Connection Pooling
1. Always Use Sessions for Multiple Requests
# Good: Uses connection pooling
session = requests.Session()
for url in urls:
    response = session.get(url)

# Bad: Creates a new connection each time
for url in urls:
    response = requests.get(url)  # No pooling
2. Configure Appropriate Pool Sizes
# For high-throughput applications
adapter = HTTPAdapter(
    pool_connections=20,  # Number of hosts to cache connections for
    pool_maxsize=50       # Max connections per host
)
3. Set Connection Timeouts
session = requests.Session()
session.get(url, timeout=(5, 30)) # (connect_timeout, read_timeout)
4. Handle Pool Exhaustion
import time

try:
    response = session.get(url, timeout=30)
except requests.exceptions.ConnectionError as e:
    if "pool" in str(e).lower():
        print("Connection pool may be exhausted; consider increasing pool_maxsize")
        time.sleep(1)  # Brief delay before retry
    raise
Integration with Web Scraping Workflows
When building web scrapers, connection pooling becomes essential for performance. Consider implementing session management much like you would manage browser sessions in Puppeteer, but for plain HTTP requests:
import concurrent.futures
import requests
from requests.adapters import HTTPAdapter

class WebScrapingSession:
    def __init__(self, base_url, max_workers=5):
        self.base_url = base_url
        self.max_workers = max_workers
        self.session = self._create_session()

    def _create_session(self):
        session = requests.Session()
        adapter = HTTPAdapter(
            pool_connections=self.max_workers,
            pool_maxsize=self.max_workers * 2
        )
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        return session

    def scrape_pages(self, page_urls):
        """Scrape multiple pages efficiently"""
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [executor.submit(self._scrape_page, url) for url in page_urls]
            return [future.result() for future in concurrent.futures.as_completed(futures)]

    def _scrape_page(self, url):
        full_url = f"{self.base_url.rstrip('/')}/{url.lstrip('/')}"
        response = self.session.get(full_url)
        return {
            'url': full_url,
            'status': response.status_code,
            'content_length': len(response.content)
        }

    def close(self):
        self.session.close()
Common Issues and Solutions
Pool Exhaustion
If you see "Connection pool is full, discarding connection" warnings, increase your pool_maxsize or implement connection throttling.
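A minimal sketch of both fixes, assuming the defaults are simply too small for your workload: raise pool_maxsize, and/or pass pool_block=True so that threads wait for a free connection instead of opening extra ones that get discarded.

import requests
from requests.adapters import HTTPAdapter

session = requests.Session()
adapter = HTTPAdapter(
    pool_connections=10,  # per-host pools to cache
    pool_maxsize=50,      # connections kept alive per host
    pool_block=True       # wait for a free connection rather than opening extras
)
session.mount("http://", adapter)
session.mount("https://", adapter)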
Memory Leaks
Always close sessions when you are done so that pooled connections and their resources are released:
try:
    # Your requests here
    pass
finally:
    session.close()
SSL Performance
For HTTPS-heavy workloads, keep pooled connections alive longer so completed TLS handshakes are reused instead of being renegotiated for every request.
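A simple way to get that reuse is to share one long-lived session across your HTTPS calls instead of creating sessions ad hoc. The module-level helper below is just an illustrative pattern, not a required API:

import requests

# One long-lived session per process: each pooled connection pays the TLS
# handshake once and is then reused for subsequent HTTPS requests.
_session = requests.Session()

def fetch(url, **kwargs):
    kwargs.setdefault('timeout', (5, 30))  # (connect, read) timeouts
    return _session.get(url, **kwargs)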
Connection pooling in Requests is essential for building efficient, scalable web scraping applications. By properly configuring pool sizes, implementing retry strategies, and monitoring pool usage, you can significantly improve the performance of your HTTP-based applications while being respectful to target servers.