What are HTTP ETags and how can I use them for caching?
HTTP ETags (entity tags) are an HTTP caching mechanism that enables efficient validation of cached resources. An ETag serves as a unique identifier for a specific version of a resource, allowing clients and servers to determine whether a cached copy is still valid without transferring the entire resource again.
Understanding HTTP ETags
An ETag is a unique string generated by the server that represents a specific version of a resource. When the resource changes, the server generates a new ETag. This mechanism allows for efficient cache validation through conditional requests.
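Servers are free to compute ETags however they like; hashing the response body is one common approach. A minimal sketch of that idea in Python (the generate_etag helper and hash choice are illustrative, not mandated by the spec):
import hashlib

def generate_etag(body: bytes) -> str:
    """Derive a strong ETag from the response body."""
    # Any stable fingerprint works; a content hash guarantees the ETag
    # changes whenever the bytes change.
    return '"' + hashlib.sha256(body).hexdigest() + '"'

print(generate_etag(b'{"message": "hello"}'))
# A modified body yields a different ETag, signalling a new version
print(generate_etag(b'{"message": "hello!"}'))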
Types of ETags
Strong ETags: Guarantee that two resources with identical ETags are byte-for-byte identical.
ETag: "33a64df551425fcc55e4d42a148795d9f25f89d4"
Weak ETags: Indicate semantic equivalence but not necessarily byte-for-byte identity; they are prefixed with W/.
ETag: W/"0815"
How ETags Work in HTTP Caching
The ETag caching workflow follows these steps:
- Initial Request: Client requests a resource
- Server Response: Server returns the resource with an ETag header
- Client Storage: Client caches both the resource and its ETag
- Subsequent Request: Client sends a conditional request with an If-None-Match header
- Server Validation: Server compares the ETag with the current resource version (a minimal server-side sketch of this step follows the list)
- Response: Server returns either 304 Not Modified or the updated resource with a new ETag
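On the server side, validation boils down to comparing the incoming If-None-Match value with the resource's current ETag. A minimal Flask-style sketch of that step (the framework choice, route, and payload are assumptions for illustration):
import hashlib
import json
from flask import Flask, Response, request

app = Flask(__name__)
DATA = {"message": "hello"}  # stand-in for whatever resource you serve

@app.route("/data")
def data():
    body = json.dumps(DATA)
    etag = '"' + hashlib.sha256(body.encode()).hexdigest() + '"'
    # If the client's cached version is still current, skip the body
    if request.headers.get("If-None-Match") == etag:
        return Response(status=304, headers={"ETag": etag})
    return Response(body, status=200,
                    headers={"ETag": etag, "Content-Type": "application/json"})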
Implementing ETag Caching in Python
Using the requests library
import requests
from typing import Optional, Dict, Any
class ETagCache:
    def __init__(self):
        self.cache: Dict[str, Dict[str, Any]] = {}

    def get_with_etag(self, url: str) -> requests.Response:
        """Fetch URL with ETag caching support"""
        headers = {}
        # Check if we have cached ETag for this URL
        if url in self.cache:
            cached_etag = self.cache[url].get('etag')
            if cached_etag:
                headers['If-None-Match'] = cached_etag
        response = requests.get(url, headers=headers)
        if response.status_code == 304:
            # Resource not modified, return cached content
            print(f"Cache hit for {url}")
            cached_response = self.cache[url]['response']
            return cached_response
        elif response.status_code == 200:
            # Resource updated or first request
            etag = response.headers.get('ETag')
            if etag:
                self.cache[url] = {
                    'etag': etag,
                    'response': response
                }
            return response
        return response
# Usage example
cache = ETagCache()
# First request - downloads full content
response1 = cache.get_with_etag('https://api.example.com/data')
print(f"Status: {response1.status_code}")
print(f"Content length: {len(response1.content)}")
# Second request - uses ETag validation
response2 = cache.get_with_etag('https://api.example.com/data')
print(f"Status: {response2.status_code}")
Advanced Python Implementation with File-based Persistence
import requests
import json
import hashlib
from datetime import datetime
from pathlib import Path

class PersistentETagCache:
    def __init__(self, cache_dir: str = "./etag_cache"):
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)
        self.metadata_file = self.cache_dir / "metadata.json"
        self.metadata = self._load_metadata()

    def _load_metadata(self) -> dict:
        """Load cache metadata from file"""
        if self.metadata_file.exists():
            with open(self.metadata_file, 'r') as f:
                return json.load(f)
        return {}

    def _save_metadata(self):
        """Save cache metadata to file"""
        with open(self.metadata_file, 'w') as f:
            json.dump(self.metadata, f, indent=2)

    def _get_cache_key(self, url: str) -> str:
        """Generate cache key from URL"""
        return hashlib.md5(url.encode()).hexdigest()

    def get_with_etag(self, url: str, **kwargs) -> requests.Response:
        """Fetch URL with persistent ETag caching"""
        cache_key = self._get_cache_key(url)
        cache_file = self.cache_dir / f"{cache_key}.cache"
        headers = kwargs.get('headers', {})
        # Add If-None-Match header if we have cached ETag
        if cache_key in self.metadata:
            cached_etag = self.metadata[cache_key].get('etag')
            if cached_etag:
                headers['If-None-Match'] = cached_etag
        kwargs['headers'] = headers
        response = requests.get(url, **kwargs)
        if response.status_code == 304 and cache_file.exists():
            # Load cached response
            with open(cache_file, 'rb') as f:
                cached_content = f.read()
            # Create mock response with cached content
            cached_response = requests.Response()
            cached_response._content = cached_content
            cached_response.status_code = 200
            cached_response.headers.update(self.metadata[cache_key].get('headers', {}))
            print(f"ETag cache hit for {url}")
            return cached_response
        elif response.status_code == 200:
            # Cache the response
            etag = response.headers.get('ETag')
            if etag:
                # Save response content
                with open(cache_file, 'wb') as f:
                    f.write(response.content)
                # Save metadata
                self.metadata[cache_key] = {
                    'url': url,
                    'etag': etag,
                    'headers': dict(response.headers),
                    'cached_at': datetime.now().isoformat()
                }
                self._save_metadata()
                print(f"Cached response for {url} with ETag: {etag}")
        return response
# Usage with persistent cache
persistent_cache = PersistentETagCache()
response = persistent_cache.get_with_etag('https://api.github.com/repos/microsoft/playwright')
Implementing ETag Caching in JavaScript
Using the fetch API with ETag support
class ETagCache {
  constructor() {
    this.cache = new Map();
  }

  async fetchWithETag(url, options = {}) {
    const cacheKey = url;
    const cachedData = this.cache.get(cacheKey);
    // Prepare headers
    const headers = new Headers(options.headers || {});
    // Add If-None-Match header if we have cached ETag
    if (cachedData && cachedData.etag) {
      headers.set('If-None-Match', cachedData.etag);
    }
    try {
      const response = await fetch(url, {
        ...options,
        headers
      });
      if (response.status === 304) {
        // Resource not modified, return cached data
        console.log(`ETag cache hit for ${url}`);
        return {
          ok: true,
          status: 200,
          data: cachedData.data,
          fromCache: true
        };
      }
      if (response.ok) {
        const etag = response.headers.get('ETag');
        const data = await response.json();
        // Cache the response
        if (etag) {
          this.cache.set(cacheKey, {
            etag,
            data,
            cachedAt: new Date().toISOString()
          });
          console.log(`Cached response for ${url} with ETag: ${etag}`);
        }
        return {
          ok: true,
          status: response.status,
          data,
          fromCache: false
        };
      }
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    } catch (error) {
      console.error('ETag fetch error:', error);
      throw error;
    }
  }

  clearCache() {
    this.cache.clear();
  }

  getCacheStats() {
    return {
      size: this.cache.size,
      entries: Array.from(this.cache.keys())
    };
  }
}
// Usage example
const cache = new ETagCache();
async function fetchUserData() {
  try {
    const result = await cache.fetchWithETag('https://api.github.com/user', {
      headers: {
        'Authorization': 'token YOUR_TOKEN'
      }
    });
    console.log('User data:', result.data);
    console.log('From cache:', result.fromCache);
  } catch (error) {
    console.error('Error fetching user data:', error);
  }
}
// First call - downloads data
await fetchUserData();
// Second call - uses ETag validation
await fetchUserData();
Node.js Implementation with File System Caching
const fs = require('fs').promises;
const path = require('path');
const crypto = require('crypto');
const fetch = require('node-fetch');
class NodeETagCache {
  constructor(cacheDir = './etag_cache') {
    this.cacheDir = cacheDir;
    this.metadataFile = path.join(cacheDir, 'metadata.json');
    this.ensureCacheDir();
  }

  async ensureCacheDir() {
    try {
      await fs.mkdir(this.cacheDir, { recursive: true });
    } catch (error) {
      if (error.code !== 'EEXIST') throw error;
    }
  }

  async loadMetadata() {
    try {
      const data = await fs.readFile(this.metadataFile, 'utf8');
      return JSON.parse(data);
    } catch (error) {
      return {};
    }
  }

  async saveMetadata(metadata) {
    await fs.writeFile(this.metadataFile, JSON.stringify(metadata, null, 2));
  }

  getCacheKey(url) {
    return crypto.createHash('md5').update(url).digest('hex');
  }

  async fetchWithETag(url, options = {}) {
    const cacheKey = this.getCacheKey(url);
    const cacheFile = path.join(this.cacheDir, `${cacheKey}.cache`);
    const metadata = await this.loadMetadata();
    // Prepare headers
    const headers = { ...options.headers };
    // Only send If-None-Match when the cached body is still on disk;
    // otherwise a 304 response would leave us with nothing to return
    if (metadata[cacheKey] && metadata[cacheKey].etag) {
      try {
        await fs.access(cacheFile);
        headers['If-None-Match'] = metadata[cacheKey].etag;
      } catch (error) {
        console.warn('Cache file not found, making fresh request');
      }
    }
    const response = await fetch(url, { ...options, headers });
    if (response.status === 304) {
      // Load cached content
      const cachedContent = await fs.readFile(cacheFile, 'utf8');
      console.log(`ETag cache hit for ${url}`);
      return {
        ok: true,
        status: 200,
        data: JSON.parse(cachedContent),
        fromCache: true
      };
    }
    if (response.ok) {
      const etag = response.headers.get('etag');
      const data = await response.json();
      if (etag) {
        // Save to cache
        await fs.writeFile(cacheFile, JSON.stringify(data, null, 2));
        // Update metadata
        metadata[cacheKey] = {
          url,
          etag,
          cachedAt: new Date().toISOString()
        };
        await this.saveMetadata(metadata);
        console.log(`Cached response for ${url} with ETag: ${etag}`);
      }
      return {
        ok: true,
        status: response.status,
        data,
        fromCache: false
      };
    }
    throw new Error(`HTTP ${response.status}: ${response.statusText}`);
  }
}
// Usage
const cache = new NodeETagCache();
async function main() {
  try {
    const result = await cache.fetchWithETag('https://api.github.com/repos/microsoft/playwright');
    console.log('Repository data:', result.data.name);
    console.log('From cache:', result.fromCache);
  } catch (error) {
    console.error('Error:', error);
  }
}
main();
Testing ETag Implementation
Using curl to test ETag behavior
# Initial request - get ETag
curl -I https://api.github.com/repos/microsoft/playwright
# Response includes:
# ETag: "abcd1234567890"
# Conditional request using ETag
curl -H 'If-None-Match: "abcd1234567890"' \
-I https://api.github.com/repos/microsoft/playwright
# If unchanged, returns 304 Not Modified
Validating ETag cache implementation
def test_etag_cache():
    """Test ETag caching functionality"""
    cache = ETagCache()
    test_url = 'https://httpbin.org/etag/test-etag'
    # First request
    response1 = cache.get_with_etag(test_url)
    print(f"First request - Status: {response1.status_code}")
    # Second request (should use cache)
    response2 = cache.get_with_etag(test_url)
    print(f"Second request - Status: {response2.status_code}")
    # Verify cache behavior
    assert response1.status_code == 200
    assert response2.status_code in [200, 304]

test_etag_cache()
Best Practices for ETag Caching
1. Handle ETag Edge Cases
def robust_etag_request(url, session=None):
    """Handle various ETag edge cases"""
    if session is None:
        session = requests.Session()
    try:
        response = session.get(url)
        # Check for ETag presence
        etag = response.headers.get('ETag')
        if not etag:
            print(f"Warning: No ETag header for {url}")
            return response
        # Handle weak ETags
        if etag.startswith('W/'):
            print(f"Weak ETag detected: {etag}")
        # Validate ETag format
        if not (etag.startswith('"') and etag.endswith('"')) and not etag.startswith('W/'):
            print(f"Warning: Unusual ETag format: {etag}")
        return response
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        raise
2. Combine ETags with Other Caching Headers
def comprehensive_cache_headers(response):
    """Extract all relevant caching information"""
    cache_info = {
        'etag': response.headers.get('ETag'),
        'last_modified': response.headers.get('Last-Modified'),
        'cache_control': response.headers.get('Cache-Control'),
        'expires': response.headers.get('Expires'),
        'max_age': None
    }
    # Parse Cache-Control for max-age
    cache_control = cache_info['cache_control']
    if cache_control:
        for directive in cache_control.split(','):
            directive = directive.strip()
            if directive.startswith('max-age='):
                cache_info['max_age'] = int(directive.split('=')[1])
    return cache_info
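A conditional request can also carry both validators at once and let the server use whichever it supports. A small sketch of that idea (the conditional_get helper and the shape of the cached dict are illustrative):
import requests

def conditional_get(url, cached):
    """Revalidate using both ETag and Last-Modified when available."""
    headers = {}
    if cached.get('etag'):
        headers['If-None-Match'] = cached['etag']
    if cached.get('last_modified'):
        headers['If-Modified-Since'] = cached['last_modified']
    response = requests.get(url, headers=headers)
    # 304 means the cached copy is still valid; 200 carries a fresh body
    return response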
3. Monitor Cache Performance
class CacheMetrics:
    def __init__(self):
        self.hits = 0
        self.misses = 0
        self.total_requests = 0
        self.bandwidth_saved = 0

    def record_hit(self, content_size=0):
        self.hits += 1
        self.total_requests += 1
        self.bandwidth_saved += content_size

    def record_miss(self):
        self.misses += 1
        self.total_requests += 1

    def get_stats(self):
        hit_rate = (self.hits / self.total_requests * 100) if self.total_requests > 0 else 0
        return {
            'hit_rate': f"{hit_rate:.2f}%",
            'total_requests': self.total_requests,
            'hits': self.hits,
            'misses': self.misses,
            'bandwidth_saved_kb': self.bandwidth_saved / 1024
        }
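A brief sketch of wiring these metrics into a conditional-request loop; the tracked_get helper, in-memory store shape, and example URL are hypothetical:
import requests

metrics = CacheMetrics()
store = {}  # url -> {'etag': ..., 'content': ...}

def tracked_get(url):
    headers = {}
    if url in store:
        headers['If-None-Match'] = store[url]['etag']
    response = requests.get(url, headers=headers)
    if response.status_code == 304:
        # Count the bytes we did not have to download again
        metrics.record_hit(len(store[url]['content']))
        return store[url]['content']
    metrics.record_miss()
    if response.status_code == 200 and response.headers.get('ETag'):
        store[url] = {'etag': response.headers['ETag'], 'content': response.content}
    return response.content

tracked_get('https://api.example.com/data')
tracked_get('https://api.example.com/data')
print(metrics.get_stats())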
ETags in Web Scraping Scenarios
When implementing web scraping solutions, ETags are particularly valuable for monitoring changes in target resources: a conditional request reveals whether a page or API response has changed without re-downloading it, which keeps bandwidth low even when browser automation is used to handle AJAX-driven content.
For applications that maintain long-running scraping sessions, ETag validation keeps collected data current without unnecessary re-fetching of unchanged resources.
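For example, a change monitor can poll a URL on an interval and only act when the ETag changes. A rough sketch (the watch_for_changes helper, URL handling, and interval are assumptions):
import time
import requests

def watch_for_changes(url, interval=300):
    """Poll a URL and report only when its ETag changes."""
    last_etag = None
    while True:
        headers = {'If-None-Match': last_etag} if last_etag else {}
        response = requests.get(url, headers=headers)
        if response.status_code == 304:
            print(f"{url} unchanged")
        elif response.status_code == 200:
            etag = response.headers.get('ETag')
            if etag and etag != last_etag:
                print(f"{url} changed, new ETag: {etag}")
                last_etag = etag
                # process response.content here
        time.sleep(interval)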
Common Pitfalls and Solutions
1. ETag Mismatch Issues
def handle_etag_mismatch(current_etag, cached_etag):
    """Handle cases where ETags don't match expectations"""
    if current_etag != cached_etag:
        print(f"ETag mismatch - Current: {current_etag}, Cached: {cached_etag}")
        # Clear cache entry and refetch
        return True  # Indicates cache invalidation needed
    return False
2. Server-Side ETag Inconsistencies
Some servers generate ETags inconsistently. Handle this by implementing fallback mechanisms:
def fetch_with_fallback(url, max_retries=3):
    """Implement fallback for unreliable ETag servers"""
    for attempt in range(max_retries):
        try:
            response = requests.get(url)
            etag = response.headers.get('ETag')
            if etag and len(etag) > 2:  # Basic ETag validation
                return response
            else:
                print(f"Invalid ETag on attempt {attempt + 1}")
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
    raise Exception(f"Failed to get valid response after {max_retries} attempts")
HTTP ETags provide an efficient mechanism for cache validation that significantly reduces bandwidth usage and improves application performance. By implementing proper ETag handling in your web scraping and API integration workflows, you can build more efficient and respectful applications that minimize server load while maintaining data freshness.
The key to successful ETag implementation lies in understanding the conditional request flow, handling edge cases gracefully, and monitoring cache performance to ensure optimal results. Whether you're building a simple API client or a complex web scraping system, ETags offer a standardized approach to intelligent caching that benefits both client and server applications.