How do you handle API versioning when scraping third-party services?
API versioning is a critical consideration when scraping third-party services, as APIs evolve over time with changes to endpoints, response formats, and authentication methods. Proper version handling ensures your scraping applications remain robust and maintainable as target services update their APIs.
Understanding API Versioning Strategies
Third-party services typically implement versioning through several common approaches:
URL Path Versioning
The most common approach embeds the version directly in the URL path:
# Version in URL path
https://api.example.com/v1/users
https://api.example.com/v2/users
https://api.example.com/v3/users
Header-Based Versioning
Some APIs use custom headers to specify versions:
# Header-based versioning
curl -H "API-Version: 2.1" https://api.example.com/users
curl -H "Accept: application/vnd.api+json;version=2" https://api.example.com/users
Query Parameter Versioning
The version is specified as a query parameter:
# Query parameter versioning
https://api.example.com/users?version=2.1
https://api.example.com/users?api_version=v2
Implementing Version Detection and Management
Python Implementation
Here's a robust Python approach for handling API versioning:
import requests
import json
from typing import Dict, Optional, List
from dataclasses import dataclass
from enum import Enum
class VersionStrategy(Enum):
    """Where a third-party API expects the version identifier to appear."""

    PATH = "path"      # version is a URL path segment, e.g. /v2/users
    HEADER = "header"  # version travels in a request header
    QUERY = "query"    # version travels as a query-string parameter
@dataclass
class APIVersion:
    """Configuration for one version of a third-party API."""

    # Identifier sent to the service, e.g. "v3".
    version: str
    # How the identifier is transmitted (path segment, header or query param).
    strategy: VersionStrategy
    # Root URL that request endpoints are appended to.
    base_url: str
    # Headers merged into HEADER-strategy requests. When left as None the
    # client sends an "API-Version" header carrying `version` instead.
    headers: Optional[Dict[str, str]] = None
    # Deprecated versions are skipped when the client looks for a fallback.
    is_deprecated: bool = False
    # Informational only; nothing in the client reads it.
    # NOTE(review): looks like an ISO "YYYY-MM-DD" date string - confirm.
    sunset_date: Optional[str] = None
class VersionedAPIClient:
    """HTTP client that applies a configured API version to every request.

    Versions are registered with :meth:`add_version`; the first one
    registered becomes ``current_version``. When a request fails with a
    ``requests`` transport error, every other non-deprecated version is tried
    once before the original error is re-raised.
    """

    # Seconds to wait for a response when the caller passes no ``timeout``;
    # ``requests`` applies no timeout of its own.
    DEFAULT_TIMEOUT = 10

    # Endpoints probed, in order, by detect_available_versions().
    VERSION_ENDPOINTS = ("/versions", "/api/versions", "/v1/versions", "/")

    def __init__(self, base_url: str):
        """Create a client with no versions configured.

        Args:
            base_url: Root URL used for version discovery. Request URLs are
                built from each APIVersion's own ``base_url``.
        """
        self.base_url = base_url
        self.supported_versions = []
        self.current_version = None
        self.session = requests.Session()

    def add_version(self, version: APIVersion):
        """Register a supported version; the first one added becomes current."""
        self.supported_versions.append(version)
        if self.current_version is None:
            self.current_version = version

    def detect_available_versions(self) -> List[str]:
        """Probe well-known endpoints and return the advertised versions.

        Returns:
            The list found under the ``"versions"`` or ``"api_versions"`` key
            of the first endpoint that answers 200 with a JSON object
            containing a non-empty list there; ``[]`` if none does.
        """
        base = self.base_url.rstrip("/")
        for endpoint in self.VERSION_ENDPOINTS:
            try:
                response = self.session.get(
                    f"{base}{endpoint}", timeout=self.DEFAULT_TIMEOUT
                )
                if response.status_code != 200:
                    continue
                data = response.json()
            except (requests.exceptions.RequestException, ValueError):
                # Unreachable endpoint or non-JSON body: try the next one.
                continue
            if not isinstance(data, dict):
                continue
            found = data.get("versions") or data.get("api_versions")
            # Keep probing when an endpoint answers 200 without version data.
            if found and isinstance(found, (list, tuple)):
                return list(found)
        return []

    def make_request(self, endpoint: str, method: str = "GET", **kwargs) -> requests.Response:
        """Send a request using the current version, with fallback on failure.

        Args:
            endpoint: Path appended to the version's base URL, e.g. "/users".
            method: HTTP method name.
            **kwargs: Passed through to ``requests.Session.request``. The
                ``headers`` and ``params`` values are copied, never mutated.

        Returns:
            The response object. HTTP error statuses (4xx/5xx) are returned
            as-is; callers must check ``status_code`` themselves.

        Raises:
            ValueError: If no version has been configured.
            requests.exceptions.RequestException: If the current version and
                every fallback version fail at the transport level.
        """
        if not self.current_version:
            raise ValueError("No API version configured")
        try:
            return self._send(self.current_version, endpoint, method, **kwargs)
        except requests.exceptions.RequestException as e:
            return self._handle_version_error(e, endpoint, method, **kwargs)

    def _send(self, version: APIVersion, endpoint: str, method: str, **kwargs) -> requests.Response:
        """Perform exactly one request attempt against ``version``."""
        # Copy so the caller's dict is never modified by the version headers.
        headers = dict(kwargs.pop("headers", None) or {})
        params = kwargs.pop("params", None)

        # Apply version-specific headers
        if version.strategy == VersionStrategy.HEADER:
            if version.headers:
                headers.update(version.headers)
            else:
                headers["API-Version"] = version.version

        # Add query parameters for query-based versioning
        if version.strategy == VersionStrategy.QUERY:
            params = {**(params or {}), "version": version.version}

        kwargs.setdefault("timeout", self.DEFAULT_TIMEOUT)
        response = self.session.request(
            method=method,
            url=self._build_versioned_url(endpoint, version),
            headers=headers,
            params=params,
            **kwargs,
        )

        # Check for version deprecation warnings
        if "Sunset" in response.headers:
            print(f"Warning: API version {version.version} will be sunset on {response.headers['Sunset']}")
        if "Deprecation" in response.headers:
            print(f"Warning: API version {version.version} is deprecated")
        return response

    def _build_versioned_url(self, endpoint: str, version: APIVersion) -> str:
        """Build the request URL according to the version's strategy.

        Slashes are normalised so a trailing "/" on the base URL or a missing
        leading "/" on the endpoint does not produce a malformed URL.
        """
        base = version.base_url.rstrip("/")
        path = f"/{endpoint.lstrip('/')}" if endpoint else ""
        if version.strategy == VersionStrategy.PATH:
            return f"{base}/{version.version}{path}"
        return f"{base}{path}"

    def _handle_version_error(self, error: Exception, endpoint: str, method: str, **kwargs):
        """Retry a failed request on the other non-deprecated versions.

        Each candidate is attempted once via ``_send`` rather than through
        ``make_request``, so a failing fallback cannot start another round of
        fallbacks. ``current_version`` is left unchanged.

        Raises:
            Exception: ``error`` itself, when no fallback succeeds.
        """
        failed = self.current_version
        print(f"Error with version {failed.version}: {error}")

        # Try fallback versions, most recently registered first
        for version in reversed(self.supported_versions):
            if version == failed or version.is_deprecated:
                continue
            print(f"Attempting fallback to version {version.version}")
            try:
                return self._send(version, endpoint, method, **kwargs)
            except requests.exceptions.RequestException as fallback_error:
                print(f"Fallback to version {version.version} failed: {fallback_error}")
        raise error
# Usage example
client = VersionedAPIClient("https://api.example.com")

# Configure supported versions. The first version added becomes the client's
# current version; deprecated versions are never used as a fallback.
client.add_version(APIVersion(
    version="v3",
    strategy=VersionStrategy.PATH,
    base_url="https://api.example.com",
    is_deprecated=False
))
client.add_version(APIVersion(
    version="v2",
    strategy=VersionStrategy.PATH,
    base_url="https://api.example.com",
    is_deprecated=True,
    sunset_date="2024-12-31"
))

# Make versioned requests
try:
    response = client.make_request("/users")
    # make_request returns 4xx/5xx responses instead of raising, so surface
    # them here rather than parsing an error body as user data.
    response.raise_for_status()
    users = response.json()
    print(f"Retrieved {len(users)} users using API version {client.current_version.version}")
except (requests.exceptions.RequestException, ValueError) as e:
    print(f"Failed to retrieve users: {e}")
JavaScript Implementation
Here's a JavaScript/Node.js approach for handling API versioning:
const axios = require('axios');
/**
 * HTTP client that tags every request with the configured API version and
 * retries on the other configured versions when a request fails.
 *
 * A version config is a plain object with these fields:
 *   version    - identifier sent to the service (also the Map key)
 *   strategy   - 'path', 'header' or 'query'
 *   baseUrl    - optional; defaults to the client's baseUrl
 *   headerName - optional header name for 'header' (default 'API-Version')
 *   paramName  - optional parameter name for 'query' (default 'version')
 *   deprecated - when truthy the version is never used as a fallback
 *   priority   - higher values are tried first during fallback (default 0)
 */
class VersionedAPIClient {
  /**
   * @param {string} baseUrl Root URL used for version discovery and as the
   *   default base for versions that do not set their own `baseUrl`.
   */
  constructor(baseUrl) {
    this.baseUrl = baseUrl;
    this.supportedVersions = new Map();
    this.currentVersion = null;
    this.client = axios.create({
      timeout: 10000,
      validateStatus: (status) => status < 500 // Don't throw on 4xx errors
    });

    // Add response interceptor for version warnings
    this.client.interceptors.response.use(
      (response) => {
        this.checkVersionWarnings(response);
        return response;
      },
      // No endpoint is known at this level, so this call only re-rejects;
      // the actual fallback is driven from makeRequest().
      (error) => this.handleVersionError(error)
    );
  }

  /**
   * Register a version config; the first one added becomes the current one.
   * Adding a config whose `version` is already registered replaces it.
   */
  addVersion(versionConfig) {
    this.supportedVersions.set(versionConfig.version, versionConfig);
    if (!this.currentVersion) {
      this.currentVersion = versionConfig;
    }
  }

  /**
   * Probe well-known endpoints for a list of available versions.
   * @returns {Promise<Array>} `versions` or `api_versions` from the first
   *   endpoint that answers 200 with one of them, otherwise `[]`.
   */
  async detectAvailableVersions() {
    const versionEndpoints = ['/versions', '/api/versions', '/v1/versions', '/'];
    for (const endpoint of versionEndpoints) {
      try {
        const response = await this.client.get(`${this.baseUrl}${endpoint}`);
        if (response.status === 200 && response.data) {
          const data = response.data;
          if (data.versions) return data.versions;
          if (data.api_versions) return data.api_versions;
        }
      } catch (error) {
        // Unreachable endpoint: try the next one.
        continue;
      }
    }
    return [];
  }

  /**
   * Send a request with the current version, falling back to other versions
   * when it fails.
   * @param {string} endpoint Path appended to the base URL, e.g. '/users'.
   * @param {object} [options] axios request options; `headers` and `params`
   *   are copied, never mutated.
   * @returns {Promise<object>} The axios response (status below 400).
   * @throws {Error} When no version is configured, or when the current
   *   version and every fallback fail.
   */
  async makeRequest(endpoint, options = {}) {
    if (!this.currentVersion) {
      throw new Error('No API version configured');
    }
    try {
      return await this._sendWithVersion(this.currentVersion, endpoint, options);
    } catch (error) {
      return this.handleVersionError(error, endpoint, options);
    }
  }

  /** Perform exactly one request attempt against `version`. */
  async _sendWithVersion(version, endpoint, options) {
    const response = await this.client(this._buildRequestConfig(endpoint, version, options));
    if (response.status >= 400) {
      throw new Error(`API request failed with status ${response.status}`);
    }
    return response;
  }

  /** Build the axios config for one attempt without touching `options`. */
  _buildRequestConfig(endpoint, version, options = {}) {
    const config = {
      // Spread first: the computed fields below must win, and headers/params
      // must be fresh copies rather than the caller's own objects.
      ...options,
      method: options.method || 'GET',
      url: this.buildVersionedUrl(endpoint, version),
      headers: { ...options.headers },
      params: { ...options.params }
    };

    // Apply versioning strategy
    switch (version.strategy) {
      case 'header':
        config.headers[version.headerName || 'API-Version'] = version.version;
        break;
      case 'query':
        config.params[version.paramName || 'version'] = version.version;
        break;
    }
    return config;
  }

  /** Build the request URL; only the 'path' strategy adds a version segment. */
  buildVersionedUrl(endpoint, version) {
    if (version.strategy === 'path') {
      return `${version.baseUrl || this.baseUrl}/${version.version}${endpoint}`;
    }
    return `${version.baseUrl || this.baseUrl}${endpoint}`;
  }

  /** Log a warning for each deprecation-related header on `response`. */
  checkVersionWarnings(response) {
    const headers = response.headers;
    // currentVersion is still null when discovery runs before addVersion().
    const label = this.currentVersion ? this.currentVersion.version : 'unknown';
    if (headers.sunset) {
      console.warn(`API version ${label} will sunset on ${headers.sunset}`);
    }
    if (headers.deprecation) {
      console.warn(`API version ${label} is deprecated`);
    }
    if (headers['x-api-version-warning']) {
      console.warn(`Version warning: ${headers['x-api-version-warning']}`);
    }
  }

  /** Non-deprecated versions other than the current one, highest priority first. */
  _fallbackCandidates() {
    return Array.from(this.supportedVersions.values())
      .filter((v) => v !== this.currentVersion && !v.deprecated)
      .sort((a, b) => (b.priority || 0) - (a.priority || 0));
  }

  /**
   * Retry a failed request on each fallback candidate, once per candidate.
   * On success `currentVersion` stays on the version that worked; when every
   * candidate fails it is restored and the original error is rejected.
   * Without an `endpoint` (the interceptor path) the error is only re-rejected.
   */
  async handleVersionError(error, endpoint, options) {
    if (!endpoint) return Promise.reject(error);

    const failedVersion = this.currentVersion;
    const failedLabel = failedVersion ? failedVersion.version : 'unknown';
    console.warn(`Error with version ${failedLabel}:`, error.message);

    // Try fallback versions. Each attempt goes through _sendWithVersion, not
    // makeRequest, so a failing fallback cannot trigger nested fallbacks.
    for (const version of this._fallbackCandidates()) {
      console.log(`Attempting fallback to version ${version.version}`);
      // Switch first so interceptor warnings name the version being tried.
      this.currentVersion = version;
      try {
        const response = await this._sendWithVersion(version, endpoint, options);
        console.log(`Successfully fell back to version ${version.version}`);
        return response;
      } catch (fallbackError) {
        this.currentVersion = failedVersion;
      }
    }
    return Promise.reject(error);
  }
}
// Usage example
const client = new VersionedAPIClient('https://api.example.com');

// Configure versions
client.addVersion({
  version: 'v3',
  strategy: 'path',
  baseUrl: 'https://api.example.com',
  priority: 3
});
client.addVersion({
  version: 'v2',
  strategy: 'path',
  baseUrl: 'https://api.example.com',
  deprecated: true,
  priority: 2
});
client.addVersion({
  // With the 'header' strategy the client sends `version` verbatim as the
  // value of `headerName`, so the full media type goes here. An object
  // literal can hold only one `version` key; a second one would silently
  // override the first.
  version: 'application/vnd.api+json;version=2.1',
  strategy: 'header',
  headerName: 'Accept',
  priority: 1
});
// Make requests with automatic version handling
/**
 * Fetch `/users` through the versioned client.
 * @returns {Promise<*>} The response body, or `undefined` when the request
 *   fails (the failure is logged, not rethrown).
 */
async function scrapeUsers() {
  try {
    const reply = await client.makeRequest('/users');
    const usedVersion = client.currentVersion.version;
    console.log(`Retrieved users using version ${usedVersion}`);
    return reply.data;
  } catch (err) {
    console.error('Failed to retrieve users:', err.message);
  }
}
Best Practices for Version Management
1. Version Discovery and Documentation
Always start by discovering available API versions:
# Check API documentation endpoint
curl https://api.example.com/
curl https://api.example.com/versions
curl https://api.example.com/docs
# Look for version headers in responses
curl -I https://api.example.com/users
2. Graceful Degradation Strategy
Implement a fallback hierarchy from newest to oldest supported versions:
# Version priority configuration
VERSION_PRIORITY = [
    {"version": "v3", "preferred": True},
    {"version": "v2.1", "fallback": True},
    {"version": "v2", "legacy": True},
]


def get_version_priority(versions=None):
    """Return version entries ordered from most to least preferred.

    Entries flagged ``preferred`` come first, then entries that are not
    flagged ``legacy``, then legacy entries. The ``fallback`` flag is not
    consulted. Entries that tie keep their input order.

    Args:
        versions: Optional iterable of entry dicts to rank. Defaults to the
            module-level ``VERSION_PRIORITY``.

    Returns:
        A new sorted list; the input is not modified.
    """
    entries = VERSION_PRIORITY if versions is None else versions
    return sorted(
        entries,
        key=lambda entry: (
            entry.get("preferred", False),
            not entry.get("legacy", False),
        ),
        reverse=True,
    )
3. Response Format Adaptation
Handle different response formats across versions:
class ResponseAdapter:
    """Normalise user payloads from different API versions to one shape."""

    # For each version prefix: normalised key -> field name in that version.
    # Prefixes are checked in this order.
    _USER_FIELDS = {
        'v3': {
            'id': 'user_id',
            'name': 'full_name',
            'email': 'email_address',
            'created': 'creation_date',
        },
        'v2': {
            'id': 'id',
            'name': 'name',
            'email': 'email',
            'created': 'created_at',
        },
    }

    def __init__(self, version):
        """Remember the version string, e.g. 'v3', whose payloads are adapted."""
        self.version = version

    def adapt_user_response(self, response_data):
        """Adapt user response format across versions.

        Args:
            response_data: The user object as returned by the API.

        Returns:
            A new dict with the keys ``id``, ``name``, ``email`` and
            ``created`` for v3*/v2* versions. For any other version (the v1
            format) the payload is passed through, as a shallow copy when it
            is a dict so the raw response cannot be mutated via the result.

        Raises:
            KeyError: If a v3/v2 payload lacks one of the mapped fields.
        """
        for prefix, fields in self._USER_FIELDS.items():
            if self.version.startswith(prefix):
                return {
                    target: response_data[source]
                    for target, source in fields.items()
                }
        # v1 format
        if isinstance(response_data, dict):
            return dict(response_data)
        return response_data
4. Monitoring and Alerting
Implement monitoring for version-related issues:
import logging
from datetime import datetime, timedelta
class VersionMonitor:
    """Collect per-version request counters and derive a health verdict."""

    def __init__(self):
        # version -> {'requests': int, 'failures': int, 'last_used': datetime}
        self.version_metrics = {}
        self.logger = logging.getLogger(__name__)

    def track_version_usage(self, version, success=True):
        """Track version usage and success rates.

        Args:
            version: Key identifying the API version.
            success: Pass False to count this request as a failure.
        """
        metrics = self.version_metrics.setdefault(
            version,
            {'requests': 0, 'failures': 0, 'last_used': None},
        )
        metrics['requests'] += 1
        metrics['last_used'] = datetime.now()
        if not success:
            metrics['failures'] += 1

    def check_version_health(self, version, max_failure_rate=0.1):
        """Check if a version is healthy based on success rate.

        Args:
            version: Key identifying the API version.
            max_failure_rate: A version is healthy while its failure rate is
                strictly below this fraction (default 10%).

        Returns:
            True when the version is healthy or has no recorded requests yet,
            False otherwise. An unhealthy verdict is also logged as a warning.
        """
        metrics = self.version_metrics.get(version, {})
        # Named `total`, not `requests`, to avoid shadowing the HTTP library.
        total = metrics.get('requests', 0)
        failures = metrics.get('failures', 0)
        if total == 0:
            return True  # No data yet
        failure_rate = failures / total
        healthy = failure_rate < max_failure_rate
        if not healthy:
            self.logger.warning(
                "API version %s is unhealthy: %d of %d requests failed",
                version, failures, total,
            )
        return healthy
Advanced Versioning Considerations
Handling Breaking Changes
When APIs introduce breaking changes, implement data transformation layers that map each version's payloads to a single internal format. For scenarios involving complex user interactions, consider using browser automation tools to handle authentication when API authentication methods change between versions.
Schema Validation
Implement schema validation to detect version-related changes:
import jsonschema
class VersionValidator:
    """Validate API responses against a JSON schema registered per version."""

    def __init__(self):
        # version -> JSON schema used to validate that version's responses
        self.schemas = {}

    def add_schema(self, version, schema):
        """Register (or replace) the schema for ``version``."""
        self.schemas[version] = schema

    def validate_response(self, version, data):
        """Check ``data`` against the schema registered for ``version``.

        Returns:
            True when the data validates or when no schema is registered for
            the version; False when validation fails (the failure is printed).
        """
        if version not in self.schemas:
            # Nothing to check against, so the response is accepted.
            return True
        try:
            jsonschema.validate(data, self.schemas[version])
        except jsonschema.ValidationError as e:
            print(f"Schema validation failed for version {version}: {e}")
            return False
        return True
Caching Version Information
Cache version metadata to avoid repeated discovery requests:
import pickle
from datetime import datetime, timedelta
class VersionCache:
    """File-backed cache of discovered API versions, keyed by base URL."""

    def __init__(self, cache_duration_hours=24, cache_file='api_versions.cache'):
        """Configure the cache.

        Args:
            cache_duration_hours: How long a cached entry stays valid.
            cache_file: Path of the pickle file that stores all entries.
        """
        self.cache_file = cache_file
        self.cache_duration = timedelta(hours=cache_duration_hours)

    def _load(self):
        """Read the whole cache file; return {} if it is missing or unreadable."""
        # SECURITY: pickle.load can execute arbitrary code from a crafted
        # file. Keep the cache file somewhere only this application can write.
        try:
            with open(self.cache_file, 'rb') as f:
                cache_data = pickle.load(f)
        except FileNotFoundError:
            return {}
        except (pickle.UnpicklingError, EOFError, AttributeError,
                ImportError, IndexError):
            # Truncated or corrupt cache: treat it as empty so discovery
            # runs again instead of crashing the caller.
            return {}
        return cache_data if isinstance(cache_data, dict) else {}

    def get_cached_versions(self, api_base_url):
        """Return the cached versions for ``api_base_url``.

        Returns:
            The stored versions, or None when there is no entry or the entry
            is older than the configured cache duration.
        """
        entry = self._load().get(api_base_url)
        if entry is None:
            return None
        if datetime.now() - entry['timestamp'] < self.cache_duration:
            return entry['versions']
        return None

    def cache_versions(self, api_base_url, versions):
        """Store ``versions`` for ``api_base_url``, keeping other entries."""
        cache_data = self._load()
        cache_data[api_base_url] = {
            'versions': versions,
            'timestamp': datetime.now()
        }
        with open(self.cache_file, 'wb') as f:
            pickle.dump(cache_data, f)
Conclusion
Handling API versioning effectively is essential for maintaining robust web scraping operations. By implementing version detection, graceful fallbacks, response adaptation, and comprehensive monitoring, you can ensure your scraping applications remain resilient to API changes.
The key is to build flexibility into your scraping architecture from the start, anticipating that APIs will evolve and preparing your code to adapt accordingly. Regular monitoring and proactive version management will help you stay ahead of deprecations and breaking changes, maintaining reliable data extraction even as third-party services update their APIs.
For complex scenarios involving dynamic content, consider combining API versioning strategies with browser automation techniques for monitoring network requests to track API version changes in real-time.