How do I validate response content types in Requests?
When building robust web scraping applications or API clients, validating response content types is crucial for ensuring data integrity and preventing processing errors. The Python Requests library provides several methods to check and validate the content type of HTTP responses before processing the data.
Understanding Content Types
Content types, also known as MIME types, indicate the format of data returned by a server. Common content types include:
- application/json - JSON data
- text/html - HTML documents
- text/plain - Plain text
- application/xml - XML documents
- image/jpeg - JPEG images
- application/pdf - PDF files
Basic Content Type Validation
Checking the Content-Type Header
The most straightforward way to validate content types is by examining the Content-Type header in the response:
import requests
def validate_content_type(url, expected_type):
    """Fetch ``url`` and report whether its Content-Type contains ``expected_type``.

    The check is a case-insensitive substring test against the full header
    value, so parameters such as ``charset`` do not prevent a match.

    Args:
        url: Address to request with HTTP GET.
        expected_type: Media type expected in the ``Content-Type`` header,
            e.g. ``"application/json"``.

    Returns:
        True if the header contains ``expected_type``, otherwise False.
        A status line is printed in both cases.

    Raises:
        requests.RequestException: If the request itself fails or times out.
    """
    # Without a timeout, requests may wait on an unresponsive server forever.
    response = requests.get(url, timeout=10)
    content_type = response.headers.get('Content-Type', '')

    # Media type names are case-insensitive, so compare lower-cased values.
    if expected_type.lower() in content_type.lower():
        print(f"✓ Valid content type: {content_type}")
        return True

    print(f"✗ Invalid content type. Expected: {expected_type}, Got: {content_type}")
    return False
# Example usage
url = "https://api.example.com/data"
if validate_content_type(url, "application/json"):
    # validate_content_type() only returns a bool, so the response has to be
    # requested here before its body can be parsed.
    response = requests.get(url, timeout=10)
    data = response.json()
    # Process JSON data
Using Response Properties
Requests exposes the response headers through response.headers, which you can inspect to check for common content types:
import requests
response = requests.get("https://example.com/api/data")

# Read the header once; an absent header is treated as an empty string.
header_value = response.headers.get('Content-Type', '')

# JSON: the header has to start with the JSON media type
if header_value.startswith('application/json'):
    try:
        data = response.json()
    except ValueError:
        print("Response claims to be JSON but parsing failed")
    else:
        print("Successfully parsed JSON data")

# HTML: a substring match anywhere in the header is accepted
if 'text/html' in header_value:
    print("Response is HTML content")

# Plain text: same substring rule
if 'text/plain' in header_value:
    print("Response is plain text")
Advanced Content Type Validation
Creating a Content Type Validator Class
For more sophisticated validation, create a reusable validator class:
import requests
from typing import List, Optional
import mimetypes
class ContentTypeValidator:
    """Answer questions about the media type of a response.

    The ``Content-Type`` header is read once at construction time and kept
    lower-cased in ``content_type`` (an empty string when the header is
    absent). Every check is a substring or prefix test against that value,
    so parameters such as ``charset`` do not get in the way.
    """

    def __init__(self, response: "requests.Response"):
        """Store ``response`` and its lower-cased Content-Type header."""
        self.response = response
        self.content_type = response.headers.get('Content-Type', '').lower()

    def is_json(self) -> bool:
        """Check if response is JSON"""
        return 'application/json' in self.content_type

    def is_html(self) -> bool:
        """Check if response is HTML"""
        return 'text/html' in self.content_type

    def is_xml(self) -> bool:
        """Check if response is XML (``application/xml`` or ``text/xml``)"""
        return any(xml_type in self.content_type
                   for xml_type in ('application/xml', 'text/xml'))

    def is_image(self) -> bool:
        """Check if response is an image (any ``image/*`` type)"""
        return self.content_type.startswith('image/')

    def matches_expected(self, expected_types: List[str]) -> bool:
        """Check if content type matches any of the expected types.

        Expected values are lower-cased before comparison because the stored
        header is lower-cased; otherwise ``"Application/JSON"`` could never
        match.
        """
        return any(expected.lower() in self.content_type
                   for expected in expected_types)

    def validate_or_raise(self, expected_types: List[str],
                          custom_message: Optional[str] = None):
        """Validate content type or raise an exception.

        Args:
            expected_types: Acceptable media types.
            custom_message: Text for the exception; a default message naming
                the expected and actual types is used when omitted.

        Raises:
            ValueError: If none of ``expected_types`` matches.
        """
        if not self.matches_expected(expected_types):
            message = custom_message or f"Expected {expected_types}, got {self.content_type}"
            raise ValueError(message)
# Usage example
def fetch_and_validate(url: str, expected_types: List[str]):
    """GET ``url`` and return the response only when its content type is acceptable.

    Args:
        url: Address to request.
        expected_types: Media types that count as acceptable.

    Returns:
        The response object on success; None (after printing the reason)
        when the content type check fails.
    """
    response = requests.get(url)
    checker = ContentTypeValidator(response)

    try:
        checker.validate_or_raise(expected_types)
    except ValueError as e:
        print(f"Content type validation failed: {e}")
        return None
    else:
        return response
# Request the endpoint, accepting the response only if it is JSON
json_response = fetch_and_validate("https://api.example.com/data", ["application/json"])
Handling Complex Content Types
Parsing Content Type Parameters
Content-Type headers often include additional parameters like charset:
import requests
from email.message import Message
def parse_content_type(content_type_header):
    """Break a Content-Type header value into media type, charset and parameters.

    Parsing is delegated to ``email.message.Message`` rather than done by
    hand with string splitting.

    Args:
        content_type_header: Raw header value, e.g.
            ``"text/html; charset=UTF-8"``.

    Returns:
        A dict with the keys ``'type'`` (media type), ``'charset'`` (charset
        or None) and ``'params'`` (dict of the header's parameters).
    """
    parser = Message()
    parser['content-type'] = content_type_header

    # get_params() yields (name, value) pairs; the first pair is the media
    # type itself, so only the remainder are real parameters.
    all_params = parser.get_params()
    extra_params = dict(all_params[1:]) if all_params else {}

    return {
        'type': parser.get_content_type(),
        'charset': parser.get_content_charset(),
        'params': extra_params,
    }
# Example usage: fetch a page and show the parsed pieces of its header
response = requests.get("https://example.com")
content_type_info = parse_content_type(response.headers.get('Content-Type', ''))

print(f"Content Type: {content_type_info['type']}")
print(f"Charset: {content_type_info['charset']}")
print(f"Additional params: {content_type_info['params']}")
Validating Against Multiple Content Types
When working with APIs that might return different content types:
import requests
from typing import Dict, Callable
def create_content_handler() -> Dict[str, Callable]:
    """Build the lookup table that maps a media type to its parser.

    Each parser takes a response object and returns the parsed body. Both
    XML media types share the same parser.

    Returns:
        Dict keyed by media type string, with the parser callable as value.
    """
    def parse_xml(response):
        # Imported lazily so the module is only loaded when XML is seen.
        # NOTE(review): the body comes from a remote server; confirm that
        # parsing untrusted XML with ElementTree is acceptable here.
        import xml.etree.ElementTree as ET
        return ET.fromstring(response.content)

    def parse_html(response):
        # bs4 is imported lazily, so it is only needed when HTML is parsed.
        from bs4 import BeautifulSoup
        return BeautifulSoup(response.content, 'html.parser')

    registry: Dict[str, Callable] = {
        'application/json': lambda response: response.json(),
        'application/xml': parse_xml,
        'text/xml': parse_xml,
        'text/html': parse_html,
        'text/plain': lambda response: response.text,
    }
    return registry
def process_response_by_type(url: str):
    """Fetch ``url`` and parse the body with the handler for its media type.

    The media type is taken from the ``Content-Type`` header with any
    parameters (such as ``charset``) removed, then looked up exactly and
    case-insensitively in the table from ``create_content_handler()``.

    Args:
        url: Address to request with HTTP GET.

    Returns:
        Whatever the matching handler returns, or None when the media type
        has no handler or the handler fails (a message is printed in both
        cases).

    Raises:
        requests.RequestException: If the request itself fails or times out.
    """
    # Without a timeout, requests may wait on an unresponsive server forever.
    response = requests.get(url, timeout=10)
    content_type = response.headers.get('Content-Type', '').split(';')[0].strip()

    # Exact lookup on the lower-cased type: a substring test would also accept
    # unrelated types that merely start with a supported one.
    mime_type = content_type.lower()
    handler = create_content_handler().get(mime_type)
    if handler is None:
        print(f"Unsupported content type: {content_type}")
        return None

    try:
        return handler(response)
    except Exception as e:
        # Deliberately broad: each handler raises its own error types, and a
        # bad body should be reported rather than crash the caller.
        print(f"Failed to process {mime_type}: {e}")
        return None
Error Handling and Best Practices
Robust Content Type Validation
Implement comprehensive error handling for production applications:
import requests
import logging
from typing import Optional, Union
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class ContentTypeError(Exception):
    """Signals that a response failed content type validation."""
def safe_validate_content_type(response: requests.Response,
                               expected_types: list,
                               strict: bool = False) -> bool:
    """
    Safely validate content type with comprehensive error handling

    The header is lower-cased and stripped of parameters (such as charset)
    before each expected type is looked for, case-insensitively, inside it.

    Args:
        response: The requests Response object
        expected_types: List of acceptable content types
        strict: If True, raise exception on validation failure

    Returns:
        bool: True if content type is valid. With strict=False, False is
        also returned when the header is missing or an unexpected error
        occurs during validation.

    Raises:
        ContentTypeError: If strict=True and validation fails
        Exception: If strict=True, any unexpected error is re-raised as is
    """
    try:
        content_type = response.headers.get('Content-Type', '').lower()
        if not content_type:
            logger.warning("No Content-Type header found in response")
            if strict:
                raise ContentTypeError("Missing Content-Type header")
            return False

        # Extract main content type (ignore parameters like charset)
        main_type = content_type.split(';')[0].strip()

        # The header was lower-cased above, so lower-case the expected
        # values too; otherwise "Application/JSON" could never match.
        is_valid = any(expected.lower() in main_type for expected in expected_types)
        if not is_valid:
            error_msg = f"Content type '{main_type}' not in expected types {expected_types}"
            logger.error(error_msg)
            if strict:
                raise ContentTypeError(error_msg)
        return is_valid
    except ContentTypeError:
        # Raised above only in strict mode and already logged there; pass it
        # on without logging the same failure a second time.
        raise
    except Exception as e:
        logger.error(f"Error validating content type: {e}")
        if strict:
            raise
        return False
# Usage examples
try:
    response = requests.get("https://api.example.com/data")

    # Lenient mode: a mismatch only yields False, so use it as a guard
    if safe_validate_content_type(response, ["application/json"]):
        data = response.json()

    # Strict mode: a mismatch surfaces as ContentTypeError, handled below
    safe_validate_content_type(response, ["application/json"], strict=True)
except ContentTypeError as e:
    print(f"Content type validation failed: {e}")
except requests.RequestException as e:
    print(f"Request failed: {e}")
Integration with Web Scraping Workflows
When building web scrapers, content type validation helps ensure you're processing the right kind of data. This is particularly important when handling dynamic content that loads after the initial page load, or when monitoring network requests in browser automation tools like Puppeteer.
Combining with Session Management
import requests
# One shared session, so every request sends the same default headers
session = requests.Session()
session.headers['User-Agent'] = 'Mozilla/5.0 (compatible; WebScraper/1.0)'
session.headers['Accept'] = 'application/json, text/html, application/xml'
def scrape_with_validation(urls: list):
    """Fetch each URL through the shared session and collect JSON/HTML results.

    A URL that cannot be fetched, returns an error status, or carries an
    unparseable JSON body is logged and skipped, so one bad URL does not
    abort the rest of the run.

    Args:
        urls: URLs to request.

    Returns:
        A list of dicts, one per successfully processed URL. JSON responses
        give ``{'url', 'type': 'json', 'data'}``; HTML responses give
        ``{'url', 'type': 'html', 'content'}``. Other content types are
        logged as warnings and left out.
    """
    results = []
    for url in urls:
        try:
            response = session.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            logger.error(f"Failed to fetch {url}: {e}")
            continue

        # Validate content type before processing. Media type names are
        # case-insensitive, so match on a lower-cased copy and keep the
        # original value for the log message.
        content_type = response.headers.get('Content-Type', '')
        media_type = content_type.lower()

        if 'application/json' in media_type:
            try:
                data = response.json()
            except ValueError as e:
                # The header claims JSON but the body does not parse.
                logger.error(f"Invalid JSON body from {url}: {e}")
                continue
            results.append({'url': url, 'type': 'json', 'data': data})
        elif 'text/html' in media_type:
            # Process HTML content
            results.append({'url': url, 'type': 'html', 'content': response.text})
        else:
            logger.warning(f"Unexpected content type for {url}: {content_type}")
    return results
Conclusion
Validating response content types in Python Requests is essential for building reliable web scraping and API client applications. By implementing proper content type validation, you can:
- Prevent processing errors from unexpected data formats
- Implement appropriate error handling for different response types
- Build more robust and maintainable scraping applications
- Ensure data integrity in your processing pipeline
Remember to always combine content type validation with proper error handling and logging to create production-ready applications that can gracefully handle various scenarios and edge cases.