Can MechanicalSoup Handle POST Requests with JSON Data?
Yes, MechanicalSoup can handle POST requests with JSON data, though it requires a slightly different approach than form-based submissions. While MechanicalSoup is primarily designed for HTML forms and browser automation, it provides the flexibility to send raw HTTP requests with JSON payloads through its underlying requests session.
Understanding MechanicalSoup's HTTP Capabilities
MechanicalSoup is built on top of the popular requests library, which means it inherits all the HTTP functionality that requests provides. This includes the ability to send POST requests with custom headers, data formats, and content types including JSON.
Basic JSON POST Request with MechanicalSoup
Here's how to send a POST request with JSON data using MechanicalSoup:
import mechanicalsoup

# Create a MechanicalSoup browser instance
browser = mechanicalsoup.StatefulBrowser()

# Prepare your JSON data
json_data = {
    "username": "john_doe",
    "email": "john@example.com",
    "preferences": {
        "theme": "dark",
        "notifications": True,
    },
}

# Send POST request with JSON data.
# NOTE: the `json=` keyword already serializes the payload and sets
# Content-Type to application/json, so the explicit header below is
# redundant — it is kept only to make the request fully explicit.
response = browser.session.post(
    'https://api.example.com/users',
    json=json_data,
    headers={'Content-Type': 'application/json'}
)

# Check the response
if response.status_code == 200:
    print("JSON POST request successful!")
    print(response.json())
else:
    print(f"Request failed with status code: {response.status_code}")
Advanced JSON POST Examples
Sending JSON with Authentication Headers
import mechanicalsoup
import json

browser = mechanicalsoup.StatefulBrowser()

# Headers carrying the bearer token, content type, and a custom UA.
auth_headers = {
    'Content-Type': 'application/json',
    'Authorization': 'Bearer your-access-token',
    'User-Agent': 'MechanicalSoup/1.0',
}

# Build the nested order payload piece by piece for readability.
order_items = [
    {"id": 1, "quantity": 2, "price": 29.99},
    {"id": 2, "quantity": 1, "price": 15.50},
]
shipping_info = {"address": "123 Main St", "city": "New York", "zip": "10001"}
request_metadata = {"source": "web_scraper", "timestamp": "2024-01-15T10:30:00Z"}

order_payload = {
    "action": "create_order",
    "items": order_items,
    "shipping": shipping_info,
    "metadata": request_metadata,
}

response = browser.session.post(
    'https://api.ecommerce.com/orders',
    json=order_payload,
    headers=auth_headers,
    timeout=30,
)

print(f"Response status: {response.status_code}")
print(f"Response body: {response.text}")
Handling JSON Responses
import mechanicalsoup
import json  # required: the except clause below references json.JSONDecodeError

browser = mechanicalsoup.StatefulBrowser()

# Send JSON data and handle the response
json_payload = {"query": "web scraping tools", "limit": 10}

response = browser.session.post(
    'https://api.search.com/v1/search',
    json=json_payload
)

try:
    # Parse JSON response; requests raises a JSONDecodeError
    # (a subclass of json.JSONDecodeError) when the body is not valid JSON.
    result_data = response.json()
    # Process the results
    if 'results' in result_data:
        for item in result_data['results']:
            print(f"Title: {item.get('title')}")
            print(f"URL: {item.get('url')}")
except json.JSONDecodeError:
    # Without the `import json` above (missing in the original snippet),
    # reaching this handler raised NameError instead of printing the message.
    print("Response is not valid JSON")
    print(f"Raw response: {response.text}")
Comparison with Pure Requests Library
While MechanicalSoup can handle JSON POST requests, it's worth comparing it with the pure requests library approach:
Using Requests Directly
import requests
import json

# Plain requests: post the payload directly, no browser object involved.
payload = {"name": "test", "value": 42}
response = requests.post(
    'https://api.example.com/data',
    json=payload,
    headers={'Content-Type': 'application/json'},
)
Using MechanicalSoup's Session
import mechanicalsoup

# MechanicalSoup approach
browser = mechanicalsoup.StatefulBrowser()

payload = {"name": "test", "value": 42}

# The underlying requests.Session is exposed as `browser.session`,
# so JSON POSTs work exactly as they do with requests itself.
response = browser.session.post(
    'https://api.example.com/data',
    json=payload,
    headers={'Content-Type': 'application/json'},
)
The main advantage of using MechanicalSoup is that it maintains session state, handles cookies automatically, and provides additional browser-like functionality when needed.
Error Handling and Best Practices
Robust Error Handling
import mechanicalsoup
import json
from requests.exceptions import RequestException, Timeout, ConnectionError

browser = mechanicalsoup.StatefulBrowser()


def send_json_post(url, data, headers=None, timeout=30):
    """
    Send a POST request with JSON data and handle common errors.

    Returns the Response on success, or None when the request times out,
    fails to connect, or otherwise raises a requests exception.
    """
    # Caller-supplied headers override the JSON default on key collision.
    merged_headers = {'Content-Type': 'application/json'}
    merged_headers.update(headers or {})

    try:
        response = browser.session.post(
            url,
            json=data,
            headers=merged_headers,
            timeout=timeout,
        )
        # Surface 4xx/5xx responses as exceptions.
        response.raise_for_status()
    except Timeout:
        print(f"Request to {url} timed out")
    except ConnectionError:
        print(f"Failed to connect to {url}")
    except RequestException as e:
        print(f"Request failed: {e}")
    else:
        return response
    return None


# Usage example
data = {"user_id": 123, "action": "update_profile"}
response = send_json_post('https://api.example.com/users', data)

if response:
    print("Success:", response.json())
Session Management with JSON APIs
import mechanicalsoup


class APIClient:
    """Minimal JSON-API client that reuses one MechanicalSoup session."""

    def __init__(self, base_url):
        self.browser = mechanicalsoup.StatefulBrowser()
        self.base_url = base_url
        self.authenticated = False

    def login(self, username, password):
        """Authenticate with the API using JSON credentials."""
        credentials = {"username": username, "password": password}
        response = self.browser.session.post(
            f"{self.base_url}/auth/login",
            json=credentials,
            headers={'Content-Type': 'application/json'},
        )
        if response.status_code != 200:
            return False

        self.authenticated = True
        # Token might be in response or cookies
        token_data = response.json()
        if 'access_token' in token_data:
            bearer = f"Bearer {token_data['access_token']}"
            self.browser.session.headers.update({'Authorization': bearer})
        return True

    def send_data(self, endpoint, data):
        """Send JSON data to an API endpoint."""
        if not self.authenticated:
            raise Exception("Must authenticate first")
        return self.browser.session.post(
            f"{self.base_url}{endpoint}",
            json=data,
            headers={'Content-Type': 'application/json'},
        )


# Usage
client = APIClient('https://api.example.com')
if client.login('user@example.com', 'password123'):
    result = client.send_data('/users/profile', {
        "name": "John Doe",
        "preferences": {"theme": "dark"},
    })
    print(result.json())
When to Use MechanicalSoup vs. Alternatives
Use MechanicalSoup When:
- Mixed workflows: You need both form handling and JSON API interactions
- Session persistence: You want automatic cookie and session management
- Browser emulation: You need to simulate browser behavior alongside API calls
- Complex authentication: The site uses both forms and API tokens
Consider Alternatives When:
- Pure API work: If you're only working with JSON APIs, the plain requests library might be simpler
- Performance: For high-volume API calls, requests or aiohttp might be faster
- Advanced features: For complex HTTP scenarios, httpx offers more modern features
Integration with Web Scraping Workflows
MechanicalSoup's JSON POST capability becomes particularly useful when you need to handle authentication in web scraping workflows, where you might authenticate via API calls before scraping protected content:
import mechanicalsoup

browser = mechanicalsoup.StatefulBrowser()

# Step 1: Authenticate via JSON API
auth_response = browser.session.post(
    'https://example.com/api/auth',
    json={"username": "scraper_user", "password": "secret"},
)

if auth_response.status_code == 200:
    # Step 2: Use the authenticated session for scraping
    page = browser.get('https://example.com/protected-data')

    # Step 3: Extract data using MechanicalSoup's parsing
    for element in page.soup.find_all('div', class_='data-item'):
        print(element.get_text().strip())
Using JSON APIs for Dynamic Content Loading
Similar to how you might handle AJAX requests using Puppeteer, MechanicalSoup can interact with the same AJAX endpoints directly:
import mechanicalsoup

browser = mechanicalsoup.StatefulBrowser()

# Navigate to the main page first to establish session
browser.get('https://example.com/products')

# Make AJAX request that the page would normally make
search_params = {"category": "electronics", "sort": "price_asc", "page": 1}
ajax_headers = {
    'Content-Type': 'application/json',
    # Marker many backends use to recognize in-page AJAX calls.
    'X-Requested-With': 'XMLHttpRequest',
}

ajax_response = browser.session.post(
    'https://example.com/api/products/search',
    json=search_params,
    headers=ajax_headers,
)

if ajax_response.status_code == 200:
    products = ajax_response.json()
    for product in products['items']:
        print(f"{product['name']}: ${product['price']}")
Troubleshooting Common Issues
Content-Type Headers
Always ensure you're setting the correct Content-Type header:
# Correct approach
response = browser.session.post(
    url,
    json=data,  # This automatically sets Content-Type to application/json
    headers={'Authorization': 'Bearer token'},
)

# Alternative explicit approach
import json

serialized = json.dumps(data)
explicit_headers = {
    'Content-Type': 'application/json',
    'Authorization': 'Bearer token',
}
response = browser.session.post(url, data=serialized, headers=explicit_headers)
Handling Different Response Types
response = browser.session.post(url, json=data)

# Check content type before parsing
content_type = response.headers.get('Content-Type', '')

# Every branch now assigns `result`; the original left `result` undefined
# when the response was HTML (it assigned only `soup`).
if 'application/json' in content_type:
    result = response.json()
elif 'text/html' in content_type:
    # Parse as HTML if needed
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    result = soup
else:
    # Handle other content types
    result = response.text
Working with CSRF Tokens
import mechanicalsoup

browser = mechanicalsoup.StatefulBrowser()

# Get the page with CSRF token first
page = browser.get('https://example.com/form-page')

# Extract CSRF token from the page. BeautifulSoup's find() returns None
# when no matching tag exists, so guard before subscripting — otherwise
# a missing meta tag raises a confusing "'NoneType' is not subscriptable".
token_tag = page.soup.find('meta', {'name': 'csrf-token'})
if token_tag is None:
    raise RuntimeError("CSRF token meta tag not found on the page")
csrf_token = token_tag['content']

# Include CSRF token in JSON request
json_data = {
    "action": "update_settings",
    "csrf_token": csrf_token,
    "data": {"theme": "dark", "notifications": True},
}

response = browser.session.post(
    'https://example.com/api/settings',
    json=json_data,
    headers={'Content-Type': 'application/json'}
)
Performance Considerations
Connection Pooling
MechanicalSoup automatically handles connection pooling through its underlying requests session:
import mechanicalsoup

# The browser instance reuses connections automatically
browser = mechanicalsoup.StatefulBrowser()

# Multiple requests will reuse the same connection when possible
for batch_id in range(10):
    batch_payload = {"batch_id": batch_id, "items": [1, 2, 3]}
    response = browser.session.post(
        'https://api.example.com/batch',
        json=batch_payload,
    )
    print(f"Batch {batch_id}: {response.status_code}")
Async Alternatives
For high-performance scenarios, consider async alternatives:
import asyncio
import aiohttp


async def send_json_async(session, url, data):
    """POST *data* as JSON to *url* and return the parsed JSON response."""
    async with session.post(url, json=data) as response:
        return await response.json()


async def main():
    """Fan out 100 concurrent JSON POSTs and gather all results."""
    async with aiohttp.ClientSession() as session:
        tasks = []
        for i in range(100):
            data = {"request_id": i}
            task = send_json_async(session, 'https://api.example.com/data', data)
            tasks.append(task)
        results = await asyncio.gather(*tasks)
        return results


# A coroutine does nothing until an event loop drives it — the original
# snippet defined main() but never ran it.
if __name__ == "__main__":
    results = asyncio.run(main())

# For CPU-bound work, stick with MechanicalSoup
# For I/O-bound work with many concurrent requests, consider aiohttp
Conclusion
MechanicalSoup can definitely handle POST requests with JSON data, leveraging its underlying requests session. While it may not be the most lightweight option for pure API interactions, it excels in scenarios where you need to combine web scraping with API calls in a single workflow. The key is understanding when to use MechanicalSoup's browser-like features versus direct session access for JSON operations.
For developers working on complex scraping projects that involve both form interactions and API calls, MechanicalSoup provides a unified interface that maintains session state and simplifies authentication workflows across different interaction types.