Mechanize is a Python library for stateful, programmatic web browsing. It is used to automate website interactions, form submissions, and data scraping. While powerful, Mechanize can raise a variety of errors during web scraping operations. This guide covers the most common errors and their solutions.
HTTP Errors
1. HTTP Error 403: Forbidden
Cause: The server detects automated requests and blocks them, often due to missing or suspicious headers.
Solutions:
import mechanize
# Minimal setup: a browser that skips robots.txt handling and sends a single
# desktop-Chrome User-Agent header.
br = mechanize.Browser()
br.set_handle_robots(False)
br.addheaders = [
    ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'),
]

# Fuller browser simulation: enable http-equiv, Referer and redirect handling,
# then replace the header list above with a more complete set.
br.set_handle_equiv(True)
br.set_handle_referer(True)
br.set_handle_redirect(True)
br.addheaders = [
    ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
    ('Accept-Language', 'en-US,en;q=0.5'),
    ('Accept-Encoding', 'gzip, deflate'),
    ('Connection', 'keep-alive'),
]
2. HTTP Error 404: Not Found
Cause: The requested URL doesn't exist or has moved.
Solutions:
import mechanize
from urllib.error import HTTPError
br = mechanize.Browser()
try:
    response = br.open('https://example.com/page')
except HTTPError as e:
    if e.code != 404:
        # Only "not found" is handled here; any other HTTP error is a
        # different problem and must not be silently swallowed.
        raise
    print("Page not found. Check the URL or try alternative paths.")
    # Try common alternatives
    alternative_urls = [
        'https://example.com/page.html',
        'https://example.com/page/',
        'https://example.com/old-page',
    ]
    for url in alternative_urls:
        try:
            response = br.open(url)
        except HTTPError:
            # This candidate failed too; move on to the next one.
            continue
        print(f"Found page at: {url}")
        break
    else:
        # Every alternative failed: make that visible instead of leaving
        # `response` undefined without any message.
        response = None
        print("None of the alternative URLs could be opened.")
3. HTTP Error 500: Internal Server Error
Cause: Server-side error, often triggered by malformed requests or server overload.
Solutions:
import time
import mechanize
from urllib.error import HTTPError
def retry_request(browser, url, max_retries=3, delay=2):
    """Open ``url`` with ``browser``, retrying on HTTP 500 with exponential backoff.

    Args:
        browser: Object exposing ``open(url)`` (e.g. a ``mechanize.Browser``).
        url: Address to open.
        max_retries: Total number of attempts before giving up.
        delay: Seconds to wait before the second attempt; doubled after
            every failed attempt.

    Returns:
        Whatever ``browser.open(url)`` returns on the first successful attempt.

    Raises:
        HTTPError: Immediately, for any HTTP status other than 500.
        Exception: When every attempt failed with HTTP 500. The last
            ``HTTPError`` is attached as ``__cause__``.
    """
    last_error = None
    for attempt in range(max_retries):
        try:
            return browser.open(url)
        except HTTPError as e:
            if e.code != 500:
                raise
            last_error = e
            if attempt + 1 >= max_retries:
                # No retry follows the final attempt, so do not announce one
                # or waste time sleeping before giving up.
                break
            print(f"Server error on attempt {attempt + 1}. Retrying in {delay} seconds...")
            time.sleep(delay)
            delay *= 2  # Exponential backoff
    raise Exception(f"Failed after {max_retries} attempts") from last_error
# Example usage: fetch a page through retry_request with its default
# max_retries and delay.
br = mechanize.Browser()
response = retry_request(br, 'https://example.com')
Form and Navigation Errors
4. FormNotFoundError
Cause: Mechanize cannot locate the specified form on the current page.
Solutions:
import mechanize
br = mechanize.Browser()
br.open('https://example.com/login')

# Dump every form on the page, with its controls, to see what can be selected.
print("Available forms:")
for i, form in enumerate(br.forms()):
    print(f"Form {i}: {form.name or 'unnamed'}")
    print(f" Action: {form.action}")
    print(f" Method: {form.method}")
    for control in form.controls:
        print(f" Control: {control.name} ({control.type})")

# Selecting by position (most reliable): nr=0 is the first form.
try:
    br.select_form(nr=0)
except mechanize.FormNotFoundError:
    print("No forms found on this page")

# Selecting by attribute: try the name first, then the id, and report a
# failure only when neither matches.
for selector in ({"name": "login_form"}, {"id": "loginForm"}):
    try:
        br.select_form(**selector)
    except mechanize.FormNotFoundError:
        continue
    break
else:
    print("Login form not found")
5. LinkNotFoundError
Cause: The specified link cannot be found on the current page.
Solutions:
import mechanize
br = mechanize.Browser()
br.open('https://example.com')

# List every link on the page to see what can be followed.
print("Available links:")
for link in br.links():
    print(f"Text: '{link.text}' | URL: {link.url}")

# Try progressively looser ways of locating the link: exact text first,
# then a text pattern, then a URL pattern. The first match is followed.
link_queries = (
    {"text": "Login"},
    {"text_regex": r".*[Ll]ogin.*"},
    {"url_regex": r".*/login.*"},
)
for query in link_queries:
    try:
        br.follow_link(**query)
    except mechanize.LinkNotFoundError:
        continue
    break
else:
    print("Login link not found")
State and Content Errors
6. BrowserStateError: not viewing HTML
Cause: Attempting HTML operations on non-HTML responses (images, JSON, etc.).
Solutions:
import mechanize
br = mechanize.Browser()
response = br.open('https://api.example.com/data.json')

# Inspect the declared content type before choosing how to process the body.
content_type = response.info().get('Content-Type', '').lower()
print(f"Content type: {content_type}")

is_html = 'text/html' in content_type
is_json = 'application/json' in content_type

if is_html:
    # HTML helpers such as forms() and links() are only used in this branch.
    forms = list(br.forms())
    links = list(br.links())
elif is_json:
    # JSON payload: parse the body directly instead of using HTML helpers.
    import json
    data = json.loads(response.read())
    print(data)
else:
    # Anything else is kept as raw bytes.
    print(f"Received {content_type}, not HTML")
    raw_content = response.read()
SSL and Security Errors
7. SSL Certificate Verification Errors
Cause: Invalid, expired, or self-signed SSL certificates.
Solutions:
import mechanize
import ssl
import urllib.request
# Option 1: Create unverified SSL context (use with caution)
def create_browser_with_ssl_bypass():
    """Return a mechanize Browser that does not verify SSL certificates.

    Use with caution: without verification the connection is open to
    man-in-the-middle attacks, so restrict this to hosts you trust
    (e.g. internal servers with self-signed certificates).

    Returns:
        A ``mechanize.Browser`` configured with an unverified SSL context.
    """
    br = mechanize.Browser()

    # Build a context with verification switched off. check_hostname has to
    # be disabled first; verify_mode cannot be set to CERT_NONE while it is on.
    ssl_context = ssl.create_default_context()
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE

    # Hand the context to mechanize through its own API. Adding a stdlib
    # urllib HTTPSHandler leaves mechanize's HTTPS handler in place, so the
    # unverified context may never be used.
    br.set_ca_data(context=ssl_context)
    return br
# Option 2: Handle SSL errors gracefully
import urllib.error
import ssl

br = mechanize.Browser()
try:
    response = br.open('https://self-signed-example.com')
except urllib.error.URLError as e:
    # The underlying failure is carried in e.reason. Checking its type is
    # more reliable than searching the error message for "SSL".
    if isinstance(e.reason, ssl.SSLError):
        print("SSL certificate error. Consider using a different approach.")
        # Implement alternative solution
    else:
        raise
Encoding and Text Processing Errors
8. Encoding and Unicode Errors
Cause: Mismatched character encodings between the website and your script.
Solutions:
import mechanize
from urllib.error import URLError
# Browser instance passed to safe_open_with_encoding in the usage example.
br = mechanize.Browser()
def _charset_from_content_type(content_type):
    """Return the charset parameter of a Content-Type header value, or None."""
    for param in content_type.split(';')[1:]:
        key, _, value = param.partition('=')
        if key.strip().lower() == 'charset':
            # Drop optional surrounding quotes, e.g. charset="utf-8".
            return value.strip().strip('"\'') or None
    return None


def safe_open_with_encoding(browser, url):
    """Open ``url`` and return its body decoded to ``str``.

    The charset declared in the Content-Type header is tried first
    (defaulting to UTF-8 when none is declared), followed by a list of
    common fallback encodings.

    Args:
        browser: Object exposing ``open(url)`` whose result provides
            ``info()`` (header mapping) and ``read()`` (body bytes).
        url: Address to open.

    Returns:
        The decoded text, or ``None`` if opening the URL raised ``URLError``.
    """
    try:
        response = browser.open(url)
        content_type = response.info().get('Content-Type', '')
        raw_content = response.read()
    except URLError as e:
        print(f"Error opening URL: {e}")
        return None

    encoding = _charset_from_content_type(content_type) or 'utf-8'
    try:
        return raw_content.decode(encoding)
    except (UnicodeDecodeError, LookupError):
        # LookupError covers a declared charset Python does not know;
        # either way, fall through to the common encodings below.
        pass

    # iso-8859-1 maps every byte and therefore never fails, so it must come
    # last or the encodings after it would never be tried.
    for fallback_encoding in ('utf-8', 'windows-1252', 'iso-8859-1'):
        try:
            decoded_content = raw_content.decode(fallback_encoding)
        except UnicodeDecodeError:
            continue
        print(f"Successfully decoded with {fallback_encoding}")
        return decoded_content

    # Defensive last resort; not expected to be reached because iso-8859-1
    # accepts any byte sequence.
    print("Decoded with errors ignored")
    return raw_content.decode('utf-8', errors='ignore')
# Usage
content = safe_open_with_encoding(br, 'https://example.com')
# Failure is signalled by None. An empty string is still a successfully
# fetched (empty) page, so compare against None rather than truthiness.
if content is not None:
    print("Successfully retrieved and decoded content")
Advanced Debugging Techniques
Comprehensive Error Handling and Debugging
import mechanize
import logging
from urllib.error import HTTPError, URLError
# Enable detailed logging: configure the root logger at DEBUG level.
logging.basicConfig(level=logging.DEBUG)
def create_debug_browser():
    """Build a mechanize Browser with all debug output switched on.

    Returns:
        A ``mechanize.Browser`` with HTTP, redirect and response debugging
        enabled, robots.txt handling disabled, and http-equiv, Referer and
        redirect handling enabled.
    """
    browser = mechanize.Browser()

    # Switch on each debug channel.
    for enable_debug in (
        browser.set_debug_http,
        browser.set_debug_redirects,
        browser.set_debug_responses,
    ):
        enable_debug(True)

    # Browser behaviour: no robots.txt handling, but honour http-equiv
    # headers, send Referer and follow redirects.
    browser.set_handle_robots(False)
    for enable_handler in (
        browser.set_handle_equiv,
        browser.set_handle_referer,
        browser.set_handle_redirect,
    ):
        enable_handler(True)

    return browser
def robust_page_interaction(url, form_data=None):
    """Open ``url`` with a debug browser, describe the page, optionally submit a form.

    Args:
        url: Address of the page to open.
        form_data: Optional mapping of form field name to value. When given
            and the page has at least one form, the first form is filled in
            with these values and submitted.

    Returns:
        The browser on success, or ``None`` if any error occurred. Errors
        are printed rather than re-raised.
    """
    br = create_debug_browser()
    try:
        # Open page
        print(f"Opening: {url}")
        response = br.open(url)
        print(f"Response code: {response.code}")
        print(f"Response headers: {response.info()}")

        # Debug page content
        print("\n=== PAGE ANALYSIS ===")
        print(f"Title: {br.title()}")
        print(f"URL after redirects: {br.geturl()}")

        # List all forms
        forms = list(br.forms())
        print(f"\nFound {len(forms)} forms:")
        for i, form in enumerate(forms):
            print(f" Form {i}: {form.name} (action: {form.action})")

        # List all links
        links = list(br.links())
        print(f"\nFound {len(links)} links:")
        for link in links[:5]:  # Show first 5 links
            print(f" '{link.text}' -> {link.url}")

        # Handle form submission if data provided
        if form_data and forms:
            br.select_form(nr=0)
            for field, value in form_data.items():
                try:
                    br[field] = value
                except Exception as e:
                    # Best effort: a field that cannot be set is reported
                    # and skipped so the remaining fields are still filled.
                    print(f"Could not set {field}: {e}")
                else:
                    # Only the field name is printed; values may be secrets
                    # such as passwords and must not end up in debug output.
                    print(f"Set {field}")
            br.submit()
            print(f"Form submitted. New URL: {br.geturl()}")
        return br
    except HTTPError as e:
        # HTTPError is handled before URLError because it is a subclass of it.
        print(f"HTTP Error {e.code}: {e.reason}")
        if hasattr(e, 'read'):
            print(f"Error response: {e.read()}")
        return None
    except URLError as e:
        print(f"URL Error: {e.reason}")
        return None
    except Exception as e:
        # Top-level boundary of this debugging helper: report and give up.
        print(f"Unexpected error: {type(e).__name__}: {e}")
        return None
# Usage example: open the login page and, if it has a form, fill the first
# one with these field values and submit it.
browser = robust_page_interaction(
'https://example.com/login',
{'username': 'user', 'password': 'pass'}
)
Best Practices for Error Prevention
- Always use try-except blocks for network operations
- Implement retry logic with exponential backoff for transient errors
- Set appropriate timeouts to avoid hanging requests (for example, `br.open(url, timeout=10)`)
- Use browser simulation headers to avoid detection
- Check response content types before processing
- Enable debugging during development to understand request/response flow
- Handle encoding issues proactively with fallback mechanisms
- Validate URLs and form fields before attempting operations
By following these solutions and best practices, you can handle most common Mechanize errors effectively and build robust web scraping applications.