How do I handle CAPTCHAs when scraping websites with Python?
CAPTCHAs (Completely Automated Public Turing tests to tell Computers and Humans Apart) are one of the most common anti-bot measures websites use to prevent automated scraping. While CAPTCHAs are designed to block bots, there are several legitimate strategies you can employ when scraping websites with Python.
Understanding CAPTCHA Types
Before implementing solutions, it's important to understand the different types of CAPTCHAs you might encounter:
1. Image-based CAPTCHAs
- Text CAPTCHAs: Distorted text that needs to be typed
- reCAPTCHA v2: "I'm not a robot" checkbox with image challenges
- Object recognition: "Select all images with cars"
2. Behavioral CAPTCHAs
- reCAPTCHA v3: Invisible scoring based on user behavior
- hCaptcha: Similar to reCAPTCHA but privacy-focused
- Mouse movement tracking: Analyzing human-like cursor patterns
3. Logic-based CAPTCHAs
- Math problems: Simple arithmetic questions (see the parsing sketch after this list)
- Text questions: "What color is the sky?"
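Logic-based CAPTCHAs can sometimes be answered programmatically. Below is a minimal, hypothetical sketch for the arithmetic variety, assuming the question arrives as plain text such as "What is 3 + 4?" (the regex and the supported operators are illustrative assumptions, not a universal format):

```python
import re
import operator

def solve_math_captcha(question):
    """Parse a simple 'X <op> Y' question; return None if it doesn't match."""
    ops = {'+': operator.add, '-': operator.sub, '*': operator.mul}
    match = re.search(r'(\d+)\s*([+\-*])\s*(\d+)', question)
    if not match:
        return None
    a, op, b = match.groups()
    return ops[op](int(a), int(b))

print(solve_math_captcha("What is 3 + 4?"))  # -> 7
```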
Strategy 1: CAPTCHA Avoidance
The best approach is to avoid triggering CAPTCHAs altogether by mimicking human behavior.
Implement Proper Request Headers
```python
import requests
import time
import random
from fake_useragent import UserAgent

class StealthScraper:
    def __init__(self):
        self.session = requests.Session()
        self.ua = UserAgent()

    def get_headers(self):
        return {
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

    def scrape_with_delays(self, urls):
        for url in urls:
            # Random delay between requests
            time.sleep(random.uniform(2, 5))
            response = self.session.get(url, headers=self.get_headers())
            if response.status_code == 200:
                yield response.text
            else:
                print(f"Failed to scrape {url}: {response.status_code}")

# Usage
scraper = StealthScraper()
urls = ['https://example.com/page1', 'https://example.com/page2']
for content in scraper.scrape_with_delays(urls):
    # Process the content
    pass
```
Use Selenium with Human-like Behavior
```python
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
import time
import random

class HumanLikeScraper:
    def __init__(self):
        chrome_options = Options()
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        self.driver = webdriver.Chrome(options=chrome_options)
        # Hide navigator.webdriver on every page load (execute_script would
        # only affect the current page, which is blank at this point)
        self.driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"}
        )

    def human_like_scroll(self):
        """Simulate human scrolling behavior"""
        total_height = self.driver.execute_script("return document.body.scrollHeight")
        current_position = 0
        while current_position < total_height:
            # Random scroll distance
            scroll_distance = random.randint(100, 300)
            current_position += scroll_distance
            self.driver.execute_script(f"window.scrollTo(0, {current_position});")
            # Random pause
            time.sleep(random.uniform(0.5, 1.5))

    def human_like_click(self, element):
        """Simulate human clicking with mouse movement"""
        actions = ActionChains(self.driver)
        # Move to element with slight randomness
        actions.move_to_element_with_offset(
            element,
            random.randint(-5, 5),
            random.randint(-5, 5)
        )
        # Small delay before click
        time.sleep(random.uniform(0.1, 0.3))
        actions.click().perform()

    def scrape_page(self, url):
        self.driver.get(url)
        # Simulate reading time
        time.sleep(random.uniform(3, 7))
        # Human-like scrolling
        self.human_like_scroll()
        return self.driver.page_source

    def close(self):
        self.driver.quit()
```
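A short usage sketch for this class (the URL is a placeholder):

```python
# Usage
scraper = HumanLikeScraper()
try:
    html = scraper.scrape_page('https://example.com/page1')
    # Process the page source here
finally:
    scraper.close()
```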
Strategy 2: Using CAPTCHA Solving Services
When avoidance isn't possible, you can use automated CAPTCHA solving services.
2captcha Integration
```python
import requests
import time
import base64

class TwoCaptchaSolver:
    def __init__(self, api_key):
        self.api_key = api_key
        self.base_url = "http://2captcha.com"

    def solve_image_captcha(self, image_path):
        """Solve image-based CAPTCHA"""
        # Read and encode image
        with open(image_path, 'rb') as f:
            image_data = base64.b64encode(f.read()).decode('utf-8')
        # Submit CAPTCHA
        submit_url = f"{self.base_url}/in.php"
        submit_data = {
            'method': 'base64',
            'key': self.api_key,
            'body': image_data
        }
        response = requests.post(submit_url, data=submit_data)
        if response.text.startswith('OK|'):
            captcha_id = response.text.split('|')[1]
            return self._get_result(captcha_id)
        else:
            raise Exception(f"Failed to submit CAPTCHA: {response.text}")

    def solve_recaptcha_v2(self, site_key, page_url):
        """Solve reCAPTCHA v2"""
        submit_url = f"{self.base_url}/in.php"
        submit_data = {
            'method': 'userrecaptcha',
            'key': self.api_key,
            'googlekey': site_key,
            'pageurl': page_url
        }
        response = requests.post(submit_url, data=submit_data)
        if response.text.startswith('OK|'):
            captcha_id = response.text.split('|')[1]
            return self._get_result(captcha_id)
        else:
            raise Exception(f"Failed to submit reCAPTCHA: {response.text}")

    def _get_result(self, captcha_id):
        """Poll for CAPTCHA solution"""
        result_url = f"{self.base_url}/res.php"
        for _ in range(30):  # Wait up to 5 minutes
            time.sleep(10)
            response = requests.get(result_url, params={
                'key': self.api_key,
                'action': 'get',
                'id': captcha_id
            })
            # 'CAPCHA_NOT_READY' is the literal string the 2captcha API returns
            if response.text == 'CAPCHA_NOT_READY':
                continue
            elif response.text.startswith('OK|'):
                return response.text.split('|')[1]
            else:
                raise Exception(f"CAPTCHA solving failed: {response.text}")
        raise Exception("CAPTCHA solving timeout")
```
```python
# Usage example with Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

def scrape_with_captcha_solving():
    solver = TwoCaptchaSolver('YOUR_API_KEY')
    driver = webdriver.Chrome()
    try:
        driver.get('https://example.com/protected-page')
        # Check if CAPTCHA is present
        try:
            captcha_element = driver.find_element(By.CLASS_NAME, 'g-recaptcha')
            site_key = captcha_element.get_attribute('data-sitekey')
            # Solve CAPTCHA
            solution = solver.solve_recaptcha_v2(site_key, driver.current_url)
            # Inject solution into the hidden response textarea
            driver.execute_script(f'''
                document.getElementById("g-recaptcha-response").innerHTML = "{solution}";
                document.getElementById("g-recaptcha-response").style.display = "block";
            ''')
            # Submit form
            submit_button = driver.find_element(By.XPATH, "//input[@type='submit']")
            submit_button.click()
        except NoSuchElementException:
            print("No CAPTCHA found, proceeding normally")
        # Continue with scraping
        return driver.page_source
    finally:
        driver.quit()
```
Strategy 3: Using Residential Proxies and IP Rotation
Rotating IP addresses can help avoid CAPTCHA triggers by spreading request volume across many addresses, so no single IP exceeds a site's rate thresholds.
```python
import requests
import random
import time

class ProxyRotator:
    def __init__(self, proxy_list):
        self.proxies = proxy_list
        self.current_proxy_index = 0

    def get_next_proxy(self):
        """Get next proxy in rotation"""
        proxy = self.proxies[self.current_proxy_index]
        self.current_proxy_index = (self.current_proxy_index + 1) % len(self.proxies)
        # The dict keys refer to the target URL's scheme; the values are the
        # proxy URLs. A plain HTTP proxy is reached over http:// even when
        # tunneling https traffic.
        return {
            'http': f'http://{proxy}',
            'https': f'http://{proxy}'
        }

    def scrape_with_rotation(self, urls):
        session = requests.Session()
        for url in urls:
            proxy = self.get_next_proxy()
            try:
                response = session.get(
                    url,
                    proxies=proxy,
                    timeout=10,
                    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
                )
                if response.status_code == 200:
                    yield response.text
            except requests.RequestException as e:
                print(f"Error with proxy {proxy}: {e}")
                continue
            # Delay between requests
            time.sleep(random.uniform(1, 3))

# Usage
proxy_list = [
    'proxy1.example.com:8080',
    'proxy2.example.com:8080',
    'proxy3.example.com:8080'
]
rotator = ProxyRotator(proxy_list)
urls = ['https://example.com/page1', 'https://example.com/page2']
for content in rotator.scrape_with_rotation(urls):
    # Process scraped content
    pass
```
Strategy 4: Browser Automation with CAPTCHA Handling
For complex scenarios, you might need to combine browser automation with manual intervention or solving services.
```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import time

class AdvancedCaptchaHandler:
    def __init__(self):
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 10)

    def handle_recaptcha_v2(self, solver_api_key):
        """Handle reCAPTCHA v2 with automated solving"""
        try:
            # Wait for the reCAPTCHA iframe to load
            self.wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "iframe[src*='recaptcha']"))
            )
            # Get site key
            site_key = self.driver.find_element(
                By.CSS_SELECTOR, "[data-sitekey]"
            ).get_attribute("data-sitekey")
            # Use solving service (example with 2captcha)
            solver = TwoCaptchaSolver(solver_api_key)
            solution = solver.solve_recaptcha_v2(site_key, self.driver.current_url)
            # Inject solution and fire the client callback if one is registered.
            # Note: the callback's location inside ___grecaptcha_cfg varies
            # between sites, so this injection may need adjusting.
            self.driver.execute_script(f'''
                document.getElementById("g-recaptcha-response").innerHTML = "{solution}";
                if (typeof ___grecaptcha_cfg !== 'undefined') {{
                    for (var cid in ___grecaptcha_cfg.clients) {{
                        ___grecaptcha_cfg.clients[cid].callback('{solution}');
                    }}
                }}
            ''')
            return True
        except Exception as e:
            print(f"Failed to handle reCAPTCHA: {e}")
            return False

    def handle_hcaptcha(self, solver_api_key):
        """Handle hCaptcha challenges"""
        try:
            # Detect hCaptcha
            hcaptcha_element = self.driver.find_element(
                By.CSS_SELECTOR, ".h-captcha"
            )
            site_key = hcaptcha_element.get_attribute("data-sitekey")
            # Solve using service (implementation depends on service)
            # This is a simplified example
            print(f"hCaptcha detected with site key: {site_key}")
            # Wait for manual solving or implement automated solution
            input("Please solve the hCaptcha manually and press Enter to continue...")
            return True
        except NoSuchElementException:
            return False

    def scrape_protected_site(self, url, solver_api_key=None):
        """Scrape a site that may have CAPTCHAs"""
        self.driver.get(url)
        # Check for various CAPTCHA types
        captcha_solved = False
        # Check for reCAPTCHA v2
        if self._element_exists("iframe[src*='recaptcha']"):
            print("reCAPTCHA v2 detected")
            if solver_api_key:
                captcha_solved = self.handle_recaptcha_v2(solver_api_key)
            else:
                input("Please solve the reCAPTCHA manually and press Enter...")
                captcha_solved = True
        # Check for hCaptcha
        elif self._element_exists(".h-captcha"):
            print("hCaptcha detected")
            captcha_solved = self.handle_hcaptcha(solver_api_key)
        # If CAPTCHA was solved or not present, continue scraping
        if captcha_solved or not self._has_captcha():
            return self.driver.page_source
        else:
            print("Could not solve CAPTCHA")
            return None

    def _element_exists(self, selector):
        """Check if element exists"""
        try:
            self.driver.find_element(By.CSS_SELECTOR, selector)
            return True
        except NoSuchElementException:
            return False

    def _has_captcha(self):
        """Check if page contains any known CAPTCHA"""
        captcha_indicators = [
            "iframe[src*='recaptcha']",
            ".h-captcha",
            ".captcha",
            "[data-sitekey]"
        ]
        for indicator in captcha_indicators:
            if self._element_exists(indicator):
                return True
        return False

    def close(self):
        self.driver.quit()
```
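A brief usage sketch (the URL and API key are placeholders; `TwoCaptchaSolver` is the class from Strategy 2):

```python
# Usage
handler = AdvancedCaptchaHandler()
try:
    html = handler.scrape_protected_site(
        'https://example.com/protected-page',
        solver_api_key='YOUR_API_KEY'  # omit to fall back to manual solving
    )
    if html:
        print("Page retrieved successfully")
finally:
    handler.close()
```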
Strategy 5: API-Based Solutions
Sometimes, the most effective approach is to use a professional web scraping API that handles CAPTCHAs automatically.
```python
import requests

class WebScrapingAPI:
    def __init__(self, api_key):
        self.api_key = api_key
        self.base_url = "https://api.webscraping.ai/html"

    def scrape_with_captcha_handling(self, url, **kwargs):
        """Scrape URL with automatic CAPTCHA handling"""
        params = {
            'api_key': self.api_key,
            'url': url,
            'js': 'true',  # Enable JavaScript rendering
            'proxy': 'datacenter',  # Use datacenter proxy
            **kwargs
        }
        response = requests.get(self.base_url, params=params)
        if response.status_code == 200:
            return response.text
        else:
            raise Exception(f"API request failed: {response.status_code} - {response.text}")

# Usage
api = WebScrapingAPI('your_api_key')
html_content = api.scrape_with_captcha_handling('https://example.com/protected-page')
```
Best Practices and Legal Considerations
Ethical Scraping Guidelines
- Respect robots.txt: Always check and follow the website's robots.txt file (see the checker sketch after this list)
- Rate limiting: Don't overwhelm servers with rapid requests
- Terms of service: Review and comply with website terms of service
- Data usage: Use scraped data responsibly and in compliance with privacy laws
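Python's standard library can enforce the first two guidelines directly. Here is a minimal sketch using `urllib.robotparser` with a fixed polite delay (the user agent string and delay value are assumptions to adapt to your project):

```python
import time
import urllib.robotparser
from urllib.parse import urlparse

import requests

USER_AGENT = 'MyScraperBot/1.0'  # identify your bot honestly
CRAWL_DELAY = 2  # seconds between requests; adjust per site

def fetch_if_allowed(url):
    """Fetch a URL only if robots.txt permits it for our user agent."""
    parsed = urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    if not rp.can_fetch(USER_AGENT, url):
        print(f"robots.txt disallows {url}")
        return None
    time.sleep(CRAWL_DELAY)  # basic rate limiting
    return requests.get(url, headers={'User-Agent': USER_AGENT})

response = fetch_if_allowed('https://example.com/page1')
```

In a real crawler you would cache the parsed robots.txt per host rather than re-fetching it on every request.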
Performance Optimization
```python
import asyncio
import aiohttp

async def async_scrape_with_session(session, url):
    """Asynchronous scraping for better performance"""
    async with session.get(url) as response:
        return await response.text()

async def scrape_multiple_urls(urls):
    """Scrape multiple URLs concurrently"""
    async with aiohttp.ClientSession() as session:
        tasks = [async_scrape_with_session(session, url) for url in urls]
        results = await asyncio.gather(*tasks)
        return results

# Usage
urls = ['https://example.com/page1', 'https://example.com/page2']
results = asyncio.run(scrape_multiple_urls(urls))
```
Monitoring and Error Handling
```python
import logging
import requests
from datetime import datetime

class CaptchaAwareSpider:
    def __init__(self):
        self.setup_logging()
        self.captcha_encounters = 0
        self.success_rate = 0

    def setup_logging(self):
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('scraping.log'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

    def scrape_with_monitoring(self, url):
        """Scrape with comprehensive monitoring"""
        start_time = datetime.now()
        try:
            # Your scraping logic here
            response = requests.get(url)
            # Check for CAPTCHA indicators in response
            if self._contains_captcha(response.text):
                self.captcha_encounters += 1
                self.logger.warning(f"CAPTCHA detected on {url}")
                return None
            duration = (datetime.now() - start_time).total_seconds()
            self.logger.info(f"Successfully scraped {url} in {duration}s")
            return response.text
        except Exception as e:
            self.logger.error(f"Failed to scrape {url}: {e}")
            return None

    def _contains_captcha(self, html):
        """Detect CAPTCHA presence in HTML"""
        captcha_keywords = [
            'recaptcha', 'hcaptcha', 'captcha',
            'prove you are human', 'verification required'
        ]
        html_lower = html.lower()
        return any(keyword in html_lower for keyword in captcha_keywords)
```
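A short usage sketch:

```python
# Usage
spider = CaptchaAwareSpider()
html = spider.scrape_with_monitoring('https://example.com/page1')
if html is None:
    print(f"CAPTCHA encounters so far: {spider.captcha_encounters}")
```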
Browser Automation Tools Comparison
When dealing with JavaScript-heavy sites that present CAPTCHAs, you might consider using different browser automation tools. While Python's Selenium is powerful, there are other options worth exploring for specific scenarios:
Using Puppeteer with Python
Although Puppeteer is primarily a Node.js library, you can integrate it with Python through pyppeteer:
```python
import asyncio
from pyppeteer import launch

class PuppeteerScraper:
    async def scrape_with_puppeteer(self, url):
        # headless=False so a human can step in if a CAPTCHA appears
        browser = await launch(headless=False)
        page = await browser.newPage()
        # Set user agent
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
        await page.goto(url)
        # Wait for potential CAPTCHAs to load
        await page.waitFor(2000)
        # Check for CAPTCHA presence
        recaptcha_present = await page.evaluate('''
            () => document.querySelector("iframe[src*='recaptcha']") !== null
        ''')
        if recaptcha_present:
            print("CAPTCHA detected - manual intervention required")
            # Pause for manual solving
            input("Solve CAPTCHA manually and press Enter...")
        content = await page.content()
        await browser.close()
        return content

# Usage
async def main():
    scraper = PuppeteerScraper()
    content = await scraper.scrape_with_puppeteer('https://example.com')
    print(content)

# Run the scraper
asyncio.run(main())
```
For more advanced browser automation techniques, you might find it helpful to understand how to handle authentication in Puppeteer or learn about handling pop-ups and modals in Puppeteer when dealing with complex CAPTCHA scenarios.
Advanced CAPTCHA Detection
```python
import cv2
import numpy as np
from PIL import Image
import io
import base64
from selenium import webdriver

class CaptchaDetector:
    def __init__(self):
        self.common_captcha_patterns = [
            'g-recaptcha',
            'h-captcha',
            'captcha-container',
            'verification-challenge'
        ]

    def detect_visual_captcha(self, screenshot_data):
        """Detect CAPTCHA using computer vision"""
        # Convert base64 to image
        image_data = base64.b64decode(screenshot_data)
        image = Image.open(io.BytesIO(image_data))
        # Convert to OpenCV format
        opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
        # Look for CAPTCHA-like patterns
        # This is a simplified example - real implementation would be more complex
        gray = cv2.cvtColor(opencv_image, cv2.COLOR_BGR2GRAY)
        # Binarize first: findContours works on binary images
        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        # Detect regions whose shape resembles a CAPTCHA widget
        contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        for contour in contours:
            x, y, w, h = cv2.boundingRect(contour)
            aspect_ratio = w / h
            # Typical CAPTCHA dimensions
            if 2 < aspect_ratio < 8 and w > 100 and h > 30:
                return True, (x, y, w, h)
        return False, None

    def detect_html_captcha(self, html_content):
        """Detect CAPTCHA in HTML content"""
        html_lower = html_content.lower()
        for pattern in self.common_captcha_patterns:
            if pattern in html_lower:
                return True, pattern
        # Check for common CAPTCHA-related keywords
        captcha_keywords = [
            'solve the puzzle',
            'verify you are human',
            'complete the challenge',
            'security check'
        ]
        for keyword in captcha_keywords:
            if keyword in html_lower:
                return True, keyword
        return False, None

# Usage with Selenium
def enhanced_captcha_detection():
    detector = CaptchaDetector()
    driver = webdriver.Chrome()
    try:
        driver.get('https://example.com')
        # Take screenshot for visual detection
        screenshot = driver.get_screenshot_as_base64()
        visual_captcha, coords = detector.detect_visual_captcha(screenshot)
        # Check HTML for CAPTCHA elements
        html_captcha, pattern = detector.detect_html_captcha(driver.page_source)
        if visual_captcha or html_captcha:
            print(f"CAPTCHA detected - Visual: {visual_captcha}, HTML: {html_captcha}")
            if pattern:
                print(f"Pattern found: {pattern}")
            if coords:
                print(f"Visual CAPTCHA coordinates: {coords}")
            # Handle CAPTCHA accordingly (handle_detected_captcha is a
            # placeholder for your own handling logic)
            return handle_detected_captcha(driver)
        else:
            print("No CAPTCHA detected")
            return driver.page_source
    finally:
        driver.quit()
```
Conclusion
Handling CAPTCHAs in Python web scraping requires a multi-faceted approach. The key strategies include:
- Prevention first: Use human-like behavior patterns and proper request handling
- Automated solving: Integrate with CAPTCHA solving services for persistent challenges
- Technical solutions: Employ proxy rotation and advanced browser automation
- Professional tools: Consider using specialized APIs that handle CAPTCHAs automatically
When implementing these solutions, always ensure you're scraping ethically and in compliance with website terms of service and applicable laws. For complex scenarios or large-scale operations, professional web scraping services often provide the most reliable and efficient solution.
Remember that while these techniques can help handle CAPTCHAs, they should be used responsibly and in accordance with the website's robots.txt file and terms of service. Consider reaching out to website owners for API access when available, as this is often the most sustainable approach for long-term data collection needs.