When building defensive web scraping tools for legitimate research or security analysis, mimicking human browsing behavior helps avoid triggering anti-bot systems. Here are professional techniques for creating robust, respectful scrapers:
Essential HTTP Headers
1. User-Agent Rotation
Rotate realistic User-Agent strings to simulate different browsers and devices:
import requests
import random

# Realistic User-Agent pool covering common browsers and platforms
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
]

headers = {
    'User-Agent': random.choice(user_agents),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}

response = requests.get('https://www.homegate.ch/', headers=headers)
2. Complete Header Simulation
Include all typical browser headers:
def get_browser_headers():
    return {
        'User-Agent': random.choice(user_agents),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9,de;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Cache-Control': 'max-age=0',
    }
Timing and Rate Limiting
3. Human-Like Request Patterns
Implement variable delays that mimic human reading patterns:
import time
import random
from typing import Tuple

def human_delay(base_delay: float = 2.0, variance: float = 1.5) -> None:
    """Simulate human reading time with realistic variance."""
    delay = random.uniform(base_delay - variance, base_delay + variance)
    delay = max(0.5, delay)  # never sleep less than 0.5 seconds
    time.sleep(delay)

def page_reading_delay(content_length: int) -> None:
    """Adjust delay based on content length (simulate reading time)."""
    base_time = 2.0
    reading_time = min(10.0, content_length / 1000)  # ~1 ms per character, capped at 10 s
    total_delay = base_time + reading_time + random.uniform(0, 2)
    time.sleep(total_delay)
4. Adaptive Rate Limiting
Monitor response times and adjust request frequency:
class AdaptiveRateLimiter:
    def __init__(self, base_delay: float = 1.0, max_delay: float = 10.0):
        self.base_delay = base_delay
        self.max_delay = max_delay
        self.current_delay = base_delay
        self.consecutive_errors = 0

    def wait(self):
        time.sleep(self.current_delay + random.uniform(0, 0.5))

    def adjust_for_response(self, response_time: float, status_code: int):
        if status_code == 429:  # Too Many Requests: back off exponentially
            self.current_delay = min(self.max_delay, self.current_delay * 2)
            self.consecutive_errors += 1
        elif 200 <= status_code < 300:
            if self.consecutive_errors > 0:
                self.current_delay = max(self.base_delay, self.current_delay * 0.8)
            self.consecutive_errors = 0
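A minimal usage sketch for the limiter above, assuming the requests import and the get_browser_headers() helper from the earlier snippets; the URL list is purely illustrative:

limiter = AdaptiveRateLimiter()
# Illustrative URLs only -- substitute the pages you are actually permitted to fetch
urls = ['https://www.homegate.ch/', 'https://www.homegate.ch/rent']

for url in urls:
    limiter.wait()  # pause before each request
    response = requests.get(url, headers=get_browser_headers())
    limiter.adjust_for_response(response.elapsed.total_seconds(), response.status_code)
    print(url, response.status_code, f"next delay ~{limiter.current_delay:.1f}s")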
Session Management
5. Persistent Sessions with Cookies
Maintain session state like a real browser:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

class HumanLikeScraper:
    def __init__(self):
        self.session = requests.Session()
        self.setup_session()

    def setup_session(self):
        # Retry transient failures with exponential backoff
        retry_strategy = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)
        # Set persistent browser-like headers
        self.session.headers.update(get_browser_headers())

    def navigate_with_referer(self, url: str, referer: str = None):
        if referer:
            self.session.headers.update({'Referer': referer})
        response = self.session.get(url)
        human_delay()
        return response
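A short usage sketch, assuming the class above; the search path simply reuses the Zurich rental URL from the complete example further down:

scraper = HumanLikeScraper()
home = scraper.navigate_with_referer('https://www.homegate.ch/')
# Follow an internal link, sending the previous page as the Referer
results = scraper.navigate_with_referer(
    'https://www.homegate.ch/rent/real-estate/city-zurich/matching-list',
    referer='https://www.homegate.ch/',
)
print(results.status_code, len(results.text))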
Advanced Behavioral Simulation
6. Browser Automation with Selenium
For JavaScript-heavy sites, use Selenium with human-like interactions:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from typing import Tuple
import time
import random

def setup_human_like_driver():
    options = Options()
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    driver = webdriver.Chrome(options=options)
    # Hide the navigator.webdriver flag that automation normally exposes
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    return driver

def human_like_scroll(driver, pause_time: Tuple[float, float] = (0.5, 2.0)):
    """Simulate human scrolling behavior."""
    total_height = driver.execute_script("return document.body.scrollHeight")
    viewport_height = driver.execute_script("return window.innerHeight")  # available if you want to cap scroll steps
    current_position = 0
    while current_position < total_height:
        # Scroll a random distance, then pause as a reader would
        scroll_distance = random.randint(100, 400)
        current_position += scroll_distance
        driver.execute_script(f"window.scrollTo(0, {current_position});")
        time.sleep(random.uniform(*pause_time))

# Usage example
driver = setup_human_like_driver()
try:
    driver.get("https://www.homegate.ch/")
    human_like_scroll(driver)
    # Simulate clicking with natural mouse movement
    element = driver.find_element(By.CLASS_NAME, "search-button")
    ActionChains(driver).move_to_element(element).pause(0.5).click().perform()
finally:
    driver.quit()
Proxy and IP Management
7. Proxy Rotation with Health Checks
Implement robust proxy management:
import random
import requests
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List

class ProxyManager:
    def __init__(self, proxy_list: List[str]):
        self.proxies = [{'http': proxy, 'https': proxy} for proxy in proxy_list]
        self.healthy_proxies = []
        self.check_proxy_health()

    def check_proxy_health(self):
        """Test proxies and keep only the working ones."""
        def test_proxy(proxy):
            try:
                response = requests.get(
                    'http://httpbin.org/ip',
                    proxies=proxy,
                    timeout=10,
                )
                return proxy if response.status_code == 200 else None
            except requests.RequestException:
                return None

        with ThreadPoolExecutor(max_workers=10) as executor:
            results = executor.map(test_proxy, self.proxies)
            self.healthy_proxies = [p for p in results if p is not None]

    def get_random_proxy(self):
        return random.choice(self.healthy_proxies) if self.healthy_proxies else None
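A brief usage sketch, assuming the class above; the proxy addresses are documentation placeholders (TEST-NET range), not working endpoints:

# Placeholder proxy addresses -- replace with your own authorized pool
manager = ProxyManager(['http://203.0.113.10:8080', 'http://203.0.113.11:8080'])

proxy = manager.get_random_proxy()
if proxy:
    response = requests.get(
        'https://www.homegate.ch/',
        headers=get_browser_headers(),
        proxies=proxy,
        timeout=15,
    )
    print(response.status_code)
else:
    print("No healthy proxies available")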
Complete Example Implementation
import requests
from bs4 import BeautifulSoup
import time
import random
from typing import Dict, List, Optional

class HomegateScraperDefensive:
    def __init__(self):
        self.session = requests.Session()
        self.rate_limiter = AdaptiveRateLimiter()
        self.setup_session()

    def setup_session(self):
        self.session.headers.update(get_browser_headers())

    def scrape_listings(self, search_url: str, max_pages: int = 5) -> List[Dict]:
        """Defensively scrape property listings."""
        listings = []
        for page in range(1, max_pages + 1):
            try:
                # Simulate navigation: the first page appears to come from the
                # homepage, later pages from the search results themselves
                if page == 1:
                    self.session.headers.update({'Referer': 'https://www.homegate.ch/'})
                else:
                    self.session.headers.update({'Referer': search_url})

                # Add the page parameter when paginating
                page_url = f"{search_url}?page={page}" if page > 1 else search_url
                response = self.session.get(page_url)
                self.rate_limiter.adjust_for_response(
                    response.elapsed.total_seconds(),
                    response.status_code,
                )

                if response.status_code != 200:
                    print(f"Error {response.status_code} on page {page}")
                    break

                # Parse listings
                soup = BeautifulSoup(response.content, 'html.parser')
                page_listings = self.extract_listings(soup)
                listings.extend(page_listings)
                print(f"Scraped {len(page_listings)} listings from page {page}")

                # Human-like delay before the next page
                self.rate_limiter.wait()
                page_reading_delay(len(response.text))
            except Exception as e:
                print(f"Error scraping page {page}: {e}")
                time.sleep(5)  # wait longer after an error

        return listings

    def extract_listings(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract listing data from parsed HTML."""
        listings = []
        # Selectors depend on Homegate's current HTML structure;
        # the class names below are a defensive example structure
        for listing_elem in soup.find_all('div', class_=['listing-item', 'property-card']):
            try:
                listing = {
                    'title': self.safe_extract(listing_elem, 'h3, .title', 'text'),
                    'price': self.safe_extract(listing_elem, '.price', 'text'),
                    'location': self.safe_extract(listing_elem, '.location', 'text'),
                    'url': self.safe_extract(listing_elem, 'a', 'href'),
                }
                if listing['title']:  # only keep entries with at least a title
                    listings.append(listing)
            except Exception as e:
                print(f"Error extracting listing: {e}")
                continue
        return listings

    def safe_extract(self, element, selector: str, attribute: str) -> Optional[str]:
        """Safely extract data with a None fallback."""
        try:
            found = element.select_one(selector)
            if found:
                return found.get(attribute) if attribute != 'text' else found.get_text(strip=True)
        except Exception:
            pass
        return None

# Usage example with ethical considerations
if __name__ == "__main__":
    # Always check robots.txt first
    # Always respect rate limits
    # Always have a legitimate use case
    scraper = HomegateScraperDefensive()

    # Example: scrape rental listings in Zurich for market research
    search_url = "https://www.homegate.ch/rent/real-estate/city-zurich/matching-list"
    listings = scraper.scrape_listings(search_url, max_pages=3)

    print(f"Successfully scraped {len(listings)} listings")
    for listing in listings[:5]:  # show the first 5
        print(f"- {listing['title']} | {listing['price']} | {listing['location']}")
Legal and Ethical Guidelines
Critical Compliance Points:
- Check robots.txt: Always review https://www.homegate.ch/robots.txt (a minimal check is sketched after this list)
- Terms of Service: Read and comply with Homegate's ToS
- Rate Limiting: Never overwhelm servers with requests
- Data Usage: Only collect data for legitimate purposes
- Privacy: Respect user privacy and data protection laws (GDPR)
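As referenced above, here is a minimal robots.txt check using Python's standard-library urllib.robotparser; the user agent name and target path are illustrative examples, not values prescribed by Homegate:

from urllib.robotparser import RobotFileParser

robots = RobotFileParser('https://www.homegate.ch/robots.txt')
robots.read()

# Example check: is this path allowed for our (example) user agent?
target = 'https://www.homegate.ch/rent/real-estate/city-zurich/matching-list'
if robots.can_fetch('MarketResearchBot/1.0', target):
    print("Allowed by robots.txt -- proceed with rate-limited requests")
else:
    print("Disallowed by robots.txt -- do not scrape this path")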
Best Practices Summary
- Start Simple: Begin with basic HTTP requests before using browser automation
- Monitor Response: Watch for 429 (rate limit) or 403 (forbidden) responses
- Respect Delays: Implement progressive backoff on errors
- Session Persistence: Maintain cookies and session state
- Error Handling: Gracefully handle failures and network issues
- Logging: Keep detailed logs for debugging and compliance (see the sketch after this list)
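For the logging point above, a minimal sketch of a setup that records each request's outcome for later auditing; the file name, logger name, and example values are arbitrary choices:

import logging

logging.basicConfig(
    filename='scraper_audit.log',
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s',
)
logger = logging.getLogger('homegate_scraper')

# Example: log the outcome of each request alongside the applied delay
logger.info("GET %s -> %s (delay %.1fs)", 'https://www.homegate.ch/', 200, 2.3)
logger.warning("Received 429 -- backing off to %.1fs", 4.0)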
Note: This guide is for educational and defensive security purposes only. Always ensure your scraping activities comply with website terms of service, applicable laws, and ethical guidelines. Use these techniques responsibly for legitimate research, security testing, or business intelligence purposes.