⚠️ Important Legal Disclaimer
Before attempting to scrape Zillow, you must understand the legal implications:
- Zillow's Terms of Service explicitly prohibit automated data collection
- robots.txt restrictions at https://www.zillow.com/robots.txt may block scraping
- Legal consequences can include cease-and-desist orders, IP bans, or lawsuits
- Rate limiting violations can be considered denial-of-service attacks
Always check for official APIs first and consider alternatives like Zillow's Partner API for commercial use.
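You can check what robots.txt permits programmatically before writing any scraper. Here is a minimal sketch using Python's standard urllib.robotparser module; the user agent string and the example URL are illustrative placeholders:
from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url("https://www.zillow.com/robots.txt")
rp.read()

# "MyScraper" is a placeholder user agent; substitute your own
print(rp.can_fetch("MyScraper", "https://www.zillow.com/seattle-wa/"))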
Why Zillow is Challenging to Scrape
Zillow implements several anti-scraping measures:
- Dynamic content loading via JavaScript
- CAPTCHA challenges for suspicious activity
- IP-based rate limiting and blocking
- Frequent HTML structure changes
- Bot detection systems
Method 1: Python with Requests + BeautifulSoup
Installation
pip install requests beautifulsoup4 lxml
Basic Implementation
import requests
from bs4 import BeautifulSoup
import time
import random
from urllib.parse import urljoin

class ZillowScraper:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

    def search_properties(self, location, max_pages=3):
        """Search for properties in a specific location"""
        base_url = f"https://www.zillow.com/{location}/"
        properties = []
        for page in range(1, max_pages + 1):
            try:
                # Add a random delay between requests
                time.sleep(random.uniform(2, 5))
                url = f"{base_url}{page}_p/" if page > 1 else base_url
                response = self.session.get(url, timeout=10)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.content, 'html.parser')
                    page_properties = self.extract_properties(soup)
                    properties.extend(page_properties)
                    print(f"Page {page}: Found {len(page_properties)} properties")
                else:
                    print(f"Failed to fetch page {page}: {response.status_code}")
            except Exception as e:
                print(f"Error scraping page {page}: {e}")
        return properties

    def extract_properties(self, soup):
        """Extract property data from a BeautifulSoup object"""
        properties = []
        # Note: these selectors are examples and may not match Zillow's current markup
        property_cards = soup.find_all('article', {'data-test': 'property-card'})
        for card in property_cards:
            try:
                property_data = {
                    'price': self.safe_extract(card, '[data-test="property-card-price"]'),
                    'address': self.safe_extract(card, '[data-test="property-card-addr"]'),
                    'beds': self.safe_extract(card, '[data-test="property-card-beds"]'),
                    'baths': self.safe_extract(card, '[data-test="property-card-baths"]'),
                    'sqft': self.safe_extract(card, '[data-test="property-card-sqft"]'),
                    'property_type': self.safe_extract(card, '[data-test="property-card-type"]'),
                    'link': self.extract_property_link(card)
                }
                # Only keep cards with the essential fields
                if property_data['price'] and property_data['address']:
                    properties.append(property_data)
            except Exception as e:
                print(f"Error extracting property: {e}")
                continue
        return properties

    def safe_extract(self, element, selector):
        """Safely extract text from an element"""
        try:
            found = element.select_one(selector)
            return found.get_text(strip=True) if found else None
        except Exception:
            return None

    def extract_property_link(self, card):
        """Extract the property detail page link"""
        try:
            link_element = card.find('a', href=True)
            if link_element:
                return urljoin('https://www.zillow.com', link_element['href'])
        except Exception:
            pass
        return None

# Usage example
if __name__ == "__main__":
    scraper = ZillowScraper()
    # Search for properties in Seattle (adjust the location slug as needed)
    properties = scraper.search_properties("seattle-wa", max_pages=2)
    print(f"\nFound {len(properties)} properties:")
    for prop in properties[:5]:  # Show the first 5
        print(f"Price: {prop['price']}")
        print(f"Address: {prop['address']}")
        print(f"Beds/Baths: {prop['beds']}/{prop['baths']}")
        print(f"Link: {prop['link']}")
        print("-" * 50)
Method 2: JavaScript with Puppeteer
Installation
npm install puppeteer puppeteer-extra puppeteer-extra-plugin-stealth
Enhanced Implementation
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');

// Use the stealth plugin to reduce the chance of detection
puppeteer.use(StealthPlugin());

class ZillowScraper {
    constructor() {
        this.browser = null;
        this.page = null;
    }

    async init() {
        this.browser = await puppeteer.launch({
            headless: true,
            args: [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-blink-features=AutomationControlled',
                '--disable-features=VizDisplayCompositor'
            ]
        });
        this.page = await this.browser.newPage();

        // Set a realistic viewport
        await this.page.setViewport({ width: 1366, height: 768 });

        // Set a user agent
        await this.page.setUserAgent(
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        );

        // Block images and stylesheets to speed up scraping
        await this.page.setRequestInterception(true);
        this.page.on('request', (req) => {
            if (req.resourceType() === 'image' || req.resourceType() === 'stylesheet') {
                req.abort();
            } else {
                req.continue();
            }
        });
    }

    async searchProperties(location, maxPages = 3) {
        if (!this.page) await this.init();
        const allProperties = [];
        for (let page = 1; page <= maxPages; page++) {
            try {
                // Page 1 has no "_p" suffix, matching the Python implementation above
                const url = page > 1
                    ? `https://www.zillow.com/${location}/${page}_p/`
                    : `https://www.zillow.com/${location}/`;
                console.log(`Scraping page ${page}: ${url}`);
                await this.page.goto(url, {
                    waitUntil: 'networkidle2',
                    timeout: 30000
                });

                // Wait for listings to load
                await this.page.waitForSelector('[data-test="property-card"]', { timeout: 10000 });

                // Extract property data
                const properties = await this.page.evaluate(() => {
                    const cards = document.querySelectorAll('[data-test="property-card"]');
                    const results = [];
                    cards.forEach(card => {
                        try {
                            const property = {
                                price: card.querySelector('[data-test="property-card-price"]')?.textContent?.trim(),
                                address: card.querySelector('[data-test="property-card-addr"]')?.textContent?.trim(),
                                beds: card.querySelector('[data-test="property-card-beds"]')?.textContent?.trim(),
                                baths: card.querySelector('[data-test="property-card-baths"]')?.textContent?.trim(),
                                sqft: card.querySelector('[data-test="property-card-sqft"]')?.textContent?.trim(),
                                link: card.querySelector('a')?.href
                            };
                            if (property.price && property.address) {
                                results.push(property);
                            }
                        } catch (e) {
                            console.log('Error extracting property:', e);
                        }
                    });
                    return results;
                });

                allProperties.push(...properties);
                console.log(`Found ${properties.length} properties on page ${page}`);

                // Random delay between pages (page.waitForTimeout was removed in newer Puppeteer versions)
                await new Promise(resolve => setTimeout(resolve, Math.random() * 3000 + 2000));
            } catch (error) {
                console.error(`Error on page ${page}:`, error.message);
            }
        }
        return allProperties;
    }

    async close() {
        if (this.browser) {
            await this.browser.close();
        }
    }
}

// Usage example
(async () => {
    const scraper = new ZillowScraper();
    try {
        const properties = await scraper.searchProperties('seattle-wa', 2);
        console.log(`\nTotal properties found: ${properties.length}`);
        properties.slice(0, 5).forEach((prop, index) => {
            console.log(`\n${index + 1}. ${prop.address}`);
            console.log(`   Price: ${prop.price}`);
            console.log(`   Specs: ${prop.beds} beds, ${prop.baths} baths, ${prop.sqft}`);
            console.log(`   Link: ${prop.link}`);
        });
    } catch (error) {
        console.error('Scraping failed:', error);
    } finally {
        await scraper.close();
    }
})();
Method 3: Using WebScraping.AI
For a more reliable solution, consider using a web scraping API:
import requests

def scrape_with_webscraping_ai(url, api_key):
    """Use WebScraping.AI to scrape Zillow with better success rates"""
    endpoint = "https://api.webscraping.ai/html"
    params = {
        'api_key': api_key,
        'url': url,
        'js': 'true',            # Execute JavaScript
        'proxy': 'residential',  # Use residential proxies
        'device': 'desktop',
        'timeout': 15000
    }
    response = requests.get(endpoint, params=params)
    if response.status_code == 200:
        return response.text
    else:
        raise Exception(f"API request failed: {response.status_code}")

# Usage
# html_content = scrape_with_webscraping_ai(
#     'https://www.zillow.com/seattle-wa/',
#     'your_api_key_here'
# )
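The API returns fully rendered HTML, so it can feed straight into the extraction logic from Method 1. A sketch, assuming the ZillowScraper class defined earlier is in scope:
from bs4 import BeautifulSoup

# html_content = scrape_with_webscraping_ai('https://www.zillow.com/seattle-wa/', 'your_api_key_here')
# soup = BeautifulSoup(html_content, 'html.parser')
# properties = ZillowScraper().extract_properties(soup)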
Best Practices and Tips
1. Avoid Detection
- Rotate User Agents regularly (see the sketch after this list)
- Use residential proxies for different IP addresses
- Add random delays between requests (2-5 seconds)
- Limit concurrent requests (max 1-2 simultaneously)
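A minimal sketch of user-agent rotation combined with random delays; the user agent strings are examples, not an exhaustive pool:
import random
import time
import requests

# Example pool; in practice, maintain a larger, up-to-date list
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
]

def polite_get(session, url):
    """Rotate the User-Agent and pause 2-5 seconds before each request."""
    session.headers['User-Agent'] = random.choice(USER_AGENTS)
    time.sleep(random.uniform(2, 5))
    return session.get(url, timeout=10)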
2. Handle Dynamic Content
- Wait for JavaScript to load content
- Use browser automation (Puppeteer/Selenium) for JS-heavy pages (a Selenium sketch follows this list)
- Monitor network requests to find API endpoints
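To complement the Puppeteer example above, here is a minimal Selenium sketch that waits for JavaScript-rendered listings; the selector is the same assumed data-test attribute used earlier and may not match Zillow's current markup:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
try:
    driver.get('https://www.zillow.com/seattle-wa/')
    # Block until at least one property card is rendered (up to 10 seconds)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '[data-test="property-card"]'))
    )
    html = driver.page_source  # hand off to BeautifulSoup as in Method 1
finally:
    driver.quit()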
3. Error Handling
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def create_session_with_retries():
    session = requests.Session()
    # Configure the retry strategy: up to 3 retries with exponential backoff
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session

# Usage: session = create_session_with_retries()
4. Data Storage
import json
import csv
from datetime import datetime

def save_properties(properties, format='json'):
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    if format == 'json':
        filename = f"zillow_properties_{timestamp}.json"
        with open(filename, 'w') as f:
            json.dump(properties, f, indent=2)
    elif format == 'csv':
        filename = f"zillow_properties_{timestamp}.csv"
        if properties:
            with open(filename, 'w', newline='') as f:
                writer = csv.DictWriter(f, fieldnames=properties[0].keys())
                writer.writeheader()
                writer.writerows(properties)
    print(f"Saved {len(properties)} properties to {filename}")
Alternatives to Consider
- Zillow's Partner API - Official API for approved partners
- Real Estate APIs - RentSpree, Rentals.com, PadMapper
- MLS Data Providers - RETS feeds, Spark API
- Third-party Services - Scrapfly, Bright Data, WebScraping.AI
- Public Records - County assessor databases
Troubleshooting Common Issues
CAPTCHA Challenges
# Detect CAPTCHA presence
def has_captcha(soup):
    captcha_indicators = [
        'captcha', 'recaptcha', 'cf-challenge',
        'anti-bot', 'verification'
    ]
    page_text = soup.get_text().lower()
    return any(indicator in page_text for indicator in captcha_indicators)
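One way to wire this into the Method 1 loop, as a sketch: back off and stop rather than hammering the challenge page (the cool-down duration is an arbitrary example):
# Inside search_properties, after parsing the response:
# soup = BeautifulSoup(response.content, 'html.parser')
# if has_captcha(soup):
#     print(f"CAPTCHA detected on page {page}; backing off")
#     time.sleep(60)  # arbitrary cool-down before retrying or aborting
#     break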
Rate Limiting
import time
from random import uniform

def smart_delay(attempt=0):
    """Exponential backoff with jitter: sleeps ~2s, 4s, 8s... scaled by a random factor"""
    base_delay = 2
    jitter = uniform(0.5, 1.5)
    time.sleep(base_delay * (2 ** attempt) * jitter)
IP Blocking
- Use rotating proxies (see the sketch after this list)
- Implement IP rotation strategies
- Consider residential proxy services
- Monitor response codes (403, 429, 503)
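A minimal proxy-rotation sketch using the requests proxies parameter; the proxy URLs are placeholders for whatever pool your provider supplies:
import random
import requests

# Placeholder pool; substitute real proxy endpoints from your provider
PROXY_POOL = [
    'http://user:pass@proxy1.example.com:8000',
    'http://user:pass@proxy2.example.com:8000',
]

def get_with_proxy(url):
    """Send each request through a randomly chosen proxy."""
    proxy = random.choice(PROXY_POOL)
    return requests.get(url, proxies={'http': proxy, 'https': proxy}, timeout=10)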
Conclusion
While scraping Zillow is technically possible, it comes with significant legal and technical challenges. The most sustainable approach is to:
- Use official APIs when available
- Respect robots.txt and ToS
- Implement proper rate limiting
- Consider professional scraping services
- Have legal counsel review your use case
Remember that Zillow actively combats scraping, and your IP may be blocked permanently if you are detected.