How do I scrape Zillow data without an API?

⚠️ Important Legal Disclaimer

Before attempting to scrape Zillow, you must understand the legal implications:

  • Zillow's Terms of Service explicitly prohibit automated data collection
  • The directives in https://www.zillow.com/robots.txt restrict automated access (you can check them programmatically, as shown below)
  • Legal consequences can include cease-and-desist orders, IP bans, or lawsuits
  • Excessive request rates can be treated as a denial-of-service attack

Always check for official APIs first and consider alternatives like Zillow's Partner API for commercial use.
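
Before writing any scraper, you can verify what robots.txt allows programmatically. A minimal sketch using Python's standard-library urllib.robotparser (note that robots.txt compliance does not substitute for a Terms of Service review):

from urllib.robotparser import RobotFileParser

def is_allowed(url, user_agent="*"):
    """Check whether Zillow's robots.txt permits fetching a given URL."""
    rp = RobotFileParser()
    rp.set_url("https://www.zillow.com/robots.txt")
    rp.read()  # fetches and parses the live robots.txt
    return rp.can_fetch(user_agent, url)

print(is_allowed("https://www.zillow.com/seattle-wa/"))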

Why Zillow is Challenging to Scrape

Zillow implements several anti-scraping measures:

  • Dynamic content loading via JavaScript
  • CAPTCHA challenges for suspicious activity
  • IP-based rate limiting and blocking
  • Frequent HTML structure changes
  • Bot detection systems

Method 1: Python with Requests + BeautifulSoup

Installation

pip install requests beautifulsoup4 lxml

Basic Implementation

import requests
from bs4 import BeautifulSoup
import time
import random
from urllib.parse import urljoin

class ZillowScraper:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

    def search_properties(self, location, max_pages=3):
        """Search for properties in a specific location"""
        base_url = f"https://www.zillow.com/{location}/"
        properties = []

        for page in range(1, max_pages + 1):
            try:
                # Add random delay between requests
                time.sleep(random.uniform(2, 5))

                url = f"{base_url}{page}_p/" if page > 1 else base_url
                response = self.session.get(url, timeout=10)

                if response.status_code == 200:
                    soup = BeautifulSoup(response.content, 'html.parser')
                    page_properties = self.extract_properties(soup)
                    properties.extend(page_properties)
                    print(f"Page {page}: Found {len(page_properties)} properties")
                else:
                    print(f"Failed to fetch page {page}: {response.status_code}")

            except Exception as e:
                print(f"Error scraping page {page}: {e}")

        return properties

    def extract_properties(self, soup):
        """Extract property data from BeautifulSoup object"""
        properties = []

        # Note: These selectors are examples and may not match current Zillow structure
        property_cards = soup.find_all('article', {'data-test': 'property-card'})

        for card in property_cards:
            try:
                property_data = {
                    'price': self.safe_extract(card, '[data-test="property-card-price"]'),
                    'address': self.safe_extract(card, '[data-test="property-card-addr"]'),
                    'beds': self.safe_extract(card, '[data-test="property-card-beds"]'),
                    'baths': self.safe_extract(card, '[data-test="property-card-baths"]'),
                    'sqft': self.safe_extract(card, '[data-test="property-card-sqft"]'),
                    'property_type': self.safe_extract(card, '[data-test="property-card-type"]'),
                    'link': self.extract_property_link(card)
                }

                # Only add if we have essential data
                if property_data['price'] and property_data['address']:
                    properties.append(property_data)

            except Exception as e:
                print(f"Error extracting property: {e}")
                continue

        return properties

    def safe_extract(self, element, selector):
        """Safely extract text from element"""
        try:
            found = element.select_one(selector)
            return found.get_text(strip=True) if found else None
        except Exception:
            return None

    def extract_property_link(self, card):
        """Extract property detail page link"""
        try:
            link_element = card.find('a', href=True)
            if link_element:
                return urljoin('https://www.zillow.com', link_element['href'])
        except Exception:
            pass
        return None

# Usage example
if __name__ == "__main__":
    scraper = ZillowScraper()

    # Search for properties in Seattle (adjust location format as needed)
    properties = scraper.search_properties("seattle-wa", max_pages=2)

    print(f"\nFound {len(properties)} properties:")
    for prop in properties[:5]:  # Show first 5
        print(f"Price: {prop['price']}")
        print(f"Address: {prop['address']}")
        print(f"Beds/Baths: {prop['beds']}/{prop['baths']}")
        print(f"Link: {prop['link']}")
        print("-" * 50)

Method 2: JavaScript with Puppeteer

Installation

npm install puppeteer puppeteer-extra puppeteer-extra-plugin-stealth

Enhanced Implementation

const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');

// Use stealth plugin to avoid detection
puppeteer.use(StealthPlugin());

class ZillowScraper {
    constructor() {
        this.browser = null;
        this.page = null;
    }

    async init() {
        this.browser = await puppeteer.launch({
            headless: true,
            args: [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-blink-features=AutomationControlled',
                '--disable-features=VizDisplayCompositor'
            ]
        });

        this.page = await this.browser.newPage();

        // Set realistic viewport
        await this.page.setViewport({ width: 1366, height: 768 });

        // Set user agent
        await this.page.setUserAgent(
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        );

        // Block images and stylesheets to speed up scraping
        await this.page.setRequestInterception(true);
        this.page.on('request', (req) => {
            if (req.resourceType() === 'image' || req.resourceType() === 'stylesheet') {
                req.abort();
            } else {
                req.continue();
            }
        });
    }

    async searchProperties(location, maxPages = 3) {
        if (!this.page) await this.init();

        const allProperties = [];

        for (let page = 1; page <= maxPages; page++) {
            try {
                const url = page > 1
                    ? `https://www.zillow.com/${location}/${page}_p/`
                    : `https://www.zillow.com/${location}/`;
                console.log(`Scraping page ${page}: ${url}`);

                await this.page.goto(url, { 
                    waitUntil: 'networkidle2',
                    timeout: 30000 
                });

                // Wait for listings to load
                await this.page.waitForSelector('[data-test="property-card"]', { timeout: 10000 });

                // Extract property data
                const properties = await this.page.evaluate(() => {
                    const cards = document.querySelectorAll('[data-test="property-card"]');
                    const results = [];

                    cards.forEach(card => {
                        try {
                            const property = {
                                price: card.querySelector('[data-test="property-card-price"]')?.textContent?.trim(),
                                address: card.querySelector('[data-test="property-card-addr"]')?.textContent?.trim(),
                                beds: card.querySelector('[data-test="property-card-beds"]')?.textContent?.trim(),
                                baths: card.querySelector('[data-test="property-card-baths"]')?.textContent?.trim(),
                                sqft: card.querySelector('[data-test="property-card-sqft"]')?.textContent?.trim(),
                                link: card.querySelector('a')?.href
                            };

                            if (property.price && property.address) {
                                results.push(property);
                            }
                        } catch (e) {
                            console.log('Error extracting property:', e);
                        }
                    });

                    return results;
                });

                allProperties.push(...properties);
                console.log(`Found ${properties.length} properties on page ${page}`);

                // Random delay between pages (page.waitForTimeout was removed in recent Puppeteer releases)
                await new Promise(resolve => setTimeout(resolve, Math.random() * 3000 + 2000));

            } catch (error) {
                console.error(`Error on page ${page}:`, error.message);
            }
        }

        return allProperties;
    }

    async close() {
        if (this.browser) {
            await this.browser.close();
        }
    }
}

// Usage example
(async () => {
    const scraper = new ZillowScraper();

    try {
        const properties = await scraper.searchProperties('seattle-wa', 2);

        console.log(`\nTotal properties found: ${properties.length}`);
        properties.slice(0, 5).forEach((prop, index) => {
            console.log(`\n${index + 1}. ${prop.address}`);
            console.log(`   Price: ${prop.price}`);
            console.log(`   Specs: ${prop.beds} beds, ${prop.baths} baths, ${prop.sqft}`);
            console.log(`   Link: ${prop.link}`);
        });

    } catch (error) {
        console.error('Scraping failed:', error);
    } finally {
        await scraper.close();
    }
})();

Method 3: Using WebScraping.AI

For a more reliable solution, consider using a web scraping API:

import requests

def scrape_with_webscraping_ai(url, api_key):
    """Use WebScraping.AI to scrape Zillow with better success rates"""

    endpoint = f"https://api.webscraping.ai/html"
    params = {
        'api_key': api_key,
        'url': url,
        'js': 'true',  # Execute JavaScript
        'proxy': 'residential',  # Use residential proxies
        'device': 'desktop',
        'timeout': 15000
    }

    response = requests.get(endpoint, params=params)

    if response.status_code == 200:
        return response.text
    else:
        raise Exception(f"API request failed: {response.status_code}")

# Usage
# html_content = scrape_with_webscraping_ai(
#     'https://www.zillow.com/seattle-wa/',
#     'your_api_key_here'
# )

Best Practices and Tips

1. Avoid Detection

  • Rotate User Agents regularly
  • Use residential proxies for different IP addresses
  • Add random delays between requests (2-5 seconds)
  • Limit concurrent requests (max 1-2 simultaneously); the sketch after this list combines the first three points
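
A minimal sketch of those rotation-and-delay points, assuming a hand-maintained user-agent pool (the entries and the polite_get helper below are illustrative):

import random
import time
import requests

# Example user-agent pool -- swap in strings you keep up to date
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
]

def polite_get(session, url):
    """Rotate the User-Agent and pause 2-5 seconds before each request."""
    session.headers['User-Agent'] = random.choice(USER_AGENTS)
    time.sleep(random.uniform(2, 5))
    return session.get(url, timeout=10)

# usage: polite_get(requests.Session(), 'https://www.zillow.com/seattle-wa/')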

2. Handle Dynamic Content

  • Wait for JavaScript to load content
  • Use browser automation (Puppeteer/Selenium) for JS-heavy pages; a Selenium sketch follows this list
  • Monitor network requests to find API endpoints
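
As an illustration of waiting for JavaScript-rendered listings, here is a minimal Selenium sketch (pip install selenium); the data-test selector mirrors the earlier examples and may not match Zillow's current markup:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://www.zillow.com/seattle-wa/")

# Wait up to 10 seconds for the listing cards to be rendered client-side
cards = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, '[data-test="property-card"]'))
)
print(f"Rendered {len(cards)} cards")
driver.quit()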

3. Error Handling

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def create_session_with_retries():
    session = requests.Session()

    # Configure retry strategy
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
    )

    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    return session
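
Usage is a drop-in replacement for the plain Session from Method 1:

session = create_session_with_retries()
# 429 and 5xx responses are retried up to 3 times with exponential backoff
response = session.get("https://www.zillow.com/seattle-wa/", timeout=10)
print(response.status_code)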

4. Data Storage

import json
import csv
from datetime import datetime

def save_properties(properties, fmt='json'):
    """Save scraped records as JSON or CSV (fmt avoids shadowing the built-in format)."""
    if not properties:
        print("No properties to save")
        return

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    if fmt == 'json':
        filename = f"zillow_properties_{timestamp}.json"
        with open(filename, 'w') as f:
            json.dump(properties, f, indent=2)

    elif fmt == 'csv':
        filename = f"zillow_properties_{timestamp}.csv"
        with open(filename, 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=properties[0].keys())
            writer.writeheader()
            writer.writerows(properties)

    else:
        raise ValueError(f"Unsupported format: {fmt}")

    print(f"Saved {len(properties)} properties to {filename}")

Alternatives to Consider

  1. Zillow's Partner API - Official API for approved partners
  2. Real Estate APIs - RentSpree, Rentals.com, PadMapper
  3. MLS Data Providers - RETS feeds, Spark API
  4. Third-party Services - Scrapfly, Bright Data, WebScraping.AI
  5. Public Records - County assessor databases

Troubleshooting Common Issues

CAPTCHA Challenges

# Detect CAPTCHA presence
def has_captcha(soup):
    captcha_indicators = [
        'captcha', 'recaptcha', 'cf-challenge', 
        'anti-bot', 'verification'
    ]
    page_text = soup.get_text().lower()
    return any(indicator in page_text for indicator in captcha_indicators)
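
One way to wire this into the Method 1 loop (a sketch; fetch_or_bail is an illustrative helper, and the back-off policy is up to you):

from bs4 import BeautifulSoup

def fetch_or_bail(session, url):
    """Fetch a page and return its soup, or None if a CAPTCHA page came back."""
    response = session.get(url, timeout=10)
    soup = BeautifulSoup(response.content, 'html.parser')
    if has_captcha(soup):
        print(f"CAPTCHA detected at {url}; backing off")
        return None
    return soup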

Rate Limiting

import time
from random import uniform

def smart_delay(attempt):
    """Sleep with exponential backoff plus jitter: ~2s, ~4s, ~8s for attempts 1, 2, 3"""
    base_delay = 2 ** attempt
    time.sleep(base_delay * uniform(0.5, 1.5))

# usage: call smart_delay(attempt) before retrying a failed request

IP Blocking

  • Use rotating proxies
  • Implement IP rotation strategies
  • Consider residential proxy services
  • Monitor response codes (403, 429, 503), as in the rotation sketch below
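
A minimal rotation sketch; the proxy URLs are placeholders to replace with endpoints from a real proxy provider:

import itertools
import requests

# Placeholder proxy endpoints -- substitute credentials from your provider
PROXIES = itertools.cycle([
    'http://user:pass@proxy1.example.com:8000',
    'http://user:pass@proxy2.example.com:8000',
])

BLOCK_CODES = {403, 429, 503}

def get_with_rotation(url, attempts=3):
    """Retry through the proxy pool when a block-indicating status comes back."""
    for _ in range(attempts):
        proxy = next(PROXIES)
        response = requests.get(url, proxies={'http': proxy, 'https': proxy}, timeout=10)
        if response.status_code not in BLOCK_CODES:
            return response
    raise RuntimeError(f"Blocked on all {attempts} attempts for {url}")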

Conclusion

While scraping Zillow is technically possible, it comes with significant legal and technical challenges. The most sustainable approach is to:

  1. Use official APIs when available
  2. Respect robots.txt and ToS
  3. Implement proper rate limiting
  4. Consider professional scraping services
  5. Have legal counsel review your use case

Remember that Zillow actively fights against scraping, and your IP may be blocked permanently if detected.

Get Started Now

WebScraping.AI provides rotating proxies, Chromium rendering, and a built-in HTML parser for web scraping.