When scraping Idealista, there's no universal frequency that guarantees you won't get blocked. The key is implementing responsible scraping practices that respect the website's resources and terms of service.
Legal and Ethical Prerequisites
Before scraping Idealista, always:
- Review Terms of Service: Check if Idealista's terms explicitly prohibit scraping
- Examine robots.txt: Visit https://www.idealista.com/robots.txt to see crawling restrictions (see the sketch after this list)
- Consider official APIs: Check if Idealista offers an official API for data access
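As a quick way to check the second point programmatically, Python's standard-library robotparser can read the robots.txt file and report whether a given path is allowed. This is a minimal sketch; the user-agent string and the path queried are illustrative placeholders, not values taken from Idealista's actual rules.

from urllib import robotparser

# Load Idealista's robots.txt and query it for an example path.
parser = robotparser.RobotFileParser()
parser.set_url("https://www.idealista.com/robots.txt")
parser.read()

# "MyResearchBot" and the URL below are placeholders for illustration only.
if parser.can_fetch("MyResearchBot", "https://www.idealista.com/en/"):
    print("Path appears allowed by robots.txt")
else:
    print("Path is disallowed by robots.txt - do not scrape it")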
Recommended Scraping Frequencies
Based on best practices for real estate websites:
Conservative Approach (Recommended)
- Request interval: 5-15 seconds between requests
- Daily limit: 500-1000 requests maximum
- Session length: 30-60 minutes with breaks
Moderate Approach
- Request interval: 2-5 seconds between requests
- Daily limit: 1000-2000 requests maximum
- Session length: 1-2 hours with breaks
Aggressive Approach (High Risk)
- Request interval: 1-2 seconds between requests
- Daily limit: 2000+ requests
- Risk: Very likely to trigger anti-bot measures
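These tiers are easier to apply consistently if you encode them as configuration rather than hard-coding delays throughout your scraper. The sketch below is one possible representation; the profile names and numbers simply mirror the tiers above, and the aggressive tier is deliberately omitted given the risk.

import random
import time

# Scraping profiles mirroring the tiers above (assumed values, adjust as needed).
PROFILES = {
    "conservative": {"min_delay": 5, "max_delay": 15, "daily_limit": 1000},
    "moderate":     {"min_delay": 2, "max_delay": 5,  "daily_limit": 2000},
}

def wait_between_requests(profile_name="conservative"):
    """Sleep for a random interval defined by the chosen profile."""
    profile = PROFILES[profile_name]
    time.sleep(random.uniform(profile["min_delay"], profile["max_delay"]))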
Anti-Blocking Strategies
1. Implement Rate Limiting
import time
import random

def rate_limited_request(session, url, min_delay=2, max_delay=8):
    """Make a request with a random delay to mimic human behavior"""
    delay = random.uniform(min_delay, max_delay)
    time.sleep(delay)
    return session.get(url)
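For context, one possible way to call this helper looks like the following; the search URL is a placeholder, not a real Idealista listing page.

import requests

session = requests.Session()
# Placeholder URL for illustration only
response = rate_limited_request(session, "https://www.idealista.com/en/", min_delay=3, max_delay=10)
print(response.status_code)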
2. Rotate User Agents
import requests
from itertools import cycle

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
]
user_agent_cycle = cycle(USER_AGENTS)

def get_headers():
    return {
        'User-Agent': next(user_agent_cycle),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    }
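The rotated headers are then passed on every request. A brief usage sketch, with a placeholder URL:

session = requests.Session()
# Each call to get_headers() advances the User-Agent cycle
response = session.get("https://www.idealista.com/en/", headers=get_headers(), timeout=10)
print(response.request.headers['User-Agent'])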
3. Handle Rate Limiting Responses
import time

def handle_response(response, session, url):
    """Handle different response codes appropriately"""
    if response.status_code == 200:
        return response
    elif response.status_code == 429:  # Too Many Requests
        retry_after = int(response.headers.get('Retry-After', 60))
        print(f"Rate limited. Waiting {retry_after} seconds...")
        time.sleep(retry_after)
        return session.get(url)
    elif response.status_code == 403:  # Forbidden
        print("Access forbidden. Consider using proxies or changing approach")
        return None
    elif response.status_code == 503:  # Service Unavailable
        print("Service unavailable. Waiting before retry...")
        time.sleep(30)
        return session.get(url)
    else:
        print(f"Unexpected status code: {response.status_code}")
        return response
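Combining the two helpers above, a single request cycle might look roughly like this; the URL is a placeholder.

import requests

session = requests.Session()
url = "https://www.idealista.com/en/"  # placeholder search URL
raw = rate_limited_request(session, url)
result = handle_response(raw, session, url)
if result is not None and result.status_code == 200:
    print(f"Fetched {len(result.text)} characters")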
Complete Example with Best Practices
import requests
import time
import random
import logging

class IdealistaScraper:
    def __init__(self, base_url="https://www.idealista.com"):
        self.base_url = base_url
        self.session = requests.Session()
        self.request_count = 0
        self.start_time = time.time()
        # Configure logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def get_headers(self):
        """Rotate through realistic headers"""
        headers = {
            'User-Agent': random.choice([
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            ]),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0'
        }
        return headers

    def make_request(self, url, max_retries=3):
        """Make a rate-limited request with retry logic"""
        for attempt in range(max_retries):
            try:
                # Implement rate limiting
                if self.request_count > 0:
                    delay = random.uniform(3, 8)  # 3-8 second delay
                    time.sleep(delay)
                response = self.session.get(url, headers=self.get_headers(), timeout=10)
                self.request_count += 1
                # Log request details
                self.logger.info(f"Request {self.request_count}: {response.status_code} - {url}")
                if response.status_code == 200:
                    return response
                elif response.status_code == 429:
                    retry_after = int(response.headers.get('Retry-After', 60))
                    self.logger.warning(f"Rate limited. Waiting {retry_after} seconds...")
                    time.sleep(retry_after)
                    continue
                elif response.status_code in [403, 503]:
                    self.logger.warning(f"Status {response.status_code}. Waiting before retry...")
                    time.sleep(30)
                    continue
                else:
                    self.logger.error(f"Unexpected status: {response.status_code}")
                    return response
            except requests.exceptions.RequestException as e:
                self.logger.error(f"Request failed: {e}")
                if attempt < max_retries - 1:
                    time.sleep(10)
                    continue
                else:
                    return None
        return None

    def scrape_listings(self, search_urls, max_requests_per_hour=100):
        """Scrape multiple listing pages with hourly limits"""
        results = []
        for url in search_urls:
            # Check hourly rate limit
            elapsed_time = time.time() - self.start_time
            if elapsed_time < 3600 and self.request_count >= max_requests_per_hour:
                wait_time = 3600 - elapsed_time
                self.logger.info(f"Hourly limit reached. Waiting {wait_time/60:.1f} minutes...")
                time.sleep(wait_time)
                self.start_time = time.time()
                self.request_count = 0
            response = self.make_request(url)
            if response and response.status_code == 200:
                # Process the response here
                results.append({
                    'url': url,
                    'content': response.text,
                    'timestamp': time.time()
                })
            else:
                self.logger.error(f"Failed to scrape: {url}")
        return results
# Usage example
if __name__ == "__main__":
    scraper = IdealistaScraper()
    # Example URLs (replace with actual search URLs)
    urls = [
        "https://www.idealista.com/en/areas/venta-viviendas/madrid/",
        "https://www.idealista.com/en/areas/venta-viviendas/barcelona/"
    ]
    results = scraper.scrape_listings(urls, max_requests_per_hour=50)
    print(f"Scraped {len(results)} pages successfully")
Additional Considerations
Session Management
- Use persistent sessions to maintain cookies
- Implement session rotation for long-running scrapers (see the sketch after this list)
- Clear sessions periodically to avoid detection
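One minimal way to rotate sessions is to rebuild the requests.Session object after a fixed number of requests, which drops accumulated cookies and resets the connection pool. This is a sketch under that assumption; the rotation threshold is arbitrary.

import requests

class SessionRotator:
    """Hand out a fresh requests.Session every `rotate_every` requests."""

    def __init__(self, rotate_every=50):
        self.rotate_every = rotate_every
        self.requests_made = 0
        self.session = requests.Session()

    def get_session(self):
        if self.requests_made >= self.rotate_every:
            self.session.close()
            self.session = requests.Session()  # fresh cookies, fresh connections
            self.requests_made = 0
        self.requests_made += 1
        return self.session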
Proxy Usage
import requests
from itertools import cycle

# Rotate through multiple proxies
PROXIES = [
    {'http': 'http://proxy1:port', 'https': 'https://proxy1:port'},
    {'http': 'http://proxy2:port', 'https': 'https://proxy2:port'},
]
proxy_cycle = cycle(PROXIES)

def get_proxy():
    return next(proxy_cycle)
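The rotated proxy dictionary is passed directly to requests on each call. A brief usage sketch, reusing the get_headers() helper from earlier; the URL and the proxy entries above are placeholders.

response = requests.get(
    "https://www.idealista.com/en/",   # placeholder URL
    proxies=get_proxy(),               # next proxy in the rotation
    headers=get_headers(),             # rotated headers from the earlier helper
    timeout=10
)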
Monitoring and Alerts
- Track success rates and response times (see the sketch after this list)
- Set up alerts for blocking incidents
- Monitor for CAPTCHA challenges
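A lightweight way to cover the first point is to keep running counters and log a warning when the success rate drops. This is a sketch; the alert threshold and the minimum sample size are assumed example values.

import logging

class ScrapeMonitor:
    """Track request outcomes and warn when the success rate drops."""

    def __init__(self, alert_threshold=0.8):
        self.alert_threshold = alert_threshold
        self.successes = 0
        self.failures = 0
        self.logger = logging.getLogger("scrape-monitor")

    def record(self, response):
        if response is not None and response.status_code == 200:
            self.successes += 1
        else:
            self.failures += 1
        total = self.successes + self.failures
        rate = self.successes / total
        if total >= 10 and rate < self.alert_threshold:
            self.logger.warning(f"Success rate dropped to {rate:.0%} - possible blocking")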
Warning Signs of Blocking
Watch for these indicators:
- 429 status codes: Rate limiting in effect
- 403 status codes: Access forbidden
- CAPTCHA challenges: Human verification required
- Unusual redirects: Potential bot detection
- Empty responses: Content blocking mechanisms
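Several of these signals can be checked programmatically after each response. The sketch below is a heuristic only; the CAPTCHA marker strings are assumptions, not text confirmed to appear in Idealista's blocked pages.

def looks_blocked(response):
    """Heuristic check for the blocking indicators listed above."""
    if response is None or response.status_code in (403, 429):
        return True
    if not response.text.strip():  # empty body
        return True
    lowered = response.text.lower()
    # Assumed marker strings; adjust after inspecting real blocked pages
    if "captcha" in lowered or "are you a robot" in lowered:
        return True
    return False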
Alternative Approaches
If scraping becomes problematic:
1. Official APIs: Check for Idealista's developer APIs
2. Third-party services: Consider real estate data providers
3. Manual collection: For small datasets
4. Web scraping services: Professional scraping providers
Remember that Idealista continuously updates its anti-scraping measures. Stay informed about changes and adapt your scraping strategy accordingly. Always prioritize ethical scraping practices and consider the legal implications in your jurisdiction.