How do I use Scrapy with Selenium WebDriver?
Integrating Scrapy with Selenium WebDriver is a powerful approach for scraping JavaScript-heavy websites that require browser automation. This combination leverages Scrapy's robust framework for data extraction and pipeline management while using Selenium's capabilities to handle dynamic content, AJAX requests, and complex user interactions.
Why Combine Scrapy with Selenium?
Scrapy excels at fast, efficient web scraping but struggles with JavaScript-rendered content. Selenium WebDriver can execute JavaScript and interact with pages like a real browser, making it perfect for:
- Single Page Applications (SPAs)
- Sites with infinite scroll
- Content loaded via AJAX
- Pages requiring user interaction (clicks, form submissions)
- Sites with complex authentication flows
Installation and Setup
First, install the required packages:
pip install scrapy selenium scrapy-selenium
Install a WebDriver (Chrome example):
# On macOS with Homebrew
brew install chromedriver
# Or download manually from https://chromedriver.chromium.org/
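Before wiring the two together, it helps to confirm that Chrome and the driver work on their own. A minimal standalone sanity check, assuming Selenium 4+ (whose Selenium Manager can locate a matching driver automatically; otherwise point it at the chromedriver installed above):
# check_driver.py -- quick standalone check, not part of the Scrapy project
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)  # Selenium Manager resolves the driver on Selenium 4.6+
driver.get('https://example.com')
print(driver.title)  # prints the rendered page title if everything is set up correctly
driver.quit()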
Method 1: Using scrapy-selenium Middleware
The scrapy-selenium package provides seamless integration through middleware.
Configuration
Add the following to your settings.py:
# settings.py
DOWNLOADER_MIDDLEWARES = {
'scrapy_selenium.SeleniumMiddleware': 800
}
# Selenium WebDriver configuration
SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_EXECUTABLE_PATH = '/usr/local/bin/chromedriver'
SELENIUM_DRIVER_ARGUMENTS = [
'--headless',
'--no-sandbox',
'--disable-dev-shm-usage',
'--disable-gpu',
'--window-size=1920,1080'
]
Spider Implementation
# spiders/selenium_spider.py
import scrapy
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
class SeleniumSpider(scrapy.Spider):
name = 'selenium_spider'
def start_requests(self):
urls = ['https://example.com/dynamic-content']
for url in urls:
yield SeleniumRequest(
url=url,
callback=self.parse,
                wait_time=3,  # timeout (seconds) for the wait_until condition below
wait_until=EC.presence_of_element_located((By.CLASS_NAME, 'content'))
)
def parse(self, response):
# Extract data using Scrapy selectors
titles = response.css('.title::text').getall()
descriptions = response.css('.description::text').getall()
for title, desc in zip(titles, descriptions):
yield {
'title': title.strip(),
'description': desc.strip()
}
# Handle pagination with Selenium
next_page = response.css('.next-page::attr(href)').get()
if next_page:
yield SeleniumRequest(
url=response.urljoin(next_page),
callback=self.parse,
wait_time=3
)
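scrapy-selenium also exposes the live WebDriver to your callback via response.request.meta['driver'] (per the package's README), so you can keep interacting with the rendered page before extracting data. A brief sketch; because concurrent requests share the driver, keep concurrency low if you rely on this:
# A parse() variant that keeps using the driver (sketch; assumes the spider above)
from scrapy.http import HtmlResponse

def parse(self, response):
    driver = response.request.meta['driver']  # the browser that rendered this response
    # Example interaction: scroll once more, then re-wrap the updated DOM for Scrapy selectors
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    updated = HtmlResponse(url=driver.current_url, body=driver.page_source, encoding='utf-8')
    for title in updated.css('.title::text').getall():
        yield {'title': title.strip()}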
Method 2: Custom Selenium Integration
For more control, you can create a custom middleware or integrate Selenium directly in your spider.
Custom Middleware
# middlewares.py
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from scrapy import signals
from scrapy.http import HtmlResponse
import time
class SeleniumMiddleware:
def __init__(self):
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-gpu')
self.driver = webdriver.Chrome(options=chrome_options)
self.driver.set_window_size(1920, 1080)
    def process_request(self, request, spider):
        # Only process requests explicitly marked for Selenium
        if not request.meta.get('selenium'):
            return None
self.driver.get(request.url)
# Wait for specific elements if specified
wait_for = request.meta.get('wait_for')
if wait_for:
wait = WebDriverWait(self.driver, 10)
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, wait_for)))
# Execute custom JavaScript if provided
js_script = request.meta.get('js_script')
if js_script:
self.driver.execute_script(js_script)
# Wait additional time if specified
wait_time = request.meta.get('wait_time', 0)
if wait_time:
time.sleep(wait_time)
# Return Scrapy response
body = self.driver.page_source.encode('utf-8')
return HtmlResponse(
url=self.driver.current_url,
body=body,
encoding='utf-8',
request=request
)
    @classmethod
    def from_crawler(cls, crawler):
        # Connect spider_closed so the browser is shut down when the crawl ends
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        self.driver.quit()
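To activate this custom middleware, register it in settings.py. The module path below assumes the default Scrapy project layout with the file at yourproject/middlewares.py; adjust it to your project name:
# settings.py
DOWNLOADER_MIDDLEWARES = {
    'yourproject.middlewares.SeleniumMiddleware': 800,
}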
Spider Using Custom Middleware
# spiders/custom_selenium_spider.py
import scrapy
class CustomSeleniumSpider(scrapy.Spider):
name = 'custom_selenium_spider'
def start_requests(self):
urls = ['https://example.com/spa']
for url in urls:
yield scrapy.Request(
url=url,
callback=self.parse,
meta={
'selenium': True,
'wait_for': '.dynamic-content',
'wait_time': 2,
'js_script': 'window.scrollTo(0, document.body.scrollHeight);'
}
)
def parse(self, response):
# Handle infinite scroll
products = response.css('.product-item')
for product in products:
yield {
'name': product.css('.name::text').get(),
'price': product.css('.price::text').get(),
'url': response.urljoin(product.css('a::attr(href)').get())
}
# Load more content by scrolling
has_more = response.css('.load-more').get()
if has_more:
yield scrapy.Request(
url=response.url,
callback=self.parse_more,
meta={
'selenium': True,
'js_script': '''
window.scrollTo(0, document.body.scrollHeight);
document.querySelector('.load-more').click();
''',
'wait_time': 3
}
)
def parse_more(self, response):
# Process additional loaded content
new_products = response.css('.product-item:not(.processed)')
for product in new_products:
yield {
'name': product.css('.name::text').get(),
'price': product.css('.price::text').get(),
'url': response.urljoin(product.css('a::attr(href)').get())
}
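For pages with true infinite scroll, a single scrollTo call is rarely enough; the page has to be scrolled repeatedly until its height stops growing. A sketch of such a helper (the name and defaults are illustrative), meant to run inside the custom middleware where the driver is available:
import time

def scroll_to_bottom(driver, pause=1.0, max_rounds=20):
    """Scroll until the page height stops increasing or max_rounds is reached."""
    last_height = driver.execute_script('return document.body.scrollHeight')
    for _ in range(max_rounds):
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        time.sleep(pause)  # give the page time to append new items
        new_height = driver.execute_script('return document.body.scrollHeight')
        if new_height == last_height:
            break  # no new content appeared
        last_height = new_height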
Handling Complex Scenarios
Login and Authentication
def handle_login(self, response):
"""Handle login process with Selenium"""
return scrapy.Request(
url='https://example.com/login',
callback=self.after_login,
meta={
'selenium': True,
'js_script': '''
document.querySelector('#username').value = 'your_username';
document.querySelector('#password').value = 'your_password';
document.querySelector('#login-form').submit();
''',
'wait_for': '.dashboard',
'wait_time': 3
}
)
def after_login(self, response):
"""Process pages after successful login"""
# Continue scraping authenticated content
pass
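Note that setting input values through JavaScript can bypass the change events some frontend frameworks listen for. Typing through the WebDriver itself is often more reliable; a sketch (the locators are hypothetical, mirroring the IDs above), to be run wherever the driver is available:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def selenium_login(driver, username, password):
    driver.get('https://example.com/login')
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'username'))).send_keys(username)
    driver.find_element(By.ID, 'password').send_keys(password)
    driver.find_element(By.CSS_SELECTOR, '#login-form [type="submit"]').click()
    # Wait until the post-login page has rendered before handing control back
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'dashboard')))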
Handling AJAX Pagination
def parse_ajax_pagination(self, response):
"""Handle AJAX-based pagination"""
items = response.css('.item')
for item in items:
yield self.extract_item_data(item)
# Check if more pages exist
page_num = response.meta.get('page_num', 1)
if len(items) >= 20: # Assuming 20 items per page
yield scrapy.Request(
url=response.url,
callback=self.parse_ajax_pagination,
meta={
'selenium': True,
'page_num': page_num + 1,
'js_script': f'''
// Simulate AJAX pagination
fetch('/api/items?page={page_num + 1}')
.then(response => response.json())
.then(data => {{
// Update DOM with new items
document.querySelector('#items-container').innerHTML = data.html;
}});
''',
'wait_time': 2
}
)
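Before scripting fetch() through the browser, check whether the underlying API can be called directly; that takes Selenium out of the loop entirely. A sketch of a spider callback under that assumption (the endpoint and field names are hypothetical; response.json() requires Scrapy 2.2+):
def parse_api(self, response):
    """Paginate a JSON API with plain Scrapy requests instead of a browser."""
    data = response.json()
    for item in data['items']:
        yield {'name': item['name'], 'price': item['price']}
    if data.get('has_next'):
        yield scrapy.Request(
            url=f"https://example.com/api/items?page={data['page'] + 1}",
            callback=self.parse_api
        )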
Performance Optimization
Browser Pool Management
# middlewares.py
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import queue
import threading
class BrowserPoolMiddleware:
def __init__(self, pool_size=3):
self.pool_size = pool_size
self.browser_pool = queue.Queue()
self.lock = threading.Lock()
# Initialize browser pool
for _ in range(pool_size):
browser = self.create_browser()
self.browser_pool.put(browser)
def create_browser(self):
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
return webdriver.Chrome(options=chrome_options)
def get_browser(self):
return self.browser_pool.get()
def return_browser(self, browser):
self.browser_pool.put(browser)
def process_request(self, request, spider):
if not request.meta.get('selenium'):
return None
browser = self.get_browser()
try:
browser.get(request.url)
# Process request...
body = browser.page_source.encode('utf-8')
return HtmlResponse(url=browser.current_url, body=body, encoding='utf-8', request=request)
finally:
self.return_browser(browser)
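One gap in the sketch above: the pooled browsers are never quit. A small addition to the class, connected to the spider_closed signal via from_crawler just like the single-driver middleware earlier, releases them when the crawl ends:
    def spider_closed(self, spider):
        # Drain the pool and quit every browser so no Chrome processes linger
        while not self.browser_pool.empty():
            self.browser_pool.get_nowait().quit()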
Selective Selenium Usage
class SmartSeleniumSpider(scrapy.Spider):
def needs_selenium(self, url):
"""Determine if URL needs Selenium based on patterns"""
selenium_patterns = [
'/spa/',
'/dynamic/',
'javascript-heavy'
]
return any(pattern in url for pattern in selenium_patterns)
def start_requests(self):
for url in self.start_urls:
meta = {}
if self.needs_selenium(url):
meta['selenium'] = True
meta['wait_time'] = 2
yield scrapy.Request(url=url, callback=self.parse, meta=meta)
Best Practices
- Use Selenium selectively: Only use Selenium for pages that actually need it to maintain performance
- Implement proper waits: Use explicit waits instead of time.sleep() when possible (see the sketch after this list)
- Handle browser lifecycle: Properly close browsers to prevent memory leaks
- Configure timeouts: Set reasonable timeouts for page loads and element waits
- Use headless mode: Run browsers in headless mode for better performance in production
- Implement error handling: Handle Selenium exceptions gracefully
- Pool browsers: Use browser pools for concurrent processing
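To illustrate the points about explicit waits and error handling, a small helper sketch (the function name is illustrative):
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def wait_for_selector(driver, css_selector, timeout=10):
    """Return the element once it is present, or None if the timeout expires."""
    try:
        return WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))
        )
    except TimeoutException:
        return None  # let the caller decide whether to retry or skip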
Troubleshooting Common Issues
ChromeDriver Issues
# Update ChromeDriver
brew upgrade chromedriver
# Check Chrome version compatibility
google-chrome --version
chromedriver --version
Memory Management
# settings.py
# Limit concurrent requests when using Selenium
CONCURRENT_REQUESTS = 4
CONCURRENT_REQUESTS_PER_DOMAIN = 2
# Set download delay to reduce load
DOWNLOAD_DELAY = 1
RANDOMIZE_DOWNLOAD_DELAY = 0.5
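A related option is Scrapy's built-in AutoThrottle extension, which adapts the delay to observed response times instead of relying on a fixed value:
# settings.py (optional)
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0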
Conclusion
Combining Scrapy with Selenium WebDriver provides a powerful solution for scraping modern web applications. While this approach requires more resources than pure Scrapy, it enables you to handle JavaScript-heavy sites that would otherwise be impossible to scrape effectively.
For simpler JavaScript requirements, consider lighter-weight options first, such as replicating the site's AJAX requests directly with plain Scrapy requests, or a standalone headless-browser tool like Puppeteer, before implementing the full Scrapy-Selenium solution.
The key is to use Selenium judiciously—only when necessary—and implement proper performance optimizations to maintain reasonable scraping speeds while handling complex web applications.