While Scrapy is primarily designed for web scraping, it can be effectively used for basic web testing scenarios. These include validating page accessibility, checking response times, testing element presence, and verifying link functionality. Here's a practical guide to using Scrapy for web testing.
Quick Setup
Install Scrapy
pip install scrapy
Create a Testing Project
scrapy startproject web_testing
cd web_testing
Web Testing Scenarios with Scrapy
1. Basic Page Load Testing
Create a spider to test if pages load successfully and measure response times:
import scrapy
import time
from datetime import datetime


class PageLoadTestSpider(scrapy.Spider):
    name = 'page_load_test'

    def start_requests(self):
        test_urls = [
            'https://example.com',
            'https://example.com/about',
            'https://example.com/contact',
        ]
        for url in test_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                meta={'start_time': time.time()}
            )

    def parse(self, response):
        end_time = time.time()
        response_time = end_time - response.meta['start_time']

        # Test results
        test_results = {
            'url': response.url,
            'status_code': response.status,
            'response_time': f"{response_time:.2f}s",
            'content_length': len(response.body),
            'timestamp': datetime.now().isoformat()
        }

        # Log results
        if response.status == 200:
            self.logger.info(f"✅ PASS: {response.url} - {response_time:.2f}s")
        else:
            self.logger.error(f"❌ FAIL: {response.url} - Status: {response.status}")

        yield test_results
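Note that the manual timer starts when the request is created, so the measured time can include time spent waiting in the scheduler queue. If you only care about the download itself, Scrapy already records it in response.meta['download_latency']. A minimal sketch of a variant spider using that built-in value (same example URLs assumed):

from datetime import datetime

import scrapy


class DownloadLatencySpider(scrapy.Spider):
    # Variant of the spider above relying on Scrapy's built-in download timing
    name = 'download_latency_test'
    start_urls = ['https://example.com']

    def parse(self, response):
        # download_latency is set by Scrapy's downloader for every response
        yield {
            'url': response.url,
            'status_code': response.status,
            'download_latency': f"{response.meta['download_latency']:.2f}s",
            'timestamp': datetime.now().isoformat(),
        }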
2. Element Presence Testing
Test if specific elements exist on pages:
import scrapy


class ElementTestSpider(scrapy.Spider):
    name = 'element_test'

    # Define elements to test
    required_elements = {
        'title': 'title',
        'header': 'h1',
        'navigation': 'nav',
        'footer': 'footer',
        'contact_form': 'form#contact'
    }

    def start_requests(self):
        urls = ['https://example.com']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        test_results = {'url': response.url, 'tests': {}}

        for element_name, selector in self.required_elements.items():
            elements = response.css(selector)
            test_results['tests'][element_name] = {
                'selector': selector,
                'found': len(elements) > 0,
                'count': len(elements)
            }

            if elements:
                self.logger.info(f"✅ {element_name}: Found {len(elements)} element(s)")
            else:
                self.logger.error(f"❌ {element_name}: Element not found with selector '{selector}'")

        yield test_results
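Presence checks extend naturally to content checks, for example verifying that the page title contains an expected keyword. A minimal sketch (the expected keyword is a placeholder you would adapt to your site):

import scrapy


class TitleContentTestSpider(scrapy.Spider):
    # Hypothetical extension: check element text, not just presence
    name = 'title_content_test'
    start_urls = ['https://example.com']
    expected_title_keyword = 'Example'  # placeholder keyword, adjust per site

    def parse(self, response):
        title_text = response.css('title::text').get() or ''
        passed = self.expected_title_keyword.lower() in title_text.lower()
        if passed:
            self.logger.info(f"✅ Title contains '{self.expected_title_keyword}': {title_text!r}")
        else:
            self.logger.error(f"❌ Title missing '{self.expected_title_keyword}': {title_text!r}")
        yield {'url': response.url, 'title': title_text, 'passed': passed}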
3. Link Testing Spider
Validate internal and external links:
import scrapy
from urllib.parse import urljoin, urlparse


class LinkTestSpider(scrapy.Spider):
    name = 'link_test'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.tested_links = set()
        self.broken_links = []
        self.working_links = []

    def start_requests(self):
        urls = ['https://example.com']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_page)

    def parse_page(self, response):
        # Extract all links
        links = response.css('a::attr(href)').getall()

        for link in links:
            # Convert relative URLs to absolute
            absolute_url = urljoin(response.url, link)

            # Skip non-HTTP(S) links such as mailto: or javascript:
            if urlparse(absolute_url).scheme not in ('http', 'https'):
                continue

            # Skip if already tested
            if absolute_url in self.tested_links:
                continue
            self.tested_links.add(absolute_url)

            # Test the link; handle_httpstatus_all lets non-200 responses
            # reach the callback instead of being dropped by HttpErrorMiddleware
            yield scrapy.Request(
                url=absolute_url,
                callback=self.check_link,
                errback=self.link_error,
                meta={
                    'referring_page': response.url,
                    'handle_httpstatus_all': True
                }
            )

    def check_link(self, response):
        result = {
            'url': response.url,
            'status': response.status,
            'referring_page': response.meta['referring_page']
        }

        if response.status == 200:
            self.working_links.append(response.url)
            self.logger.info(f"✅ Link OK: {response.url}")
        else:
            self.broken_links.append(result)
            self.logger.error(f"❌ Link broken: {response.url} (Status: {response.status})")

        yield result

    def link_error(self, failure):
        result = {
            'url': failure.request.url,
            'error': str(failure.value),
            'referring_page': failure.request.meta['referring_page']
        }
        self.broken_links.append(result)
        self.logger.error(f"❌ Link error: {failure.request.url} - {failure.value}")
        yield result
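The spider accumulates working_links and broken_links but never reports them. One way to surface a summary is Scrapy's closed hook, which runs when the crawl finishes; a minimal sketch of a method you could add to LinkTestSpider above:

    def closed(self, reason):
        # Called automatically by Scrapy when the crawl finishes
        self.logger.info(f"Link check finished ({reason}): "
                         f"{len(self.working_links)} working, "
                         f"{len(self.broken_links)} broken")
        for broken in self.broken_links:
            self.logger.warning(f"Broken: {broken}")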
4. Form Testing Spider
Test form presence and basic structure:
import scrapy


class FormTestSpider(scrapy.Spider):
    name = 'form_test'

    def start_requests(self):
        urls = ['https://example.com/contact']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        forms = response.css('form')

        for i, form in enumerate(forms):
            form_data = {
                'form_index': i,
                'action': form.css('::attr(action)').get(),
                'method': form.css('::attr(method)').get() or 'GET',
                'inputs': [],
                'has_submit': False
            }

            # Check form inputs
            inputs = form.css('input, textarea, select')
            for input_elem in inputs:
                input_type = input_elem.css('::attr(type)').get() or 'text'
                input_name = input_elem.css('::attr(name)').get()
                input_required = input_elem.css('::attr(required)').get() is not None

                form_data['inputs'].append({
                    'type': input_type,
                    'name': input_name,
                    'required': input_required
                })

                if input_type in ['submit', 'button']:
                    form_data['has_submit'] = True

            # Check for submit buttons
            submit_buttons = form.css('button[type="submit"], input[type="submit"]')
            if submit_buttons:
                form_data['has_submit'] = True

            # Log results
            self.logger.info(f"Form {i}: {len(form_data['inputs'])} inputs, Submit: {form_data['has_submit']}")
            yield form_data
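If you know which fields a form must contain, the structural report can be turned into a pass/fail check. A small sketch, assuming a contact form with hypothetical field names name, email, and message:

REQUIRED_FIELDS = {'name', 'email', 'message'}  # assumed field names, adjust per site


def check_required_fields(form_data, required=REQUIRED_FIELDS):
    # Compare the field names found in the form against the expected set
    found = {inp['name'] for inp in form_data['inputs'] if inp['name']}
    missing = required - found
    return {'missing_fields': sorted(missing), 'passed': not missing}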
Running Your Tests
Basic Execution
# Run individual spiders
scrapy crawl page_load_test
scrapy crawl element_test
scrapy crawl link_test
scrapy crawl form_test
Save Results to Files
# Save results as JSON
scrapy crawl page_load_test -o results.json

# Save as CSV
scrapy crawl element_test -o element_tests.csv

# Note: -o appends to an existing file; use -O (Scrapy 2.4+) to overwrite it
Custom Settings for Testing
Create a settings.py configuration for testing:
# Faster testing settings
CONCURRENT_REQUESTS = 32
DOWNLOAD_DELAY = 0.5
RANDOMIZE_DOWNLOAD_DELAY = True
# User agent for testing
USER_AGENT = 'Web Testing Bot (+http://www.yourdomain.com/bot)'
# Enable autothrottling
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1
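For the link-checking spider, two extra settings make the results more predictable; a minimal sketch (HTTPERROR_ALLOW_ALL passes non-200 responses to callbacks, and disabling retries keeps each link to a single check):

# Let non-200 responses reach callbacks instead of being filtered out
HTTPERROR_ALLOW_ALL = True

# One attempt per link keeps the broken-link report deterministic
RETRY_ENABLED = False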
Integration with Testing Frameworks
Using with pytest
import subprocess
import json


def test_page_loads():
    # Run the Scrapy spider; -O overwrites the output file on each run (Scrapy 2.4+)
    subprocess.run([
        'scrapy', 'crawl', 'page_load_test',
        '-O', 'test_results.json'
    ], capture_output=True, text=True, check=True)

    # Load and verify results
    with open('test_results.json', 'r') as f:
        results = json.load(f)

    for result in results:
        assert result['status_code'] == 200
        assert float(result['response_time'].replace('s', '')) < 5.0
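If you prefer to avoid the subprocess round-trip, spiders can also run in-process with CrawlerProcess. A minimal sketch, assuming the spider lives at the module path shown (note that Twisted's reactor cannot be restarted, so this pattern works once per test session):

import json

from scrapy.crawler import CrawlerProcess

from web_testing.spiders.page_load_test import PageLoadTestSpider  # assumed module path


def test_page_loads_in_process(tmp_path):
    output_file = tmp_path / 'test_results.json'

    # FEEDS configures the JSON export without shelling out to the CLI
    process = CrawlerProcess(settings={
        'FEEDS': {str(output_file): {'format': 'json'}},
    })
    process.crawl(PageLoadTestSpider)
    process.start()  # blocks until the crawl finishes

    results = json.loads(output_file.read_text())
    assert all(r['status_code'] == 200 for r in results)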
Limitations and Alternatives
When to Use Scrapy for Testing
- ✅ Basic page load verification
- ✅ Content and element presence checks
- ✅ Link validation
- ✅ Large-scale site testing
- ✅ Performance monitoring
When to Use Dedicated Testing Tools
- Selenium: For JavaScript-heavy sites and complex interactions
- Playwright: Modern browser automation with better performance
- pytest/unittest: For comprehensive test suites
- Locust: For load testing and performance testing
Example Integration
# Combine Scrapy with requests for hybrid testing
import scrapy
import requests
from urllib.parse import urljoin


class HybridTestSpider(scrapy.Spider):
    name = 'hybrid_test'
    start_urls = ['https://example.com']

    def parse(self, response):
        # Use Scrapy for page parsing
        api_endpoints = response.css('a[href*="/api/"]::attr(href)').getall()

        # Use requests for API testing (note: these calls are blocking)
        for endpoint in api_endpoints:
            api_response = requests.get(urljoin(response.url, endpoint))
            yield {
                'endpoint': endpoint,
                'status': api_response.status_code,
                'response_time': api_response.elapsed.total_seconds()
            }
Scrapy provides a solid foundation for web testing scenarios, especially when you need to test multiple pages systematically or validate content structure across a website.