How do I use Scrapy for web testing?

While Scrapy is primarily designed for web scraping, it can be effectively used for basic web testing scenarios. This includes validating page accessibility, checking response times, testing element presence, and verifying link functionality. Here's a comprehensive guide on using Scrapy for web testing.

Quick Setup

Install Scrapy

pip install scrapy

Create a Testing Project

scrapy startproject web_testing
cd web_testing
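
Each of the test spiders below goes in the project's spiders/ directory. If you prefer, scrapy genspider can generate the skeleton for you (the spider name and domain here are just examples):

scrapy genspider page_load_test example.com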

Web Testing Scenarios with Scrapy

1. Basic Page Load Testing

Create a spider to test if pages load successfully and measure response times:

import scrapy
import time
from datetime import datetime

class PageLoadTestSpider(scrapy.Spider):
    name = 'page_load_test'

    def start_requests(self):
        test_urls = [
            'https://example.com',
            'https://example.com/about',
            'https://example.com/contact',
        ]

        for url in test_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                # handle_httpstatus_all lets non-200 responses reach parse()
                # instead of being filtered out by HttpErrorMiddleware
                meta={'start_time': time.time(), 'handle_httpstatus_all': True}
            )

    def parse(self, response):
        end_time = time.time()
        response_time = end_time - response.meta['start_time']

        # Test results
        test_results = {
            'url': response.url,
            'status_code': response.status,
            'response_time': f"{response_time:.2f}s",
            'content_length': len(response.body),
            'timestamp': datetime.now().isoformat()
        }

        # Log results
        if response.status == 200:
            self.logger.info(f"✅ PASS: {response.url} - {response_time:.2f}s")
        else:
            self.logger.error(f"❌ FAIL: {response.url} - Status: {response.status}")

        yield test_results
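
If you want slow pages flagged during the crawl rather than only inspected afterwards, an item pipeline can enforce a time budget. A minimal sketch, assuming the 3-second threshold and the pipeline/module names below (they are not part of the spider above):

# pipelines.py (sketch): warn about pages that exceed an assumed time budget
class ResponseTimeThresholdPipeline:
    MAX_SECONDS = 3.0  # assumed budget; tune for your site

    def process_item(self, item, spider):
        # Only page_load_test items carry a response_time field
        if 'response_time' not in item:
            return item
        seconds = float(item['response_time'].rstrip('s'))
        if seconds > self.MAX_SECONDS:
            spider.logger.warning(f"⚠️ Slow page: {item['url']} took {seconds:.2f}s")
        return item

Enable it by adding ITEM_PIPELINES = {'web_testing.pipelines.ResponseTimeThresholdPipeline': 300} to settings.py.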

2. Element Presence Testing

Test if specific elements exist on pages:

import scrapy

class ElementTestSpider(scrapy.Spider):
    name = 'element_test'

    # Define elements to test
    required_elements = {
        'title': 'title',
        'header': 'h1',
        'navigation': 'nav',
        'footer': 'footer',
        'contact_form': 'form#contact'
    }

    def start_requests(self):
        urls = ['https://example.com']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        test_results = {'url': response.url, 'tests': {}}

        for element_name, selector in self.required_elements.items():
            elements = response.css(selector)
            test_results['tests'][element_name] = {
                'selector': selector,
                'found': len(elements) > 0,
                'count': len(elements)
            }

            if elements:
                self.logger.info(f"✅ {element_name}: Found {len(elements)} element(s)")
            else:
                self.logger.error(f"❌ {element_name}: Element not found with selector '{selector}'")

        yield test_results
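
Beyond presence, you can also assert text content. A short sketch of extra checks that could be added at the end of parse() (the expected phrase is an assumption; use text your page must actually contain):

        # Additional content checks (sketch)
        title_text = response.css('title::text').get(default='').strip()
        if not title_text:
            self.logger.error(f"❌ <title> is empty on {response.url}")

        expected_phrase = 'Welcome'  # assumed phrase
        if expected_phrase not in response.text:
            self.logger.error(f"❌ Expected phrase '{expected_phrase}' missing on {response.url}")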

3. Link Testing Spider

Validate internal and external links:

import scrapy
from urllib.parse import urljoin, urlparse

class LinkTestSpider(scrapy.Spider):
    name = 'link_test'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.tested_links = set()
        self.broken_links = []
        self.working_links = []

    def start_requests(self):
        urls = ['https://example.com']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_page)

    def parse_page(self, response):
        # Extract all links
        links = response.css('a::attr(href)').getall()

        for link in links:
            # Convert relative URLs to absolute
            absolute_url = urljoin(response.url, link)

            # Only test http(s) links (skip mailto:, tel:, javascript:, etc.)
            if urlparse(absolute_url).scheme not in ('http', 'https'):
                continue

            # Skip if already tested
            if absolute_url in self.tested_links:
                continue

            self.tested_links.add(absolute_url)

            # Test the link
            yield scrapy.Request(
                url=absolute_url,
                callback=self.check_link,
                errback=self.link_error,
                # Let non-200 responses reach check_link instead of the errback
                meta={'referring_page': response.url, 'handle_httpstatus_all': True}
            )

    def check_link(self, response):
        result = {
            'url': response.url,
            'status': response.status,
            'referring_page': response.meta['referring_page']
        }

        if response.status == 200:
            self.working_links.append(response.url)
            self.logger.info(f"✅ Link OK: {response.url}")
        else:
            self.broken_links.append(result)
            self.logger.error(f"❌ Link broken: {response.url} (Status: {response.status})")

        yield result

    def link_error(self, failure):
        result = {
            'url': failure.request.url,
            'error': str(failure.value),
            'referring_page': failure.request.meta['referring_page']
        }

        self.broken_links.append(result)
        self.logger.error(f"❌ Link error: {failure.request.url} - {failure.value}")
        yield result
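
To get a single report at the end of the crawl, you can add a closed() method to the spider; Scrapy calls it once the crawl finishes:

    def closed(self, reason):
        # Called automatically when the spider finishes; summarize the results
        self.logger.info(
            f"Link check finished ({reason}): {len(self.tested_links)} links tested, "
            f"{len(self.broken_links)} broken"
        )
        for entry in self.broken_links:
            self.logger.error(f"Broken link: {entry}")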

4. Form Testing Spider

Test form presence and basic structure:

import scrapy

class FormTestSpider(scrapy.Spider):
    name = 'form_test'

    def start_requests(self):
        urls = ['https://example.com/contact']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        forms = response.css('form')

        for i, form in enumerate(forms):
            form_data = {
                'form_index': i,
                'action': form.css('::attr(action)').get(),
                'method': form.css('::attr(method)').get() or 'GET',
                'inputs': [],
                'has_submit': False
            }

            # Check form inputs
            inputs = form.css('input, textarea, select')
            for input_elem in inputs:
                input_type = input_elem.css('::attr(type)').get() or 'text'
                input_name = input_elem.css('::attr(name)').get()
                input_required = input_elem.css('::attr(required)').get() is not None

                form_data['inputs'].append({
                    'type': input_type,
                    'name': input_name,
                    'required': input_required
                })

                if input_type in ['submit', 'button']:
                    form_data['has_submit'] = True

            # Check for submit buttons
            submit_buttons = form.css('button[type="submit"], input[type="submit"]')
            if submit_buttons:
                form_data['has_submit'] = True

            # Log results
            self.logger.info(f"Form {i}: {len(form_data['inputs'])} inputs, Submit: {form_data['has_submit']}")

            yield form_data
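
If you also want to exercise submission, scrapy.FormRequest.from_response can post test data to a form. A sketch of two extra methods for the spider above (the field names are assumptions, and you should only submit test data to sites you control); call it from parse() with yield from self.submit_first_form(response):

    def submit_first_form(self, response):
        # Sketch: fill and submit the first form on the page with dummy data
        yield scrapy.FormRequest.from_response(
            response,
            formdata={'name': 'Test User', 'email': 'test@example.com'},  # assumed field names
            callback=self.after_submit,
        )

    def after_submit(self, response):
        self.logger.info(f"Form POST returned status {response.status}")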

Running Your Tests

Basic Execution

# Run individual spiders
scrapy crawl page_load_test
scrapy crawl element_test
scrapy crawl link_test
scrapy crawl form_test
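
To confirm the spiders are registered in the project before running them, scrapy list prints every available spider name:

scrapy list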

Save Results to Files

# Save results as JSON (-o appends on repeated runs; -O overwrites the file)
scrapy crawl page_load_test -O results.json

# Save as CSV
scrapy crawl element_test -O element_tests.csv

Custom Settings for Testing

Tune the project's settings.py for testing:

# Faster testing settings
CONCURRENT_REQUESTS = 32
DOWNLOAD_DELAY = 0.5
RANDOMIZE_DOWNLOAD_DELAY = True

# User agent for testing
USER_AGENT = 'Web Testing Bot (+http://www.yourdomain.com/bot)'

# Enable autothrottling
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1
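
Settings can also be overridden per run from the command line with -s, which is handy when a single test run needs different throttling:

scrapy crawl page_load_test -s DOWNLOAD_DELAY=0 -s CONCURRENT_REQUESTS=64 -O results.json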

Integration with Testing Frameworks

Using with pytest

import subprocess
import json

def test_page_loads():
    # Run the Scrapy spider as a subprocess (-O overwrites any previous output file)
    completed = subprocess.run([
        'scrapy', 'crawl', 'page_load_test',
        '-O', 'test_results.json'
    ], capture_output=True, text=True)
    assert completed.returncode == 0, completed.stderr

    # Load and verify results
    with open('test_results.json', 'r') as f:
        results = json.load(f)

    assert results, "Spider produced no results"
    for result in results:
        assert result['status_code'] == 200
        assert float(result['response_time'].rstrip('s')) < 5.0
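
If you prefer to avoid a subprocess, spiders can also run in-process with Scrapy's CrawlerProcess, collecting items through the item_scraped signal. A minimal sketch, assuming the spider's import path matches the project layout above (note that a CrawlerProcess can only be started once per Python process, so keep such tests in their own module):

from scrapy.crawler import CrawlerProcess
from scrapy import signals
from web_testing.spiders.page_load_test import PageLoadTestSpider  # assumed module path

def test_page_loads_in_process():
    collected = []

    def collect(item, response, spider):
        collected.append(dict(item))

    process = CrawlerProcess(settings={'LOG_LEVEL': 'ERROR'})
    crawler = process.create_crawler(PageLoadTestSpider)
    crawler.signals.connect(collect, signal=signals.item_scraped)
    process.crawl(crawler)
    process.start()  # blocks until the crawl finishes

    assert all(item['status_code'] == 200 for item in collected)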

Limitations and Alternatives

When to Use Scrapy for Testing

  • ✅ Basic page load verification
  • ✅ Content and element presence checks
  • ✅ Link validation
  • ✅ Large-scale site testing
  • ✅ Performance monitoring

When to Use Dedicated Testing Tools

  • Selenium: For JavaScript-heavy sites and complex interactions
  • Playwright: Modern browser automation with better performance
  • pytest/unittest: For comprehensive test suites
  • Locust: For load testing and performance testing

Example Integration

# Combine Scrapy with requests for hybrid testing
import scrapy
import requests
from urllib.parse import urljoin

class HybridTestSpider(scrapy.Spider):
    name = 'hybrid_test'
    start_urls = ['https://example.com']

    def parse(self, response):
        # Use Scrapy for page parsing
        api_endpoints = response.css('a[href*="/api/"]::attr(href)').getall()

        # Use requests for API testing (blocking calls; fine for a handful of endpoints)
        for endpoint in api_endpoints:
            api_response = requests.get(urljoin(response.url, endpoint))
            yield {
                'endpoint': endpoint,
                'status': api_response.status_code,
                'response_time': api_response.elapsed.total_seconds()
            }

Scrapy provides a solid foundation for web testing scenarios, especially when you need to test multiple pages systematically or validate content structure across a website.

Try WebScraping.AI for Your Web Scraping Needs

Looking for a powerful web scraping solution? WebScraping.AI provides an LLM-powered API that combines Chromium JavaScript rendering with rotating proxies for reliable data extraction.

Key Features:

  • AI-powered extraction: Ask questions about web pages or extract structured data fields
  • JavaScript rendering: Full Chromium browser support for dynamic content
  • Rotating proxies: Datacenter and residential proxies from multiple countries
  • Easy integration: Simple REST API with SDKs for Python, Ruby, PHP, and more
  • Reliable & scalable: Built for developers who need consistent results

Getting Started:

Get page content with AI analysis:

curl "https://api.webscraping.ai/ai/question?url=https://example.com&question=What is the main topic?&api_key=YOUR_API_KEY"

Extract structured data:

curl "https://api.webscraping.ai/ai/fields?url=https://example.com&fields[title]=Page title&fields[price]=Product price&api_key=YOUR_API_KEY"
