How do I implement custom duplicate filters in Scrapy?
Scrapy's built-in duplicate filter helps prevent processing the same request multiple times, but sometimes you need more sophisticated filtering logic. Custom duplicate filters allow you to define your own criteria for determining when requests are considered duplicates, giving you fine-grained control over your scraping workflow.
Understanding Scrapy's Default Duplicate Filter
By default, Scrapy uses the RFPDupeFilter (Request Fingerprint Duplicate Filter), which creates a fingerprint based on the request's URL, method, and body. However, this approach may not suit all use cases, especially when dealing with dynamic URLs or when you need to filter based on custom criteria.
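To see what the default filter treats as a duplicate, you can compute request fingerprints directly. A minimal sketch; note that request_fingerprint() is deprecated in recent Scrapy releases in favour of scrapy.utils.request.fingerprint(), so adjust the import to your version:

from scrapy import Request
from scrapy.utils.request import request_fingerprint  # deprecated in newer Scrapy

r1 = Request('http://example.com/page?id=1')
r2 = Request('http://example.com/page?id=2')

# The default fingerprint covers the URL (including the query string),
# the method and the body, so these two requests are NOT duplicates.
print(request_fingerprint(r1))
print(request_fingerprint(r2))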
Creating a Basic Custom Duplicate Filter
To implement a custom duplicate filter, create a class that inherits from scrapy.dupefilters.BaseDupeFilter and implement the required methods:
import hashlib

from scrapy.dupefilters import BaseDupeFilter


class CustomDupeFilter(BaseDupeFilter):
    def __init__(self, path=None, debug=False):
        self.fingerprints = set()
        self.debug = debug

    def request_seen(self, request):
        """Return True if the request is a duplicate, False otherwise."""
        fp = self.request_fingerprint(request)
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        return False

    def request_fingerprint(self, request):
        """Generate a fingerprint for the request."""
        # Custom logic: strip the query string so URLs that differ only
        # in their parameters are treated as duplicates.
        base_url = request.url.split('?')[0]
        return hashlib.sha1(base_url.encode()).hexdigest()

    def close(self, reason):
        """Called when the spider closes."""
        pass

    @classmethod
    def from_settings(cls, settings):
        """Create a filter instance from the project settings."""
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(debug=debug)
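Depending on your Scrapy version, the scheduler may look for a from_crawler() factory before from_settings() (newer releases deprecate from_settings() for dupefilters). A minimal sketch of such a factory, assuming you only need the settings; the crawler object also exposes stats and signals if your filter needs them:

    @classmethod
    def from_crawler(cls, crawler):
        # Newer Scrapy releases call from_crawler() when it exists;
        # crawler.settings gives access to the same project settings.
        return cls(debug=crawler.settings.getbool('DUPEFILTER_DEBUG'))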
Advanced Custom Duplicate Filter Examples
1. Content-Based Duplicate Filter
Sometimes you want to filter duplicates based on page content rather than URLs:
import hashlib

import scrapy
from scrapy.dupefilters import BaseDupeFilter


class ContentBasedDupeFilter(BaseDupeFilter):
    def __init__(self):
        self.content_hashes = set()

    def request_seen(self, request):
        # Requests without a content hash (e.g. start requests) are never filtered.
        content_hash = request.meta.get('content_hash')
        if content_hash is None:
            return False
        if content_hash in self.content_hashes:
            return True
        self.content_hashes.add(content_hash)
        return False

    @classmethod
    def from_settings(cls, settings):
        return cls()


# Usage in a spider
class MySpider(scrapy.Spider):
    name = 'content_spider'

    def parse(self, response):
        # Hash the main content of the page that yielded these links.
        content = response.css('div.main-content').get() or ''
        content_hash = hashlib.md5(content.encode()).hexdigest()

        for link in response.css('a::attr(href)').getall():
            yield scrapy.Request(
                url=response.urljoin(link),
                meta={'content_hash': content_hash},
                callback=self.parse_item,
            )
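The spider above hands each link a parse_item callback that is not shown; a minimal, hypothetical placeholder you could add to MySpider:

    def parse_item(self, response):
        # Hypothetical item extraction; adapt the selectors to your site.
        yield {
            'url': response.url,
            'title': response.css('title::text').get(),
        }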
2. Parameter-Ignoring Duplicate Filter
For APIs or dynamic sites where certain parameters should be ignored:
import hashlib
from urllib.parse import parse_qs, urlencode, urlparse

from scrapy.dupefilters import BaseDupeFilter


class ParameterIgnoringDupeFilter(BaseDupeFilter):
    def __init__(self, ignored_params=None):
        self.fingerprints = set()
        self.ignored_params = ignored_params or ['timestamp', 'session_id', '_']

    def request_seen(self, request):
        fp = self.request_fingerprint(request)
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        return False

    def request_fingerprint(self, request):
        parsed_url = urlparse(request.url)
        params = parse_qs(parsed_url.query)

        # Drop the parameters that should not affect deduplication.
        filtered_params = {
            k: v for k, v in params.items()
            if k not in self.ignored_params
        }

        # Rebuild the URL without the ignored parameters.
        clean_query = urlencode(filtered_params, doseq=True)
        clean_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
        if clean_query:
            clean_url += f"?{clean_query}"

        return hashlib.sha1(clean_url.encode()).hexdigest()

    @classmethod
    def from_settings(cls, settings):
        ignored_params = settings.getlist('DUPEFILTER_IGNORED_PARAMS')
        return cls(ignored_params=ignored_params)
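As a quick sanity check, two URLs that differ only in an ignored parameter should collapse to the same fingerprint. A small sketch using the class above (the URLs are made up):

from scrapy import Request

dupefilter = ParameterIgnoringDupeFilter(ignored_params=['timestamp'])
r1 = Request('http://example.com/api?page=2&timestamp=111')
r2 = Request('http://example.com/api?page=2&timestamp=222')

assert dupefilter.request_fingerprint(r1) == dupefilter.request_fingerprint(r2)
assert not dupefilter.request_seen(r1)  # first visit: crawled
assert dupefilter.request_seen(r2)      # same page, new timestamp: filtered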
3. Time-Based Duplicate Filter
For scenarios where you want to allow re-crawling after a certain time period:
import hashlib
import time

from scrapy.dupefilters import BaseDupeFilter


class TimeBasedDupeFilter(BaseDupeFilter):
    def __init__(self, expire_time=3600):  # 1 hour by default
        self.fingerprints = {}  # fingerprint -> timestamp of last crawl
        self.expire_time = expire_time

    def request_seen(self, request):
        fp = self.request_fingerprint(request)
        current_time = time.time()

        if fp in self.fingerprints:
            last_seen = self.fingerprints[fp]
            if current_time - last_seen < self.expire_time:
                return True

        self.fingerprints[fp] = current_time
        self._cleanup_expired(current_time)
        return False

    def _cleanup_expired(self, current_time):
        """Remove expired fingerprints to prevent unbounded memory growth."""
        expired_fps = [
            fp for fp, timestamp in self.fingerprints.items()
            if current_time - timestamp >= self.expire_time
        ]
        for fp in expired_fps:
            del self.fingerprints[fp]

    def request_fingerprint(self, request):
        return hashlib.sha1(request.url.encode()).hexdigest()

    @classmethod
    def from_settings(cls, settings):
        expire_time = settings.getint('DUPEFILTER_EXPIRE_TIME', 3600)
        return cls(expire_time=expire_time)
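The behaviour is easiest to see in isolation: a request is filtered inside the expiry window and allowed again once it has elapsed. A small sketch with a deliberately short window:

import time

from scrapy import Request

dupefilter = TimeBasedDupeFilter(expire_time=2)  # 2-second window for the demo
req = Request('http://example.com/feed')

assert not dupefilter.request_seen(req)  # first crawl: allowed
assert dupefilter.request_seen(req)      # inside the window: filtered
time.sleep(2)
assert not dupefilter.request_seen(req)  # window elapsed: allowed again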
Persistent Duplicate Filters
For long-running spiders or when you want to maintain duplicate filtering across spider runs:
import hashlib
import os
import pickle

from scrapy.dupefilters import BaseDupeFilter


class PersistentDupeFilter(BaseDupeFilter):
    def __init__(self, file_path='duplicates.pkl'):
        self.file_path = file_path
        self.fingerprints = self._load_fingerprints()

    def _load_fingerprints(self):
        """Load previously seen fingerprints from disk, if any."""
        if os.path.exists(self.file_path):
            try:
                with open(self.file_path, 'rb') as f:
                    return pickle.load(f)
            except (OSError, EOFError, pickle.UnpicklingError):
                pass
        return set()

    def _save_fingerprints(self):
        """Write the current fingerprint set to disk."""
        with open(self.file_path, 'wb') as f:
            pickle.dump(self.fingerprints, f)

    def request_seen(self, request):
        fp = self.request_fingerprint(request)
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        return False

    def request_fingerprint(self, request):
        return hashlib.sha1(request.url.encode()).hexdigest()

    def close(self, reason):
        """Persist the fingerprints when the spider closes."""
        self._save_fingerprints()

    @classmethod
    def from_settings(cls, settings):
        file_path = settings.get('DUPEFILTER_FILE_PATH', 'duplicates.pkl')
        return cls(file_path=file_path)
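To enable it, point DUPEFILTER_CLASS at the class and set the custom DUPEFILTER_FILE_PATH setting it reads (the module path myproject/filters.py is assumed). Note that Scrapy's built-in RFPDupeFilter already persists fingerprints to a requests.seen file when you set JOBDIR, which may be enough for simple cases:

# settings.py
DUPEFILTER_CLASS = 'myproject.filters.PersistentDupeFilter'
DUPEFILTER_FILE_PATH = 'duplicates.pkl'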
Configuring Custom Duplicate Filters
Method 1: Settings Configuration
Add your custom filter to your project's settings.py:
# settings.py
DUPEFILTER_CLASS = 'myproject.filters.CustomDupeFilter'
# Additional settings for custom filters
DUPEFILTER_DEBUG = True
DUPEFILTER_IGNORED_PARAMS = ['timestamp', 'session_id']
DUPEFILTER_EXPIRE_TIME = 7200 # 2 hours
Method 2: Spider-Level Configuration
Configure the filter directly in your spider:
import scrapy


class MySpider(scrapy.Spider):
    name = 'my_spider'

    custom_settings = {
        'DUPEFILTER_CLASS': 'myproject.filters.ContentBasedDupeFilter',
        'DUPEFILTER_DEBUG': True,
    }
Best Practices and Performance Considerations
Memory Management
For large-scale scraping, implement memory-efficient filters:
import hashlib
from collections import deque

from scrapy.dupefilters import BaseDupeFilter


class MemoryEfficientDupeFilter(BaseDupeFilter):
    def __init__(self, max_fingerprints=1000000):
        self.fingerprints = set()
        self.fingerprint_queue = deque()
        self.max_fingerprints = max_fingerprints

    def request_seen(self, request):
        fp = self.request_fingerprint(request)
        if fp in self.fingerprints:
            return True

        # Record the new fingerprint.
        self.fingerprints.add(fp)
        self.fingerprint_queue.append(fp)

        # Evict the oldest fingerprints once the limit is exceeded.
        while len(self.fingerprints) > self.max_fingerprints:
            old_fp = self.fingerprint_queue.popleft()
            self.fingerprints.discard(old_fp)

        return False

    def request_fingerprint(self, request):
        return hashlib.sha1(request.url.encode()).hexdigest()
Redis-Based Duplicate Filter
For distributed scraping across multiple machines:
import hashlib

import redis
from scrapy.dupefilters import BaseDupeFilter


class RedisDupeFilter(BaseDupeFilter):
    def __init__(self, server, key='scrapy:dupefilter'):
        self.server = server
        self.key = key

    def request_seen(self, request):
        fp = self.request_fingerprint(request)
        # SADD returns 0 if the member was already in the set.
        added = self.server.sadd(self.key, fp)
        return added == 0

    def request_fingerprint(self, request):
        return hashlib.sha1(request.url.encode()).hexdigest()

    @classmethod
    def from_settings(cls, settings):
        server = redis.Redis(
            host=settings.get('REDIS_HOST', 'localhost'),
            port=settings.getint('REDIS_PORT', 6379),
            db=settings.getint('REDIS_DB', 0),
        )
        key = settings.get('DUPEFILTER_KEY', 'scrapy:dupefilter')
        return cls(server, key)
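A possible configuration, assuming the filter lives in myproject/filters.py; the Redis host name below is just a placeholder. Because every spider instance pointing at the same key shares one fingerprint set, remember to delete the key when you want a full re-crawl:

# settings.py
DUPEFILTER_CLASS = 'myproject.filters.RedisDupeFilter'
REDIS_HOST = 'redis.internal.example'  # placeholder host
REDIS_PORT = 6379
REDIS_DB = 0
DUPEFILTER_KEY = 'scrapy:dupefilter'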
Testing Your Custom Duplicate Filter
Create unit tests to ensure your filter works correctly:
import unittest

from scrapy import Request

from myproject.filters import CustomDupeFilter


class TestCustomDupeFilter(unittest.TestCase):
    def setUp(self):
        self.filter = CustomDupeFilter()

    def test_duplicate_detection(self):
        req1 = Request('http://example.com/page?id=1')
        req2 = Request('http://example.com/page?id=1')

        # The first request should not be reported as a duplicate.
        self.assertFalse(self.filter.request_seen(req1))
        # An identical second request should be reported as a duplicate.
        self.assertTrue(self.filter.request_seen(req2))

    def test_different_requests(self):
        req1 = Request('http://example.com/page1')
        req2 = Request('http://example.com/page2')

        self.assertFalse(self.filter.request_seen(req1))
        self.assertFalse(self.filter.request_seen(req2))
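If the tests live in a standalone file, a standard entry point lets you run them directly with Python:

if __name__ == '__main__':
    unittest.main()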
Integration with Other Scrapy Components
Custom duplicate filters work seamlessly with other Scrapy features. In complex workflows that involve pagination or crawling across many pages, a custom filter ensures you don't process the same content multiple times while navigating between them.
Conclusion
Custom duplicate filters in Scrapy provide powerful ways to optimize your scraping efficiency by preventing unnecessary processing of duplicate content. Whether you need to filter based on content similarity, ignore certain URL parameters, or implement time-based expiration, custom filters give you the flexibility to handle complex deduplication scenarios.
Choose the right approach based on your specific requirements: use simple fingerprint-based filters for basic deduplication, content-based filters for similar pages, or persistent filters for long-running operations. Remember to consider memory usage and performance implications, especially when dealing with large-scale scraping projects.
By implementing custom duplicate filters effectively, you can significantly improve your spider's performance while ensuring data quality and reducing server load on target websites.