Scrapy excels at HTML content but doesn't directly process PDF files. However, you can use Scrapy to download PDFs and then extract data using specialized libraries. This guide covers the complete workflow with practical examples.
Method 1: Using Scrapy's Built-in File Pipeline
Step 1: Configure Scrapy Settings
# settings.py
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 300,
}
FILES_STORE = 'downloads'
FILES_URLS_FIELD = 'file_urls'
FILES_RESULT_FIELD = 'files'
# Optional: expiration and redirect handling
FILES_EXPIRES = 90  # Skip re-downloading files fetched within the last 90 days
MEDIA_ALLOW_REDIRECTS = True
Step 2: Create Your Spider
# pdf_spider.py
import scrapy
from urllib.parse import urljoin

class PdfSpider(scrapy.Spider):
    name = 'pdf_spider'
    start_urls = ['https://example.com/documents']

    def parse(self, response):
        # Find all PDF links
        pdf_links = response.css('a[href$=".pdf"]::attr(href)').getall()
        for link in pdf_links:
            pdf_url = urljoin(response.url, link)
            yield {
                'file_urls': [pdf_url],
                'pdf_title': response.css('a[href="{}"]::text'.format(link)).get(),
                'source_page': response.url,
            }

        # Follow pagination
        next_page = response.css('a.next::attr(href)').get()
        if next_page:
            yield response.follow(next_page, self.parse)
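Run the spider with scrapy crawl pdf_spider -o pdfs.json (the output file is just an example). The FilesPipeline saves each PDF below FILES_STORE in a full/ subdirectory, named with the SHA-1 hash of the file URL, and records the local path, checksum, and original URL in the item's files field.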
Method 2: Custom PDF Processing Pipeline
Create a custom pipeline to process PDFs immediately after download:
# pipelines.py
import logging
import os

import pymupdf  # PyMuPDF (formerly imported as fitz)
from scrapy.exceptions import DropItem
from scrapy.pipelines.files import FilesPipeline

logger = logging.getLogger(__name__)

class PdfProcessingPipeline(FilesPipeline):
    def item_completed(self, results, item, info):
        # Collect the paths of successfully downloaded files
        file_paths = [x['path'] for ok, x in results if ok]
        if not file_paths:
            raise DropItem("No files downloaded")

        # Extract text from each PDF
        extracted_texts = []
        for file_path in file_paths:
            full_path = os.path.join(self.store.basedir, file_path)
            extracted_texts.append(self.extract_pdf_text(full_path))

        item['extracted_texts'] = extracted_texts
        item['file_paths'] = file_paths
        return item

    def extract_pdf_text(self, file_path):
        try:
            doc = pymupdf.open(file_path)
            text = ""
            for page in doc:
                text += page.get_text()
            doc.close()
            return text.strip()
        except Exception as e:
            logger.error(f"Error extracting text from {file_path}: {e}")
            return ""
Update your settings to use the custom pipeline:
# settings.py
ITEM_PIPELINES = {
    'myproject.pipelines.PdfProcessingPipeline': 300,
}
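By default, FilesPipeline stores downloads under full/ with the SHA-1 hash of the URL as the filename. If you prefer readable names, its file_path() hook can be overridden. A minimal sketch (NamedPdfPipeline is an illustrative name, and the keyword-only item argument requires a reasonably recent Scrapy version):

# pipelines.py (optional variant with readable filenames)
import os
from urllib.parse import urlparse

from scrapy.pipelines.files import FilesPipeline

class NamedPdfPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        # Store each PDF under pdfs/ using the last segment of its URL
        name = os.path.basename(urlparse(request.url).path) or 'document.pdf'
        return f'pdfs/{name}'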
PDF Text Extraction Libraries Comparison
1. PyMuPDF (Recommended)
import pymupdf

def extract_with_pymupdf(file_path):
    doc = pymupdf.open(file_path)
    text = ""
    for page_num in range(doc.page_count):
        page = doc[page_num]
        text += f"Page {page_num + 1}:\n"
        text += page.get_text()
        text += "\n" + "=" * 50 + "\n"
    doc.close()
    return text

# Extract tables
def extract_tables_pymupdf(file_path):
    doc = pymupdf.open(file_path)
    tables = []
    for page in doc:
        finder = page.find_tables()  # returns a TableFinder
        for table in finder.tables:
            tables.append(table.extract())
    doc.close()
    return tables
2. pypdf (the maintained successor to PyPDF2)
import pypdf

def extract_with_pypdf(file_path):
    with open(file_path, 'rb') as file:
        reader = pypdf.PdfReader(file)
        text = ""
        for page_num, page in enumerate(reader.pages):
            text += f"Page {page_num + 1}:\n"
            text += page.extract_text()
            text += "\n" + "=" * 50 + "\n"
    return text

# Extract metadata
def get_pdf_metadata(file_path):
    with open(file_path, 'rb') as file:
        reader = pypdf.PdfReader(file)
        metadata = reader.metadata or {}  # metadata can be None
        return {
            'title': metadata.get('/Title', ''),
            'author': metadata.get('/Author', ''),
            'pages': len(reader.pages),
        }
3. pdfplumber (For Complex Layouts)
import pdfplumber

def extract_with_pdfplumber(file_path):
    extracted_data = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            page_data = {
                'text': page.extract_text(),
                'tables': page.extract_tables(),
            }
            extracted_data.append(page_data)
    return extracted_data
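Because extract_tables() returns plain Python lists (one entry per table, each a list of rows whose cells are strings or None), the results can be written straight to CSV. A minimal sketch; the output path is illustrative:

import csv

import pdfplumber

def tables_to_csv(file_path, out_path='tables.csv'):
    # Dump every table found in the PDF into a single CSV file
    with pdfplumber.open(file_path) as pdf, open(out_path, 'w', newline='') as f:
        writer = csv.writer(f)
        for page in pdf.pages:
            for table in page.extract_tables():
                for row in table:
                    writer.writerow(['' if cell is None else cell for cell in row])
                writer.writerow([])  # blank row between tables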
Complete Example: Advanced PDF Spider
# advanced_pdf_spider.py
import scrapy
from urllib.parse import urljoin

class AdvancedPdfSpider(scrapy.Spider):
    name = 'advanced_pdf'
    custom_settings = {
        'ITEM_PIPELINES': {
            'myproject.pipelines.PdfProcessingPipeline': 300,
        },
        'FILES_STORE': 'pdf_downloads',
        'DOWNLOAD_DELAY': 1,  # Be respectful
    }

    def start_requests(self):
        urls = [
            'https://example.com/reports',
            'https://example.com/documents',
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # Extract PDF links with context
        for link in response.css('a[href$=".pdf"]'):
            pdf_url = urljoin(response.url, link.attrib['href'])
            # Get surrounding context
            title = link.css('::text').get() or link.attrib.get('title', '')
            description = link.xpath('./following-sibling::p/text()').get()
            yield {
                'file_urls': [pdf_url],
                'title': title.strip(),
                'description': description,
                'source_url': response.url,
                'pdf_url': pdf_url,
            }

        # Follow links to more pages
        for next_page in response.css('a[href*="page"]::attr(href)').getall():
            yield response.follow(next_page, self.parse)
# Item definition (field names match what PdfProcessingPipeline populates)
class PdfItem(scrapy.Item):
    file_urls = scrapy.Field()
    files = scrapy.Field()
    title = scrapy.Field()
    description = scrapy.Field()
    source_url = scrapy.Field()
    pdf_url = scrapy.Field()
    extracted_texts = scrapy.Field()
    file_paths = scrapy.Field()
    page_count = scrapy.Field()
    file_size = scrapy.Field()
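The page_count and file_size fields are not populated by anything shown so far; one option is to extend the Method 2 pipeline to fill them in. A minimal sketch, assuming it lives alongside PdfProcessingPipeline in pipelines.py (EnrichedPdfPipeline is an illustrative name):

import os

import pymupdf

class EnrichedPdfPipeline(PdfProcessingPipeline):
    def item_completed(self, results, item, info):
        # Let the base pipeline download files and extract text first
        item = super().item_completed(results, item, info)
        full_paths = [os.path.join(self.store.basedir, p) for p in item['file_paths']]
        # Total size and page count across all PDFs attached to this item
        item['file_size'] = sum(os.path.getsize(p) for p in full_paths)
        page_count = 0
        for p in full_paths:
            with pymupdf.open(p) as doc:
                page_count += doc.page_count
        item['page_count'] = page_count
        return item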
Error Handling and Best Practices
1. Handle Corrupted PDFs
def safe_pdf_extraction(file_path):
    try:
        # Context manager ensures the document is closed even on early return
        with pymupdf.open(file_path) as doc:
            if doc.is_encrypted:
                return "PDF is encrypted"
            text = ""
            for page in doc:
                try:
                    text += page.get_text()
                except Exception as e:
                    text += f"[Error on page: {e}]\n"
            return text
    except Exception as e:
        return f"Failed to process PDF: {e}"
2. Memory Management for Large PDFs
def extract_large_pdf(file_path, max_pages=100):
    doc = pymupdf.open(file_path)
    text = ""
    # Cap the number of pages processed
    total_pages = min(doc.page_count, max_pages)
    for page_num in range(total_pages):
        page = doc[page_num]
        text += page.get_text()
        # Drop the reference so the page can be garbage-collected
        page = None
    doc.close()
    return text
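If even a capped extraction is too heavy, a generator that yields one page of text at a time keeps memory flat regardless of document size. A minimal sketch; the file paths are illustrative:

import pymupdf

def iter_pdf_pages(file_path):
    # Yield text page by page so only one page is held in memory at a time
    with pymupdf.open(file_path) as doc:
        for page in doc:
            yield page.get_text()

# Example: stream page text to disk instead of building one large string
with open('extracted.txt', 'w', encoding='utf-8') as out:
    for page_text in iter_pdf_pages('pdf_downloads/full/report.pdf'):
        out.write(page_text)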
3. Spider with Retry Logic
class RobustPdfSpider(scrapy.Spider):
    name = 'robust_pdf'
    custom_settings = {
        # Retry transient failures a few extra times
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 3,
        'RETRY_HTTP_CODES': [429, 500, 502, 503, 504],
    }

    def parse_pdf_item(self, response):
        # Validate PDF content before handing the URL to the files pipeline
        if len(response.body) < 1000:  # Too small to be a real PDF
            self.logger.warning(f"Suspicious PDF size: {len(response.body)} bytes")
            return
        if not response.body.startswith(b'%PDF'):
            self.logger.error(f"Invalid PDF format: {response.url}")
            return
        yield {
            'file_urls': [response.url],
            'file_size': len(response.body),
            'content_type': response.headers.get('Content-Type', b'').decode(),
        }
Performance Tips
- Use concurrent downloads: tune CONCURRENT_REQUESTS (and DOWNLOAD_DELAY) for the target site; see the settings sketch after this list
- Filter file sizes: set DOWNLOAD_MAXSIZE to skip oversized responses, and FILES_EXPIRES to avoid re-downloading recently fetched files
- Cache responses: enable HTTPCACHE_ENABLED = True during development
- Respect robots.txt: keep ROBOTSTXT_OBEY = True
- Monitor memory usage: process PDFs in batches for large datasets
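A minimal settings.py sketch combining these tips; the values are illustrative starting points rather than universal recommendations:

# settings.py (illustrative values)
CONCURRENT_REQUESTS = 8
DOWNLOAD_DELAY = 0.5
DOWNLOAD_MAXSIZE = 50 * 1024 * 1024   # drop responses larger than ~50 MB
HTTPCACHE_ENABLED = True
ROBOTSTXT_OBEY = True
FILES_EXPIRES = 90                    # skip re-downloading files fetched in the last 90 days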
Installation Requirements
pip install scrapy pymupdf pypdf pdfplumber
Note that PyPDF2 is the deprecated predecessor of pypdf; new projects should install pypdf.
PDF scraping requires balancing extraction accuracy with performance. Choose the right library based on your PDF complexity and processing requirements.