How do I scrape PDFs with Scrapy?

Scrapy excels at HTML content but doesn't directly process PDF files. However, you can use Scrapy to download PDFs and then extract data using specialized libraries. This guide covers the complete workflow with practical examples.

Method 1: Using Scrapy's Built-in File Pipeline

Step 1: Configure Scrapy Settings

# settings.py
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 300,
}

FILES_STORE = 'downloads'
FILES_URLS_FIELD = 'file_urls'
FILES_RESULT_FIELD = 'files'

# Optional: expiration and redirect handling
FILES_EXPIRES = 90  # Don't re-download files fetched within the last 90 days
MEDIA_ALLOW_REDIRECTS = True
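
With these settings, any item that includes a file_urls field is picked up by the pipeline, and the download results are written back into the files field. A sketch of what a scraped item can look like afterwards (values are placeholders; the exact keys per file entry depend on your Scrapy version):

# Illustrative item after FilesPipeline has run (placeholder values, not real output)
{
    'file_urls': ['https://example.com/docs/report.pdf'],
    'files': [
        {
            'url': 'https://example.com/docs/report.pdf',
            'path': 'full/<sha1-of-url>.pdf',     # stored relative to FILES_STORE
            'checksum': '<md5-of-file-contents>',
        }
    ],
}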

Step 2: Create Your Spider

# pdf_spider.py
import scrapy
import os
from urllib.parse import urljoin, urlparse

class PdfSpider(scrapy.Spider):
    name = 'pdf_spider'
    start_urls = ['https://example.com/documents']

    def parse(self, response):
        # Find all PDF links
        pdf_links = response.css('a[href$=".pdf"]::attr(href)').getall()

        for link in pdf_links:
            pdf_url = urljoin(response.url, link)
            yield {
                'file_urls': [pdf_url],
                'pdf_title': response.css('a[href="{}"]::text'.format(link)).get(),
                'source_page': response.url
            }

        # Follow pagination
        next_page = response.css('a.next::attr(href)').get()
        if next_page:
            yield response.follow(next_page, self.parse)
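
You can run this spider with the usual scrapy crawl pdf_spider command. If you prefer launching it from a plain Python script, here is a minimal sketch using CrawlerProcess; it assumes the spider above is importable as pdf_spider.PdfSpider and that you run it from the project directory so settings.py is picked up:

# run_pdf_spider.py -- minimal runner sketch (module path is an assumption)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from pdf_spider import PdfSpider

process = CrawlerProcess(get_project_settings())
process.crawl(PdfSpider)
process.start()  # blocks until the crawl finishes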

Method 2: Custom PDF Processing Pipeline

Create a custom pipeline to process PDFs immediately after download:

# pipelines.py
import logging
import os

import pymupdf  # PyMuPDF (imported as fitz in older releases)
from scrapy.pipelines.files import FilesPipeline
from scrapy.exceptions import DropItem

logger = logging.getLogger(__name__)

class PdfProcessingPipeline(FilesPipeline):

    def item_completed(self, results, item, info):
        # Process downloaded files
        file_paths = [x['path'] for ok, x in results if ok]

        if not file_paths:
            raise DropItem("No files downloaded")

        # Extract text from each PDF
        extracted_texts = []
        for file_path in file_paths:
            full_path = os.path.join(self.store.basedir, file_path)
            text = self.extract_pdf_text(full_path)
            extracted_texts.append(text)

        item['extracted_texts'] = extracted_texts
        item['file_paths'] = file_paths
        return item

    def extract_pdf_text(self, file_path):
        try:
            doc = pymupdf.open(file_path)
            text = ""
            for page in doc:
                text += page.get_text()
            doc.close()
            return text.strip()
        except Exception as e:
            logger.error(f"Error extracting text from {file_path}: {e}")
            return ""

Update your settings to use the custom pipeline:

# settings.py
ITEM_PIPELINES = {
    'myproject.pipelines.PdfProcessingPipeline': 300,
}
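
By default, FilesPipeline names each downloaded file after the SHA-1 hash of its URL. If you want more readable filenames, you can also override file_path() in the same pipeline class. A sketch, assuming Scrapy 2.4+ (where file_path() receives the item as a keyword argument):

# pipelines.py (optional) -- keep original filenames instead of SHA-1 hashes
import os
from urllib.parse import urlparse

from scrapy.pipelines.files import FilesPipeline

class PdfProcessingPipeline(FilesPipeline):
    # ... item_completed() and extract_pdf_text() unchanged from above ...

    def file_path(self, request, response=None, info=None, *, item=None):
        # Derive the stored name from the URL instead of the default hash
        filename = os.path.basename(urlparse(request.url).path) or 'document.pdf'
        return f'pdfs/{filename}'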

PDF Text Extraction Libraries Comparison

1. PyMuPDF (Recommended)

import pymupdf

def extract_with_pymupdf(file_path):
    doc = pymupdf.open(file_path)
    text = ""

    for page_num in range(doc.page_count):
        page = doc[page_num]
        text += f"Page {page_num + 1}:\n"
        text += page.get_text()
        text += "\n" + "="*50 + "\n"

    doc.close()
    return text

# Extract tables
def extract_tables_pymupdf(file_path):
    doc = pymupdf.open(file_path)
    tables = []

    for page in doc:
        page_tables = page.find_tables()  # returns a TableFinder
        for table in page_tables.tables:
            tables.append(table.extract())

    doc.close()
    return tables

2. pypdf (Successor to PyPDF2)

import pypdf

def extract_with_pypdf(file_path):
    with open(file_path, 'rb') as file:
        reader = pypdf.PdfReader(file)
        text = ""

        for page_num, page in enumerate(reader.pages):
            text += f"Page {page_num + 1}:\n"
            text += page.extract_text()
            text += "\n" + "="*50 + "\n"

        return text

# Extract metadata
def get_pdf_metadata(file_path):
    with open(file_path, 'rb') as file:
        reader = pypdf.PdfReader(file)
        metadata = reader.metadata or {}  # metadata can be None for PDFs without an info dictionary
        return {
            'title': metadata.get('/Title', ''),
            'author': metadata.get('/Author', ''),
            'pages': len(reader.pages)
        }

3. pdfplumber (For Complex Layouts)

import pdfplumber

def extract_with_pdfplumber(file_path):
    extracted_data = []

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            page_data = {
                'text': page.extract_text(),
                'tables': page.extract_tables()
            }
            extracted_data.append(page_data)

    return extracted_data
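
If the goal is tabular data, the nested lists returned by extract_tables() can be flattened straight into a CSV file with the standard library. A small sketch (the output path and flattening strategy are up to you):

import csv
import pdfplumber

def tables_to_csv(file_path, csv_path):
    # Write every table row found by pdfplumber into one CSV file;
    # cells come back as strings or None
    with pdfplumber.open(file_path) as pdf, open(csv_path, 'w', newline='') as out:
        writer = csv.writer(out)
        for page in pdf.pages:
            for table in page.extract_tables():
                for row in table:
                    writer.writerow(['' if cell is None else cell for cell in row])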

Complete Example: Advanced PDF Spider

# advanced_pdf_spider.py
import scrapy
import os
import pymupdf
from urllib.parse import urljoin
import hashlib

class AdvancedPdfSpider(scrapy.Spider):
    name = 'advanced_pdf'

    custom_settings = {
        'ITEM_PIPELINES': {
            'myproject.pipelines.PdfProcessingPipeline': 300,
        },
        'FILES_STORE': 'pdf_downloads',
        'DOWNLOAD_DELAY': 1,  # Be respectful
    }

    def start_requests(self):
        urls = [
            'https://example.com/reports',
            'https://example.com/documents',
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # Extract PDF links with context
        for link in response.css('a[href$=".pdf"]'):
            pdf_url = urljoin(response.url, link.attrib['href'])

            # Get surrounding context
            title = link.css('::text').get() or link.attrib.get('title', '')
            description = link.xpath('./following-sibling::p/text()').get()

            yield {
                'file_urls': [pdf_url],
                'title': title.strip(),
                'description': description,
                'source_url': response.url,
                'pdf_url': pdf_url,
            }

        # Follow links to more pages
        for next_page in response.css('a[href*="page"]::attr(href)').getall():
            yield response.follow(next_page, self.parse)

# Item definition
class PdfItem(scrapy.Item):
    file_urls = scrapy.Field()
    files = scrapy.Field()
    title = scrapy.Field()
    description = scrapy.Field()
    source_url = scrapy.Field()
    pdf_url = scrapy.Field()
    extracted_texts = scrapy.Field()  # populated by PdfProcessingPipeline
    file_paths = scrapy.Field()
    page_count = scrapy.Field()
    file_size = scrapy.Field()

Error Handling and Best Practices

1. Handle Corrupted PDFs

def safe_pdf_extraction(file_path):
    try:
        doc = pymupdf.open(file_path)
        if doc.is_encrypted:
            return "PDF is encrypted"

        text = ""
        for page in doc:
            try:
                text += page.get_text()
            except Exception as e:
                text += f"[Error on page: {e}]\n"

        doc.close()
        return text
    except Exception as e:
        return f"Failed to process PDF: {e}"

2. Memory Management for Large PDFs

def extract_large_pdf(file_path, max_pages=100):
    doc = pymupdf.open(file_path)
    text = ""

    # Process in chunks
    total_pages = min(doc.page_count, max_pages)

    for page_num in range(total_pages):
        page = doc[page_num]
        text += page.get_text()

        # Drop the reference so the page object can be garbage-collected
        page = None

    doc.close()
    return text
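
If even a capped page count is too much to hold in one string, a generator keeps only one page's text in memory at a time. A minimal sketch:

import pymupdf

def iter_pdf_pages(file_path):
    # Yield one page's text at a time instead of building one large string
    doc = pymupdf.open(file_path)
    try:
        for page in doc:
            yield page.get_text()
    finally:
        doc.close()

# Usage: iterate lazily over pages
# for page_text in iter_pdf_pages('downloads/full/report.pdf'):
#     ...process page_text...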

3. Response Validation and Retry Settings

class RobustPdfSpider(scrapy.Spider):
    name = 'robust_pdf'

    def parse_pdf_item(self, response):
        # Validate PDF content
        if len(response.body) < 1000:  # Too small to be a real PDF
            self.logger.warning(f"Suspicious PDF size: {len(response.body)} bytes")
            return

        if not response.body.startswith(b'%PDF'):
            self.logger.error(f"Invalid PDF format: {response.url}")
            return

        yield {
            'file_urls': [response.url],
            'file_size': len(response.body),
            'content_type': response.headers.get('Content-Type', b'').decode(),
        }
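
Transient download failures are best left to Scrapy's built-in RetryMiddleware rather than custom spider code. These settings control it (the values below are illustrative starting points, not requirements):

# settings.py
RETRY_ENABLED = True
RETRY_TIMES = 3                      # retries per request, in addition to the first attempt
RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]
DOWNLOAD_TIMEOUT = 60                # large PDFs can take a while to download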

Performance Tips

  1. Use concurrent downloads: Set CONCURRENT_REQUESTS appropriately
  2. Limit file sizes: Use DOWNLOAD_MAXSIZE and DOWNLOAD_WARNSIZE to drop or flag oversized PDFs (see the settings sketch after this list)
  3. Cache responses: Enable HTTPCACHE_ENABLED = True
  4. Respect robots.txt: Keep ROBOTSTXT_OBEY = True
  5. Monitor memory usage: Process PDFs in batches for large datasets
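
A settings.py sketch combining the tips above (the numbers are starting points to tune per site, not universal recommendations):

# settings.py
CONCURRENT_REQUESTS = 8
CONCURRENT_REQUESTS_PER_DOMAIN = 4
DOWNLOAD_MAXSIZE = 50 * 1024 * 1024   # drop responses larger than ~50 MB
DOWNLOAD_WARNSIZE = 10 * 1024 * 1024  # log a warning above ~10 MB
HTTPCACHE_ENABLED = True
ROBOTSTXT_OBEY = True
DOWNLOAD_DELAY = 1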

Installation Requirements

pip install scrapy pymupdf pypdf pdfplumber

The examples in this guide import pypdf, the maintained successor to PyPDF2, so the legacy PyPDF2 package is not needed; pymupdf and PyMuPDF refer to the same package on PyPI.

PDF scraping requires balancing extraction accuracy with performance. Choose the right library based on your PDF complexity and processing requirements.
