Can I use Beautiful Soup to follow links and scrape multiple pages?

Yes, you can use Beautiful Soup combined with the requests library to follow links and scrape multiple pages. While Beautiful Soup handles HTML parsing, you'll need requests for making HTTP requests. Here's a comprehensive guide to multi-page scraping.

Installation

pip install requests beautifulsoup4

Basic Multi-Page Scraping

Simple Example: Following Pagination Links

import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin, urlparse

def scrape_multiple_pages():
    base_url = "https://example.com/articles"
    visited_urls = set()
    all_data = []

    def scrape_page(url):
        if url in visited_urls:
            return

        visited_urls.add(url)
        print(f"Scraping: {url}")

        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract data from current page
            articles = soup.find_all('article', class_='post')
            for article in articles:
                title_tag = article.find('h2')
                content_tag = article.find('div', class_='content')
                if title_tag and content_tag:  # skip articles missing the expected markup
                    all_data.append({
                        'title': title_tag.get_text(strip=True),
                        'content': content_tag.get_text(strip=True)
                    })

            # Find next page link
            next_link = soup.find('a', {'rel': 'next'})
            if next_link:
                next_url = urljoin(url, next_link['href'])
                time.sleep(1)  # Be respectful
                scrape_page(next_url)

        except requests.RequestException as e:
            print(f"Error scraping {url}: {e}")

    scrape_page(base_url)
    return all_data
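
The recursion above works for short pagination chains, but every page adds a stack frame, so a site with thousands of pages can hit Python's recursion limit. A minimal iterative sketch of the same pattern, reusing the imports from the snippet above and assuming the same article markup:

def scrape_paginated(start_url, max_pages=100):
    """Follow rel="next" pagination links with a loop instead of recursion."""
    all_data = []
    url = start_url
    pages = 0

    while url and pages < max_pages:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        for article in soup.find_all('article', class_='post'):
            title_tag = article.find('h2')
            if title_tag:
                all_data.append({'title': title_tag.get_text(strip=True)})

        next_link = soup.find('a', {'rel': 'next'})
        url = urljoin(url, next_link['href']) if next_link else None
        pages += 1
        time.sleep(1)  # Be respectful

    return all_data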

Advanced Multi-Page Scraper with Queue Management

import requests
from bs4 import BeautifulSoup
from collections import deque
from urllib.parse import urljoin, urlparse
import time
import logging

class MultiPageScraper:
    def __init__(self, base_url, max_pages=50, delay=1):
        self.base_url = base_url
        self.max_pages = max_pages
        self.delay = delay
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        self.visited_urls = set()
        self.url_queue = deque([base_url])
        self.scraped_data = []

    def is_valid_url(self, url):
        """Check if URL belongs to the same domain"""
        base_domain = urlparse(self.base_url).netloc
        url_domain = urlparse(url).netloc
        return base_domain == url_domain

    def extract_links(self, soup, current_url):
        """Extract all valid links from the page"""
        links = []
        for link in soup.find_all('a', href=True):
            full_url = urljoin(current_url, link['href'])
            if (self.is_valid_url(full_url) and 
                full_url not in self.visited_urls and
                full_url not in self.url_queue):
                links.append(full_url)
        return links

    def scrape_page(self, url):
        """Scrape a single page and extract data"""
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract page data (customize based on your needs)
            page_data = {
                'url': url,
                'title': soup.find('title').get_text(strip=True) if soup.find('title') else '',
                'headings': [h.get_text(strip=True) for h in soup.find_all(['h1', 'h2', 'h3'])],
                'paragraphs': [p.get_text(strip=True) for p in soup.find_all('p')[:5]]  # First 5 paragraphs
            }

            # Add new links to queue
            new_links = self.extract_links(soup, url)
            self.url_queue.extend(new_links)

            return page_data

        except requests.RequestException as e:
            logging.error(f"Error scraping {url}: {e}")
            return None

    def scrape_all(self):
        """Main scraping method"""
        pages_scraped = 0

        while self.url_queue and pages_scraped < self.max_pages:
            current_url = self.url_queue.popleft()

            if current_url in self.visited_urls:
                continue

            self.visited_urls.add(current_url)
            print(f"Scraping page {pages_scraped + 1}: {current_url}")

            page_data = self.scrape_page(current_url)
            if page_data:
                self.scraped_data.append(page_data)
                pages_scraped += 1

            time.sleep(self.delay)

        return self.scraped_data

# Usage
scraper = MultiPageScraper("https://example.com", max_pages=20)
data = scraper.scrape_all()
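
Since scrape_all() returns a plain list of dictionaries, persisting the results is straightforward; for example, writing them to a JSON file (the filename here is arbitrary):

import json

with open('scraped_data.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)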

Specific Use Cases

Following Category Links

def scrape_categories(base_url):
    response = requests.get(base_url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all category links
    category_links = soup.find_all('a', class_='category-link')

    all_products = []
    for link in category_links:
        category_url = urljoin(base_url, link['href'])
        products = scrape_product_pages(category_url)
        all_products.extend(products)
        time.sleep(1)

    return all_products

def scrape_product_pages(category_url):
    products = []
    page = 1

    while True:
        page_url = f"{category_url}?page={page}"
        response = requests.get(page_url)
        soup = BeautifulSoup(response.content, 'html.parser')

        product_links = soup.find_all('a', class_='product-link')
        if not product_links:
            break

        for product_link in product_links:
            product_url = urljoin(page_url, product_link['href'])
            product_data = scrape_single_product(product_url)
            if product_data:
                products.append(product_data)

        page += 1
        time.sleep(0.5)

    return products
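
Note that scrape_product_pages() calls scrape_single_product(), which isn't defined above. A minimal sketch, reusing the imports from the earlier snippets; the selectors are placeholders you will need to adapt to the site's actual product markup:

def scrape_single_product(product_url):
    try:
        response = requests.get(product_url, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    name_tag = soup.find('h1', class_='product-name')  # placeholder selector
    price_tag = soup.find('span', class_='price')      # placeholder selector

    return {
        'url': product_url,
        'name': name_tag.get_text(strip=True) if name_tag else '',
        'price': price_tag.get_text(strip=True) if price_tag else ''
    }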

Scraping with Session Management

def scrape_with_session():
    session = requests.Session()

    # Login if required
    login_data = {'username': 'your_username', 'password': 'your_password'}
    session.post('https://example.com/login', data=login_data)

    urls_to_scrape = [
        'https://example.com/protected-page-1',
        'https://example.com/protected-page-2',
    ]

    scraped_data = []
    for url in urls_to_scrape:
        response = session.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract data
        data = extract_page_data(soup)
        scraped_data.append(data)
        time.sleep(1)

    return scraped_data
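
The extract_page_data() helper is left to you; one possible sketch, mirroring the fields collected in the MultiPageScraper class above:

def extract_page_data(soup):
    title_tag = soup.find('title')
    return {
        'title': title_tag.get_text(strip=True) if title_tag else '',
        'headings': [h.get_text(strip=True) for h in soup.find_all(['h1', 'h2', 'h3'])],
        'paragraphs': [p.get_text(strip=True) for p in soup.find_all('p')]
    }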

Best Practices

1. Implement Proper Error Handling

def safe_scrape(url, max_retries=3):
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except requests.RequestException as e:
            print(f"Attempt {attempt + 1} failed for {url}: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff
    return None
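
Because safe_scrape() returns None once the retries are exhausted, check the result before parsing:

soup = safe_scrape('https://example.com/articles')
if soup is not None:
    titles = [h.get_text(strip=True) for h in soup.find_all('h2')]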

2. Respect Rate Limits

import random

def respectful_delay():
    # Random delay between 1-3 seconds
    time.sleep(random.uniform(1, 3))

3. Filter Links Intelligently

def should_follow_link(url, base_domain):
    parsed_url = urlparse(url)

    # Skip non-HTTP URLs
    if parsed_url.scheme not in ['http', 'https']:
        return False

    # Stay within the same domain
    if parsed_url.netloc != base_domain:
        return False

    # Skip common file types
    skip_extensions = ['.pdf', '.jpg', '.png', '.gif', '.zip', '.exe']
    if any(url.lower().endswith(ext) for ext in skip_extensions):
        return False

    return True
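
This filter slots directly into the link-collection step; for example, as a variant of the extract_links() method from the MultiPageScraper class above:

def extract_links(self, soup, current_url):
    base_domain = urlparse(self.base_url).netloc
    links = []
    for link in soup.find_all('a', href=True):
        full_url = urljoin(current_url, link['href'])
        if should_follow_link(full_url, base_domain) and full_url not in self.visited_urls:
            links.append(full_url)
    return links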

Common Pitfalls to Avoid

  1. Infinite Loops: Always track visited URLs
  2. Rate Limiting: Implement delays between requests
  3. Memory Issues: Limit the number of pages or use generators for large datasets (see the sketch after this list)
  4. Relative URLs: Always convert to absolute URLs using urljoin()
  5. JavaScript Content: Beautiful Soup won't see dynamically loaded content
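
For the memory point specifically, a generator lets you process each page as it is scraped instead of accumulating everything in one list. A rough sketch that reuses safe_scrape(), should_follow_link() and respectful_delay() from the best-practice snippets above, along with the deque and urllib imports shown earlier:

def iter_pages(start_url, max_pages=100):
    """Yield (url, soup) pairs one at a time instead of collecting them."""
    base_domain = urlparse(start_url).netloc
    queue = deque([start_url])
    visited = set()

    while queue and len(visited) < max_pages:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)

        soup = safe_scrape(url)
        if soup is None:
            continue
        yield url, soup

        for link in soup.find_all('a', href=True):
            full_url = urljoin(url, link['href'])
            if should_follow_link(full_url, base_domain) and full_url not in visited:
                queue.append(full_url)
        respectful_delay()

# Only the current page is held in memory
for url, soup in iter_pages("https://example.com"):
    print(url, soup.title.get_text(strip=True) if soup.title else '')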

Alternative for JavaScript-Heavy Sites

For sites with heavy JavaScript rendering, consider using Selenium:

from selenium import webdriver
from bs4 import BeautifulSoup
import time

def scrape_js_pages(urls):
    driver = webdriver.Chrome()
    scraped_data = []

    try:
        for url in urls:
            driver.get(url)
            time.sleep(2)  # Wait for JS to load

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            data = extract_data(soup)  # your own parsing function, e.g. extract_page_data() above
            scraped_data.append(data)
    finally:
        driver.quit()

    return scraped_data
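
A fixed time.sleep() is easy but fragile: too short on slow pages, wasted time on fast ones. If you prefer an explicit wait, Selenium's WebDriverWait blocks until a chosen element is present; a minimal sketch, where the CSS selector is whatever marks the content you need:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

def scrape_js_page(url, css_selector):
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # Block until the element appears, or raise after 10 seconds
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))
        )
        return BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        driver.quit()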

By combining Beautiful Soup with proper request management, error handling, and respectful scraping practices, you can effectively scrape data across multiple pages while maintaining good performance and reliability.
