Yes, you can use Beautiful Soup combined with the requests library to follow links and scrape multiple pages. Beautiful Soup handles the HTML parsing, while requests makes the HTTP requests. Here's a comprehensive guide to multi-page scraping.
Installation
pip install requests beautifulsoup4
Basic Multi-Page Scraping
Simple Example: Following Pagination Links
import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin, urlparse

def scrape_multiple_pages():
    base_url = "https://example.com/articles"
    visited_urls = set()
    all_data = []

    def scrape_page(url):
        if url in visited_urls:
            return
        visited_urls.add(url)
        print(f"Scraping: {url}")

        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract data from current page
            articles = soup.find_all('article', class_='post')
            for article in articles:
                title = article.find('h2').get_text(strip=True)
                content = article.find('div', class_='content').get_text(strip=True)
                all_data.append({'title': title, 'content': content})

            # Find next page link
            next_link = soup.find('a', {'rel': 'next'})
            if next_link:
                next_url = urljoin(url, next_link['href'])
                time.sleep(1)  # Be respectful
                scrape_page(next_url)
        except requests.RequestException as e:
            print(f"Error scraping {url}: {e}")

    scrape_page(base_url)
    return all_data
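Calling the function returns the collected list (example.com is, of course, a placeholder):

articles = scrape_multiple_pages()
print(f"Collected {len(articles)} articles")

One caveat with this recursive approach: very deep pagination chains can hit Python's default recursion limit (roughly 1000 frames), which is one reason the queue-based version below scales better for larger crawls.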
Advanced Multi-Page Scraper with Queue Management
import requests
from bs4 import BeautifulSoup
from collections import deque
from urllib.parse import urljoin, urlparse
import time
import logging

class MultiPageScraper:
    def __init__(self, base_url, max_pages=50, delay=1):
        self.base_url = base_url
        self.max_pages = max_pages
        self.delay = delay
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        self.visited_urls = set()
        self.url_queue = deque([base_url])
        self.scraped_data = []

    def is_valid_url(self, url):
        """Check if URL belongs to the same domain"""
        base_domain = urlparse(self.base_url).netloc
        url_domain = urlparse(url).netloc
        return base_domain == url_domain

    def extract_links(self, soup, current_url):
        """Extract all valid links from the page"""
        links = []
        for link in soup.find_all('a', href=True):
            full_url = urljoin(current_url, link['href'])
            if (self.is_valid_url(full_url) and
                    full_url not in self.visited_urls and
                    full_url not in self.url_queue):
                links.append(full_url)
        return links

    def scrape_page(self, url):
        """Scrape a single page and extract data"""
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract page data (customize based on your needs)
            page_data = {
                'url': url,
                'title': soup.find('title').get_text(strip=True) if soup.find('title') else '',
                'headings': [h.get_text(strip=True) for h in soup.find_all(['h1', 'h2', 'h3'])],
                'paragraphs': [p.get_text(strip=True) for p in soup.find_all('p')[:5]]  # First 5 paragraphs
            }

            # Add new links to queue
            new_links = self.extract_links(soup, url)
            self.url_queue.extend(new_links)

            return page_data
        except requests.RequestException as e:
            logging.error(f"Error scraping {url}: {e}")
            return None

    def scrape_all(self):
        """Main scraping method"""
        pages_scraped = 0
        while self.url_queue and pages_scraped < self.max_pages:
            current_url = self.url_queue.popleft()
            if current_url in self.visited_urls:
                continue
            self.visited_urls.add(current_url)
            print(f"Scraping page {pages_scraped + 1}: {current_url}")

            page_data = self.scrape_page(current_url)
            if page_data:
                self.scraped_data.append(page_data)
                pages_scraped += 1

            time.sleep(self.delay)
        return self.scraped_data

# Usage
scraper = MultiPageScraper("https://example.com", max_pages=20)
data = scraper.scrape_all()
Specific Use Cases
Following Category Links
def scrape_categories(base_url):
    response = requests.get(base_url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all category links
    category_links = soup.find_all('a', class_='category-link')
    all_products = []
    for link in category_links:
        category_url = urljoin(base_url, link['href'])
        products = scrape_product_pages(category_url)
        all_products.extend(products)
        time.sleep(1)
    return all_products

def scrape_product_pages(category_url):
    products = []
    page = 1
    while True:
        page_url = f"{category_url}?page={page}"
        response = requests.get(page_url)
        soup = BeautifulSoup(response.content, 'html.parser')

        product_links = soup.find_all('a', class_='product-link')
        if not product_links:
            break

        for product_link in product_links:
            product_url = urljoin(page_url, product_link['href'])
            # scrape_single_product() is your own function for parsing one product page
            product_data = scrape_single_product(product_url)
            if product_data:
                products.append(product_data)

        page += 1
        time.sleep(0.5)
    return products
Scraping with Session Management
def scrape_with_session():
    session = requests.Session()

    # Log in first if the site requires it
    login_data = {'username': 'your_username', 'password': 'your_password'}
    session.post('https://example.com/login', data=login_data)

    urls_to_scrape = [
        'https://example.com/protected-page-1',
        'https://example.com/protected-page-2',
    ]

    scraped_data = []
    for url in urls_to_scrape:
        response = session.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # extract_page_data() is your own parsing function
        data = extract_page_data(soup)
        scraped_data.append(data)
        time.sleep(1)
    return scraped_data
Best Practices
1. Implement Proper Error Handling
def safe_scrape(url, max_retries=3):
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except requests.RequestException as e:
            print(f"Attempt {attempt + 1} failed for {url}: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff
    return None
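A quick usage sketch: safe_scrape() returns None once all retries fail, so check before parsing (the URL is a placeholder):

soup = safe_scrape("https://example.com/articles")
if soup is not None:
    title = soup.find('title')
    print(title.get_text(strip=True) if title else 'No title')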
2. Respect Rate Limits
import random

def respectful_delay():
    # Random delay between 1-3 seconds
    time.sleep(random.uniform(1, 3))
3. Filter Links Intelligently
def should_follow_link(url, base_domain):
    parsed_url = urlparse(url)

    # Skip non-HTTP URLs
    if parsed_url.scheme not in ['http', 'https']:
        return False

    # Stay within the same domain
    if parsed_url.netloc != base_domain:
        return False

    # Skip common file types
    skip_extensions = ['.pdf', '.jpg', '.png', '.gif', '.zip', '.exe']
    if any(url.lower().endswith(ext) for ext in skip_extensions):
        return False

    return True
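To plug this filter into the MultiPageScraper above, one option (a sketch, not the only way) is to apply it inside extract_links before queueing new URLs:

    def extract_links(self, soup, current_url):
        """Variant of MultiPageScraper.extract_links that applies should_follow_link()."""
        base_domain = urlparse(self.base_url).netloc
        links = []
        for link in soup.find_all('a', href=True):
            full_url = urljoin(current_url, link['href'])
            if (should_follow_link(full_url, base_domain) and
                    full_url not in self.visited_urls and
                    full_url not in self.url_queue):
                links.append(full_url)
        return links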
Common Pitfalls to Avoid
- Infinite Loops: Always track visited URLs
- Rate Limiting: Implement delays between requests
- Memory Issues: Limit the number of pages, or use a generator so large datasets are processed one page at a time (see the sketch after this list)
- Relative URLs: Always convert relative links to absolute URLs with urljoin()
- JavaScript Content: Beautiful Soup won't see dynamically loaded content
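For the memory point above, here is a minimal generator-based sketch. It reuses the imports from the earlier examples; the start URL and the rel="next" pagination pattern are assumptions about the target site:

def iter_pages(start_url, max_pages=100, delay=1):
    """Yield (url, soup) one page at a time instead of accumulating a big list."""
    session = requests.Session()
    url = start_url
    for _ in range(max_pages):
        response = session.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        yield url, soup

        next_link = soup.find('a', {'rel': 'next'})
        if not next_link:
            break
        url = urljoin(url, next_link['href'])
        time.sleep(delay)

# Process each page as it arrives; nothing else is held in memory
for page_url, soup in iter_pages("https://example.com/articles", max_pages=10):
    print(page_url, len(soup.find_all('a')))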
Alternative for JavaScript-Heavy Sites
For sites with heavy JavaScript rendering, consider using Selenium:
from selenium import webdriver
from bs4 import BeautifulSoup
import time

def scrape_js_pages(urls):
    driver = webdriver.Chrome()
    scraped_data = []
    try:
        for url in urls:
            driver.get(url)
            time.sleep(2)  # Wait for JS to load
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            # extract_data() is your own parsing function
            data = extract_data(soup)
            scraped_data.append(data)
    finally:
        driver.quit()
    return scraped_data
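A fixed time.sleep(2) either wastes time or is too short; Selenium's explicit waits are usually more reliable. A sketch using WebDriverWait (the "article" CSS selector is a placeholder for whatever element signals that the page has rendered):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_rendered_html(driver, url, selector="article", timeout=10):
    """Load the page and wait until at least one matching element is present."""
    driver.get(url)
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, selector))
    )
    return driver.page_source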
By combining Beautiful Soup with proper request management, error handling, and respectful scraping practices, you can effectively scrape data across multiple pages while maintaining good performance and reliability.