How do you integrate Mechanize with databases to store scraped data?
Integrating Mechanize with a database lets a web scraping application persist and manage large volumes of scraped data reliably. This guide covers several integration approaches, from a simple SQLite setup for small projects and prototypes to PostgreSQL and MySQL implementations suited to production workloads.
Database Options for Mechanize Integration
SQLite Integration
SQLite is perfect for small to medium-scale scraping projects and prototyping:
require 'mechanize'
require 'sqlite3'
class ProductScraper
  def initialize
    @agent = Mechanize.new
    @db = SQLite3::Database.new('scraped_data.db')
    setup_database
  end

  def scrape_products(base_url)
    page = @agent.get(base_url)

    page.search('.product').each do |product|
      name = product.at('.product-name')&.text&.strip
      price = product.at('.price')&.text&.gsub(/[^\d.]/, '')&.to_f
      description = product.at('.description')&.text&.strip
      url = product.at('a')&.[]('href')

      save_product(name, price, description, url) if name && url
    end
  end

  private

  def setup_database
    @db.execute <<-SQL
      CREATE TABLE IF NOT EXISTS products (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        name TEXT NOT NULL,
        price DECIMAL(10,2),
        description TEXT,
        url TEXT UNIQUE,
        scraped_at DATETIME DEFAULT CURRENT_TIMESTAMP
      )
    SQL
  end

  # INSERT OR REPLACE upserts on the UNIQUE url column
  def save_product(name, price, description, url)
    @db.execute(
      "INSERT OR REPLACE INTO products (name, price, description, url) VALUES (?, ?, ?, ?)",
      [name, price, description, url]
    )
  end
end
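A minimal usage sketch follows; the catalog URL and the CSS class names used above (.product, .product-name, and so on) are placeholders to adapt to the site you are scraping:

scraper = ProductScraper.new
scraper.scrape_products('https://example.com/products') # hypothetical listing page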
PostgreSQL Integration with ActiveRecord
For production applications, PostgreSQL with ActiveRecord provides robust data management:
require 'mechanize'
require 'active_record'
# Database configuration
ActiveRecord::Base.establish_connection(
  adapter: 'postgresql',
  host: 'localhost',
  database: 'scraping_db',
  username: 'your_username',
  password: 'your_password'
)

# Product model
class Product < ActiveRecord::Base
  validates :name, presence: true
  validates :url, presence: true, uniqueness: true

  # Uses the created_at timestamp added by t.timestamps in the migration below
  scope :recent, -> { where('created_at > ?', 1.day.ago) }
  scope :by_price_range, ->(min, max) { where(price: min..max) }
end

# Migration
class CreateProducts < ActiveRecord::Migration[7.0]
  def change
    create_table :products do |t|
      t.string :name, null: false
      t.decimal :price, precision: 10, scale: 2
      t.text :description
      t.string :url, null: false
      t.string :category
      t.json :metadata
      t.timestamps
    end

    add_index :products, :url, unique: true
    add_index :products, :category
    add_index :products, :price
  end
end
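# Because this script connects with establish_connection instead of running inside
# a full Rails app, apply the migration by hand (a sketch; in a Rails project you
# would run `rails db:migrate` instead). The table_exists? guard is just a convenience.
CreateProducts.migrate(:up) unless ActiveRecord::Base.connection.table_exists?(:products)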
class AdvancedProductScraper
  def initialize
    @agent = Mechanize.new
    @agent.user_agent_alias = 'Mac Safari'
    @agent.request_headers = {
      'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Language' => 'en-US,en;q=0.5'
    }
  end

  def scrape_with_pagination(base_url, max_pages = 10)
    current_page = 1

    while current_page <= max_pages
      begin
        page_url = "#{base_url}?page=#{current_page}"
        page = @agent.get(page_url)

        products = extract_products(page)
        break if products.empty?

        save_products_batch(products)
        current_page += 1

        # Rate limiting
        sleep(rand(1..3))
      rescue Mechanize::ResponseCodeError => e
        warn "Failed to fetch page #{current_page}: #{e.message}"
        break
      end
    end
  end

  private

  def extract_products(page)
    products = []

    page.search('.product-card').each do |card|
      product_data = {
        name: card.at('.title')&.text&.strip,
        price: extract_price(card.at('.price')&.text),
        description: card.at('.description')&.text&.strip,
        url: card.at('a')&.[]('href'),
        category: card.at('.category')&.text&.strip,
        metadata: extract_metadata(card)
      }

      products << product_data if product_data[:name] && product_data[:url]
    end

    products
  end

  def save_products_batch(products)
    Product.transaction do
      products.each do |product_data|
        # Only creates new rows; rows with an existing URL are left untouched
        Product.find_or_create_by(url: product_data[:url]) do |product|
          product.assign_attributes(product_data)
        end
      end
    end
  rescue ActiveRecord::RecordInvalid => e
    warn "Failed to save products: #{e.message}"
  end

  def extract_price(price_text)
    return nil unless price_text

    price_text.gsub(/[^\d.]/, '').to_f
  end

  def extract_metadata(card)
    {
      rating: card.at('.rating')&.text&.strip,
      reviews_count: card.at('.reviews')&.text&.gsub(/\D/, '')&.to_i,
      availability: card.at('.stock-status')&.text&.strip,
      scraped_from: card.at('a')&.[]('href')
    }
  end
end
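A usage sketch; the listing URL and the ?page= query parameter are assumptions about the target site's pagination scheme:

scraper = AdvancedProductScraper.new
scraper.scrape_with_pagination('https://example.com/catalog', 5)

puts Product.recent.count            # products saved in the last 24 hours
puts Product.by_price_range(10, 50).count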
MySQL Integration with Sequel
Sequel provides a lightweight alternative to ActiveRecord:
require 'mechanize'
require 'sequel'
require 'mysql2'
# Database connection
DB = Sequel.connect(
  adapter: 'mysql2',
  host: 'localhost',
  database: 'scraping_db',
  user: 'your_username',
  password: 'your_password'
)

# Create table
DB.create_table? :scraped_articles do
  primary_key :id
  String :title, null: false
  Text :content
  String :author
  DateTime :published_at
  String :source_url, unique: true
  DateTime :created_at, default: Sequel::CURRENT_TIMESTAMP
  DateTime :updated_at, default: Sequel::CURRENT_TIMESTAMP
end
class ArticleScraper
  def initialize
    @agent = Mechanize.new
    @articles = DB[:scraped_articles]
  end

  def scrape_news_site(base_url)
    page = @agent.get(base_url)

    page.search('.article-link').each do |link|
      article_url = resolve_url(base_url, link['href'])
      scrape_article(article_url) unless article_exists?(article_url)
    end
  end

  private

  def scrape_article(url)
    article_page = @agent.get(url)

    article_data = {
      title: article_page.at('h1')&.text&.strip,
      content: extract_content(article_page),
      author: article_page.at('.author')&.text&.strip,
      published_at: parse_date(article_page.at('.publish-date')&.text),
      source_url: url
    }

    save_article(article_data) if article_data[:title]
  rescue Mechanize::ResponseCodeError => e
    puts "Failed to scrape #{url}: #{e.message}"
  end

  def save_article(data)
    @articles.insert(data)
  rescue Sequel::UniqueConstraintViolation
    puts "Article already exists: #{data[:source_url]}"
  end

  def article_exists?(url)
    @articles.where(source_url: url).count > 0
  end

  def extract_content(page)
    content_selectors = ['.article-content', '.post-body', '.entry-content']

    content_selectors.each do |selector|
      content = page.at(selector)
      return content.text.strip if content
    end

    nil
  end

  def parse_date(date_string)
    return nil unless date_string

    Date.parse(date_string.strip)
  rescue Date::Error
    nil
  end

  def resolve_url(base_url, relative_url)
    URI.join(base_url, relative_url).to_s
  end
end
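Usage sketch; the index URL and the .article-link selector are placeholders for the site you are targeting:

scraper = ArticleScraper.new
scraper.scrape_news_site('https://example.com/news')

# Quick check of what landed in MySQL
puts DB[:scraped_articles].order(Sequel.desc(:created_at)).limit(5).map(:title)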
Advanced Database Patterns
Connection Pooling and Performance
For high-volume scraping operations, implement connection pooling:
require 'connection_pool'

class ScrapingService
  def initialize
    # Each pooled object is a Sequel::Database handle capped at one connection,
    # so the ConnectionPool size is the real concurrency limit. (Sequel also ships
    # its own thread-safe pool; this pattern is useful when you want a single
    # pooling abstraction shared across different resources.)
    @db_pool = ConnectionPool.new(size: 10, timeout: 5) do
      Sequel.connect(
        adapter: 'postgresql',
        host: ENV['DB_HOST'],
        database: ENV['DB_NAME'],
        user: ENV['DB_USER'],
        password: ENV['DB_PASSWORD'],
        max_connections: 1
      )
    end

    @agent = Mechanize.new
  end

  def scrape_with_pooling(urls)
    threads = []

    urls.each_slice(5) do |url_batch|
      threads << Thread.new do
        @db_pool.with do |db|
          scrape_batch(url_batch, db)
        end
      end
    end

    threads.each(&:join)
  end

  private

  def scrape_batch(urls, db)
    urls.each do |url|
      begin
        data = scrape_single_page(url)
        save_to_database(data, db) if data
        sleep(0.5) # Rate limiting
      rescue => e
        puts "Error scraping #{url}: #{e.message}"
      end
    end
  end
end
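scrape_single_page and save_to_database are left as placeholders above. A minimal sketch of what they might look like; the h1 selector, the :pages table, and the one-agent-per-call choice are all assumptions (the last because a single Mechanize agent should not be shared across threads):

def scrape_single_page(url)
  agent = Mechanize.new # fresh agent per call; Mechanize instances are not thread-safe
  page = agent.get(url)

  {
    title: page.at('h1')&.text&.strip, # hypothetical selector
    url: url,
    fetched_at: Time.now
  }
rescue Mechanize::ResponseCodeError
  nil
end

def save_to_database(data, db)
  # hypothetical :pages table with a unique constraint on url; skip duplicates
  db[:pages].insert_conflict.insert(data)
end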
Data Validation and Error Handling
Implement robust validation and error handling:
class ValidatedScraper
  def initialize
    @agent = Mechanize.new
    setup_database # app-specific: establish whichever connection and schema you use
  end

  private

  def save_with_validation(data)
    # Data validation
    errors = validate_data(data)

    if errors.empty?
      save_to_database(data) # app-specific persistence (ActiveRecord, Sequel, etc.)
    else
      log_validation_errors(data, errors)
    end
  rescue => e
    log_database_error(data, e)
  end

  def validate_data(data)
    errors = []

    errors << "Name is required" if data[:name].nil? || data[:name].strip.empty?
    errors << "URL is required" if data[:url].nil? || data[:url].strip.empty?
    errors << "Invalid URL format" unless valid_url?(data[:url])
    errors << "Price must be positive" if data[:price] && data[:price] < 0

    errors
  end

  def valid_url?(url)
    return false unless url.is_a?(String)

    uri = URI.parse(url)
    uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
  rescue URI::InvalidURIError
    false
  end

  def log_validation_errors(data, errors)
    puts "Validation errors for #{data[:url]}: #{errors.join(', ')}"
  end

  def log_database_error(data, error)
    puts "Database error for #{data[:url]}: #{error.message}"
  end
end
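setup_database and save_to_database above are intentionally left app-specific. A minimal sketch of save_to_database that reuses the ActiveRecord Product model from earlier (an assumption; swap in your own persistence layer):

def save_to_database(data)
  Product.find_or_create_by(url: data[:url]) do |product|
    product.assign_attributes(data)
  end
end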
Best Practices for Database Integration
1. Use Transactions for Batch Operations
def save_batch_with_transaction(products)
  ActiveRecord::Base.transaction do
    products.each do |product_data|
      Product.create!(product_data)
    end
  end
rescue ActiveRecord::RecordInvalid => e
  # The transaction has already been rolled back here; log and decide whether to retry the batch
  warn "Batch save failed: #{e.message}"
end
2. Implement Proper Indexing
-- Essential indexes for scraping data
CREATE INDEX idx_products_url ON products(url);
CREATE INDEX idx_products_scraped_at ON products(scraped_at);
CREATE INDEX idx_products_category ON products(category);
CREATE INDEX idx_products_price ON products(price);
3. Handle Duplicate Data
def handle_duplicates(url, new_data)
  existing = Product.find_by(url: url)

  if existing
    # Update only if the data has changed
    if data_changed?(existing, new_data)
      existing.update!(new_data.merge(updated_at: Time.current))
    end
  else
    Product.create!(new_data)
  end
end

def data_changed?(existing, new_data)
  %i[name price description].any? do |field|
    existing[field] != new_data[field]
  end
end
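For larger volumes, a set-based alternative is a single upsert. The sketch below assumes Rails 7-era ActiveRecord and the unique index on url created in the migration above; note that upsert_all bypasses model validations and callbacks:

def upsert_products(product_rows)
  # product_rows: array of attribute hashes, each including :url
  Product.upsert_all(
    product_rows,
    unique_by: :url,          # relies on the unique index on url
    record_timestamps: true   # fill created_at/updated_at since callbacks are skipped
  )
end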
Monitoring and Maintenance
Database Performance Monitoring
class ScrapingMetrics
  def self.log_scraping_stats
    total_products = Product.count
    recent_products = Product.where('created_at > ?', 1.day.ago).count
    avg_price = Product.average(:price)

    puts "Total products: #{total_products}"
    puts "Products scraped in the last 24 hours: #{recent_products}"
    puts "Average price: $#{avg_price&.round(2)}"
  end

  def self.cleanup_old_data
    # Remove records older than 90 days
    old_records = Product.where('created_at < ?', 90.days.ago)
    puts "Cleaning up #{old_records.count} old records"
    old_records.delete_all
  end
end
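These maintenance tasks are typically run on a schedule; a minimal sketch (cron, a rake task, or any job runner would work, the nightly cadence is just an example):

# e.g. invoked nightly from cron or a scheduled job
ScrapingMetrics.log_scraping_stats
ScrapingMetrics.cleanup_old_data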
Integrating Mechanize with databases provides a solid foundation for scalable web scraping applications. Whether you choose SQLite for simple projects or PostgreSQL for production systems, proper database integration ensures your scraped data is reliably stored and easily accessible for analysis and reporting.
For more advanced scraping scenarios, consider exploring how to handle authentication in Puppeteer when dealing with protected content, or learn about handling browser sessions in Puppeteer for complex multi-step scraping workflows.