How do I implement database connections in Go scraping projects?
Implementing robust database connections is crucial for Go web scraping projects that need to store and manage scraped data efficiently. This guide covers everything from basic database setup to advanced connection pooling strategies for high-performance scraping applications.
Database Driver Selection
Go offers excellent database drivers for popular databases. Here are the most commonly used options for scraping projects:
PostgreSQL with pgx
import (
"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgxpool"
)
MySQL with go-sql-driver
import (
"database/sql"
_ "github.com/go-sql-driver/mysql"
)
MongoDB with official driver
import (
"go.mongodb.org/mongo-driver/mongo"
"go.mongodb.org/mongo-driver/mongo/options"
)
PostgreSQL Connection Implementation
PostgreSQL is often preferred for scraping projects due to its JSON support and reliability. Here's a complete implementation:
Basic Connection Setup
package main
import (
"context"
"fmt"
"log"
"time"
"github.com/jackc/pgx/v5" // needed for pgx.Batch in the batch insert below
"github.com/jackc/pgx/v5/pgxpool"
)
type Database struct {
Pool *pgxpool.Pool
}
func NewDatabase(databaseURL string) (*Database, error) {
config, err := pgxpool.ParseConfig(databaseURL)
if err != nil {
return nil, fmt.Errorf("failed to parse database URL: %w", err)
}
// Configure connection pool
config.MaxConns = 30
config.MinConns = 5
config.MaxConnLifetime = time.Hour
config.MaxConnIdleTime = time.Minute * 30
pool, err := pgxpool.NewWithConfig(context.Background(), config)
if err != nil {
return nil, fmt.Errorf("failed to create connection pool: %w", err)
}
// Test connection
if err := pool.Ping(context.Background()); err != nil {
return nil, fmt.Errorf("failed to ping database: %w", err)
}
return &Database{Pool: pool}, nil
}
func (db *Database) Close() {
db.Pool.Close()
}
Scraped Data Models
type ScrapedProduct struct {
ID int64 `json:"id"`
Name string `json:"name"`
Price float64 `json:"price"`
Description string `json:"description"`
URL string `json:"url"`
ImageURLs []string `json:"image_urls"`
Metadata map[string]interface{} `json:"metadata"`
ScrapedAt time.Time `json:"scraped_at"`
}
type ScrapingJob struct {
ID int64 `json:"id"`
URL string `json:"url"`
Status string `json:"status"`
StartedAt time.Time `json:"started_at"`
CompletedAt *time.Time `json:"completed_at,omitempty"`
ErrorMsg *string `json:"error_msg,omitempty"`
}
Database Operations
func (db *Database) CreateProduct(ctx context.Context, product *ScrapedProduct) error {
query := `
INSERT INTO scraped_products (name, price, description, url, image_urls, metadata, scraped_at)
VALUES ($1, $2, $3, $4, $5, $6, $7)
RETURNING id`
err := db.Pool.QueryRow(ctx, query,
product.Name,
product.Price,
product.Description,
product.URL,
product.ImageURLs,
product.Metadata,
product.ScrapedAt,
).Scan(&product.ID)
if err != nil {
return fmt.Errorf("failed to insert product: %w", err)
}
return nil
}
func (db *Database) BatchInsertProducts(ctx context.Context, products []ScrapedProduct) error {
batch := &pgx.Batch{}
for _, product := range products {
batch.Queue(`
INSERT INTO scraped_products (name, price, description, url, image_urls, metadata, scraped_at)
VALUES ($1, $2, $3, $4, $5, $6, $7)`,
product.Name,
product.Price,
product.Description,
product.URL,
product.ImageURLs,
product.Metadata,
product.ScrapedAt,
)
}
results := db.Pool.SendBatch(ctx, batch)
defer results.Close()
for range products {
_, err := results.Exec()
if err != nil {
return fmt.Errorf("failed to execute batch insert: %w", err)
}
}
return nil
}
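If batches grow into the thousands of rows, pgx also exposes PostgreSQL's COPY protocol via CopyFrom, which is usually faster than queued INSERTs. A minimal sketch under that assumption, limited to a few columns for brevity (adapt the column list to your table):

// BulkCopyProducts loads rows via the COPY protocol; typically the fastest
// option when inserting very large batches at once.
func (db *Database) BulkCopyProducts(ctx context.Context, products []ScrapedProduct) error {
    rows := make([][]any, len(products))
    for i, p := range products {
        rows[i] = []any{p.Name, p.Price, p.Description, p.URL, p.ScrapedAt}
    }
    copied, err := db.Pool.CopyFrom(ctx,
        pgx.Identifier{"scraped_products"},
        []string{"name", "price", "description", "url", "scraped_at"},
        pgx.CopyFromRows(rows),
    )
    if err != nil {
        return fmt.Errorf("copy failed: %w", err)
    }
    if copied != int64(len(products)) {
        return fmt.Errorf("expected to copy %d rows, copied %d", len(products), copied)
    }
    return nil
}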
MySQL Connection Implementation
For projects using MySQL, here's a robust implementation:
package database
import (
"database/sql"
"fmt"
"time"
_ "github.com/go-sql-driver/mysql"
)
type MySQLDB struct {
DB *sql.DB
}
func NewMySQL(dsn string) (*MySQLDB, error) {
db, err := sql.Open("mysql", dsn)
if err != nil {
return nil, fmt.Errorf("failed to open database: %w", err)
}
// Configure connection pool
db.SetMaxOpenConns(25)
db.SetMaxIdleConns(5)
db.SetConnMaxLifetime(5 * time.Minute)
db.SetConnMaxIdleTime(time.Minute)
if err := db.Ping(); err != nil {
return nil, fmt.Errorf("failed to ping database: %w", err)
}
return &MySQLDB{DB: db}, nil
}
func (m *MySQLDB) InsertScrapedData(product ScrapedProduct) error {
query := `
INSERT INTO scraped_products (name, price, description, url, scraped_at)
VALUES (?, ?, ?, ?, ?)`
_, err := m.DB.Exec(query,
product.Name,
product.Price,
product.Description,
product.URL,
product.ScrapedAt,
)
if err != nil {
return fmt.Errorf("failed to insert product: %w", err)
}
return nil
}
MongoDB Connection Implementation
For projects requiring flexible schema storage, MongoDB is an excellent choice:
package database
import (
"context"
"fmt"
"time"
"go.mongodb.org/mongo-driver/mongo"
"go.mongodb.org/mongo-driver/mongo/options"
"go.mongodb.org/mongo-driver/bson"
)
type MongoDB struct {
Client *mongo.Client
Database *mongo.Database
}
func NewMongoDB(uri, dbName string) (*MongoDB, error) {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
clientOptions := options.Client().ApplyURI(uri)
clientOptions.SetMaxPoolSize(10)
clientOptions.SetMinPoolSize(2)
clientOptions.SetMaxConnIdleTime(30 * time.Second)
client, err := mongo.Connect(ctx, clientOptions)
if err != nil {
return nil, fmt.Errorf("failed to connect to MongoDB: %w", err)
}
if err := client.Ping(ctx, nil); err != nil {
return nil, fmt.Errorf("failed to ping MongoDB: %w", err)
}
return &MongoDB{
Client: client,
Database: client.Database(dbName),
}, nil
}
func (m *MongoDB) InsertScrapedProduct(ctx context.Context, product ScrapedProduct) error {
collection := m.Database.Collection("scraped_products")
_, err := collection.InsertOne(ctx, product)
if err != nil {
return fmt.Errorf("failed to insert product: %w", err)
}
return nil
}
func (m *MongoDB) BulkInsertProducts(ctx context.Context, products []ScrapedProduct) error {
collection := m.Database.Collection("scraped_products")
documents := make([]interface{}, len(products))
for i, product := range products {
documents[i] = product
}
_, err := collection.InsertMany(ctx, documents)
if err != nil {
return fmt.Errorf("failed to bulk insert products: %w", err)
}
return nil
}
Complete Scraping Application Example
Here's a complete example that integrates database connections with a web scraping workflow:
package main
import (
"context"
"fmt"
"log"
"net/http"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/jackc/pgx/v5/pgxpool"
)
type Scraper struct {
db *Database
client *http.Client
}
func NewScraper(db *Database) *Scraper {
client := &http.Client{
Timeout: 30 * time.Second,
}
return &Scraper{
db: db,
client: client,
}
}
func (s *Scraper) ScrapeProductPage(ctx context.Context, url string) error {
// Build the request with the caller's context so cancellation propagates to the HTTP call
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return fmt.Errorf("failed to build request: %w", err)
}
resp, err := s.client.Do(req)
if err != nil {
return fmt.Errorf("failed to fetch page: %w", err)
}
defer resp.Body.Close()
doc, err := goquery.NewDocumentFromReader(resp.Body)
if err != nil {
return fmt.Errorf("failed to parse HTML: %w", err)
}
var products []ScrapedProduct
doc.Find(".product-item").Each(func(i int, s *goquery.Selection) {
name := strings.TrimSpace(s.Find(".product-name").Text())
priceText := strings.TrimSpace(s.Find(".price").Text())
description := strings.TrimSpace(s.Find(".description").Text())
// Parse price (simplified)
var price float64
fmt.Sscanf(priceText, "$%f", &price)
product := ScrapedProduct{
Name: name,
Price: price,
Description: description,
URL: url,
ScrapedAt: time.Now(),
}
products = append(products, product)
})
// Batch insert for better performance
if len(products) > 0 {
return s.db.BatchInsertProducts(ctx, products)
}
return nil
}
func main() {
databaseURL := "postgres://user:password@localhost/scraping_db?sslmode=disable"
db, err := NewDatabase(databaseURL)
if err != nil {
log.Fatal("Failed to connect to database:", err)
}
defer db.Close()
scraper := NewScraper(db)
urls := []string{
"https://example-store.com/products",
"https://example-store.com/electronics",
}
for _, url := range urls {
if err := scraper.ScrapeProductPage(context.Background(), url); err != nil {
log.Printf("Failed to scrape %s: %v", url, err)
}
}
}
Connection Pooling Best Practices
Proper connection pooling is essential for high-performance scraping applications:
Pool Configuration Guidelines
// For PostgreSQL pgxpool
config.MaxConns = 30 // Maximum connections
config.MinConns = 5 // Minimum idle connections
config.MaxConnLifetime = time.Hour // Maximum connection lifetime
config.MaxConnIdleTime = 30 * time.Minute // Maximum idle time
// For MySQL database/sql
db.SetMaxOpenConns(25) // Maximum open connections
db.SetMaxIdleConns(5) // Maximum idle connections
db.SetConnMaxLifetime(5 * time.Minute) // Connection lifetime
db.SetConnMaxIdleTime(time.Minute) // Connection idle time
Pool Monitoring
func (db *Database) LogPoolStats() {
stats := db.Pool.Stat()
log.Printf("Pool stats - Total: %d, Idle: %d, Used: %d",
stats.TotalConns(),
stats.IdleConns(),
stats.AcquiredConns(),
)
}
Error Handling and Retries
Implement robust error handling for database operations:
func (db *Database) InsertWithRetry(ctx context.Context, product *ScrapedProduct, maxRetries int) error {
var err error
for i := 0; i < maxRetries; i++ {
err = db.CreateProduct(ctx, product)
if err == nil {
return nil
}
// Check if error is retryable
if !isRetryableError(err) {
return err
}
// Exponential backoff: 1s, 2s, 4s, ...
waitTime := time.Duration(1<<i) * time.Second
time.Sleep(waitTime)
}
return fmt.Errorf("failed after %d retries: %w", maxRetries, err)
}
func isRetryableError(err error) bool {
// Check for temporary network errors, connection issues, etc.
return strings.Contains(err.Error(), "connection") ||
strings.Contains(err.Error(), "timeout")
}
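The string matching above is intentionally simplistic. With pgx you can inspect the SQLSTATE code of the error instead. A sketch under that assumption, requiring "errors" and "github.com/jackc/pgx/v5/pgconn" in the imports:

// isRetryablePgError inspects the PostgreSQL SQLSTATE rather than the error
// text. Class 08 codes are connection exceptions; 40001 and 40P01 are
// serialization failures and deadlocks, all of which are safe to retry.
func isRetryablePgError(err error) bool {
    var pgErr *pgconn.PgError
    if errors.As(err, &pgErr) {
        return strings.HasPrefix(pgErr.Code, "08") ||
            pgErr.Code == "40001" ||
            pgErr.Code == "40P01"
    }
    return false
}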
Database Schema Setup
Create appropriate database tables for your scraping data:
PostgreSQL Schema
CREATE TABLE scraped_products (
id SERIAL PRIMARY KEY,
name VARCHAR(255) NOT NULL,
price DECIMAL(10,2),
description TEXT,
url VARCHAR(500) NOT NULL,
image_urls TEXT[],
metadata JSONB,
scraped_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
);
CREATE INDEX idx_scraped_products_url ON scraped_products(url);
CREATE INDEX idx_scraped_products_scraped_at ON scraped_products(scraped_at);
CREATE INDEX idx_scraped_products_metadata ON scraped_products USING GIN(metadata);
MySQL Schema
CREATE TABLE scraped_products (
id INT AUTO_INCREMENT PRIMARY KEY,
name VARCHAR(255) NOT NULL,
price DECIMAL(10,2),
description TEXT,
url VARCHAR(500) NOT NULL,
image_urls JSON,
metadata JSON,
scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX idx_url ON scraped_products(url);
CREATE INDEX idx_scraped_at ON scraped_products(scraped_at);
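If you store data in MongoDB instead, there is no DDL step; comparable indexes are created programmatically. A minimal sketch using the MongoDB type defined earlier (note that without bson struct tags the driver stores fields under lowercased names such as url, so index keys must match those names):

// EnsureIndexes creates an index on the url field, mirroring idx_url above.
func (m *MongoDB) EnsureIndexes(ctx context.Context) error {
    collection := m.Database.Collection("scraped_products")
    _, err := collection.Indexes().CreateOne(ctx, mongo.IndexModel{
        Keys: bson.D{{Key: "url", Value: 1}},
    })
    if err != nil {
        return fmt.Errorf("failed to create url index: %w", err)
    }
    return nil
}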
Advanced Connection Management
Connection Health Monitoring
func (db *Database) HealthCheck(ctx context.Context) error {
// Simple ping test
if err := db.Pool.Ping(ctx); err != nil {
return fmt.Errorf("database ping failed: %w", err)
}
// More comprehensive check
var result int
err := db.Pool.QueryRow(ctx, "SELECT 1").Scan(&result)
if err != nil {
return fmt.Errorf("database query test failed: %w", err)
}
return nil
}
// Periodic health monitoring
func (db *Database) StartHealthMonitor(ctx context.Context, interval time.Duration) {
ticker := time.NewTicker(interval)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
if err := db.HealthCheck(ctx); err != nil {
log.Printf("Database health check failed: %v", err)
}
}
}
}
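In practice you would typically launch this monitor from main in its own goroutine, for example go db.StartHealthMonitor(ctx, 30*time.Second), so it runs alongside the scraping workers and stops when the context is cancelled.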
Transaction Management
func (db *Database) InsertProductWithTx(ctx context.Context, product *ScrapedProduct) error {
tx, err := db.Pool.Begin(ctx)
if err != nil {
return fmt.Errorf("failed to begin transaction: %w", err)
}
defer tx.Rollback(ctx)
// Insert product
err = tx.QueryRow(ctx, `
INSERT INTO scraped_products (name, price, description, url, scraped_at)
VALUES ($1, $2, $3, $4, $5)
RETURNING id`,
product.Name, product.Price, product.Description, product.URL, product.ScrapedAt,
).Scan(&product.ID)
if err != nil {
return fmt.Errorf("failed to insert product: %w", err)
}
// Insert related metadata if needed
if len(product.ImageURLs) > 0 {
for _, imageURL := range product.ImageURLs {
_, err = tx.Exec(ctx, `
INSERT INTO product_images (product_id, image_url)
VALUES ($1, $2)`,
product.ID, imageURL,
)
if err != nil {
return fmt.Errorf("failed to insert image: %w", err)
}
}
}
return tx.Commit(ctx)
}
Performance Optimization Tips
- Use Batch Operations: Always prefer batch inserts over individual inserts for better performance
- Prepare Statements: Use prepared statements for frequently executed queries (see the sketch after this list)
- Connection Pooling: Configure appropriate pool sizes based on your workload
- Indexing: Create proper indexes on frequently queried columns
- Transactions: Use transactions for related operations to ensure data consistency
- Monitoring: Implement connection pool monitoring to optimize performance
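On the prepared-statements point: pgx's default query mode already prepares and caches statements for you, so explicit preparation matters mostly for database/sql drivers such as go-sql-driver/mysql. A minimal sketch reusing the MySQLDB type from above (InsertManyPrepared is an illustrative helper, not part of the earlier code):

// InsertManyPrepared prepares the INSERT once and reuses it for every row,
// avoiding a parse round trip per insert.
func (m *MySQLDB) InsertManyPrepared(ctx context.Context, products []ScrapedProduct) error {
    stmt, err := m.DB.PrepareContext(ctx, `
        INSERT INTO scraped_products (name, price, description, url, scraped_at)
        VALUES (?, ?, ?, ?, ?)`)
    if err != nil {
        return fmt.Errorf("failed to prepare statement: %w", err)
    }
    defer stmt.Close()

    for _, p := range products {
        if _, err := stmt.ExecContext(ctx, p.Name, p.Price, p.Description, p.URL, p.ScrapedAt); err != nil {
            return fmt.Errorf("failed to insert %s: %w", p.URL, err)
        }
    }
    return nil
}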
When building large-scale scraping applications, you might also want to consider implementing concurrent scraping strategies to maximize throughput while maintaining database performance.
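A minimal sketch of that idea, bounding concurrency with a semaphore channel so the number of in-flight scrapes stays below the pool's MaxConns (requires "sync" in the imports; scrapeConcurrently and maxWorkers are illustrative names, not part of the code above):

// scrapeConcurrently runs ScrapeProductPage for each URL with at most
// maxWorkers pages in flight, so scraping never starves the database
// of connections.
func scrapeConcurrently(ctx context.Context, scraper *Scraper, urls []string, maxWorkers int) {
    sem := make(chan struct{}, maxWorkers)
    var wg sync.WaitGroup
    for _, url := range urls {
        wg.Add(1)
        sem <- struct{}{} // acquire a worker slot; blocks when maxWorkers are busy
        go func(u string) {
            defer wg.Done()
            defer func() { <-sem }() // release the slot
            if err := scraper.ScrapeProductPage(ctx, u); err != nil {
                log.Printf("Failed to scrape %s: %v", u, err)
            }
        }(url)
    }
    wg.Wait()
}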
For applications requiring JavaScript-rendered content, consider how database connections integrate with headless browser workflows for comprehensive data extraction pipelines.
By following these patterns and best practices, you'll have a robust foundation for storing and managing scraped data in your Go applications, ensuring both performance and reliability as your scraping operations scale.