Can Colly integrate with databases for storing scraped data?
Yes. Colly does not ship a database layer of its own, but as a Go web scraping framework it exposes flexible callbacks from which you can persist extracted data with any standard Go driver, to databases such as MySQL, PostgreSQL, MongoDB, and Redis. This integration is essential for building production-ready scrapers that need to store, process, and analyze large volumes of scraped data.
Understanding Colly's Callback System
Colly's database integration relies on its event-driven callback system. The most commonly used callbacks for database operations are:
OnHTML(): Triggered when HTML elements matching a selector are found
OnResponse(): Triggered when a response is received
OnScraped(): Triggered after all processing is complete
These callbacks provide the perfect entry points for database operations during the scraping process.
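As a quick orientation, a minimal collector that registers all three callbacks might look like the sketch below; the ".item" selector and example.com URL are placeholders, and the database-specific examples later in this answer build on the same structure.
package main

import (
    "log"

    "github.com/gocolly/colly/v2"
)

func main() {
    c := colly.NewCollector()

    // OnHTML fires for every element matching the selector
    c.OnHTML(".item", func(e *colly.HTMLElement) {
        log.Println("found item:", e.Text) // typically where you write rows to the database
    })

    // OnResponse fires once per received response
    c.OnResponse(func(r *colly.Response) {
        log.Println("visited:", r.Request.URL)
    })

    // OnScraped fires after all OnHTML callbacks for a page have finished
    c.OnScraped(func(r *colly.Response) {
        log.Println("finished:", r.Request.URL)
    })

    c.Visit("https://example.com")
}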
Integrating with SQL Databases
MySQL Integration
Here's how to integrate Colly with MySQL using the popular go-sql-driver/mysql package:
package main

import (
    "database/sql"
    "fmt"
    "log"

    "github.com/gocolly/colly/v2"
    _ "github.com/go-sql-driver/mysql"
)

type Product struct {
    Name  string
    Price string
    URL   string
}

func main() {
    // Initialize database connection
    db, err := sql.Open("mysql", "user:password@tcp(localhost:3306)/scraping_db")
    if err != nil {
        log.Fatal(err)
    }
    defer db.Close()

    // Create table if not exists
    createTable := `
    CREATE TABLE IF NOT EXISTS products (
        id INT AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(255),
        price VARCHAR(100),
        url VARCHAR(500),
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    )`
    if _, err := db.Exec(createTable); err != nil {
        log.Fatal(err)
    }

    // Initialize Colly
    c := colly.NewCollector()

    // Set up HTML callback to extract data and store in database
    c.OnHTML(".product", func(e *colly.HTMLElement) {
        product := Product{
            Name:  e.ChildText(".product-name"),
            Price: e.ChildText(".price"),
            URL:   e.Request.URL.String(),
        }

        // Insert into database
        insertSQL := "INSERT INTO products (name, price, url) VALUES (?, ?, ?)"
        if _, err := db.Exec(insertSQL, product.Name, product.Price, product.URL); err != nil {
            log.Printf("Error inserting product: %v", err)
        } else {
            fmt.Printf("Stored product: %s\n", product.Name)
        }
    })

    // Start scraping
    c.Visit("https://example-shop.com/products")
}
PostgreSQL Integration
PostgreSQL integration follows a similar pattern using the lib/pq driver:
package main

import (
    "database/sql"
    "log"

    "github.com/gocolly/colly/v2"
    _ "github.com/lib/pq"
)

func main() {
    // Connect to PostgreSQL
    db, err := sql.Open("postgres", "user=username dbname=scraping_db sslmode=disable")
    if err != nil {
        log.Fatal(err)
    }
    defer db.Close()

    // Create table
    createTable := `
    CREATE TABLE IF NOT EXISTS articles (
        id SERIAL PRIMARY KEY,
        title TEXT,
        content TEXT,
        author VARCHAR(255),
        published_date TIMESTAMP,
        url TEXT UNIQUE,
        created_at TIMESTAMP DEFAULT NOW()
    )`
    if _, err := db.Exec(createTable); err != nil {
        log.Fatal(err)
    }

    c := colly.NewCollector()

    c.OnHTML("article", func(e *colly.HTMLElement) {
        title := e.ChildText("h1")
        content := e.ChildText(".content")
        author := e.ChildText(".author")

        // Use UPSERT to avoid duplicates
        upsertSQL := `
            INSERT INTO articles (title, content, author, url)
            VALUES ($1, $2, $3, $4)
            ON CONFLICT (url) DO UPDATE SET
                title = EXCLUDED.title,
                content = EXCLUDED.content,
                author = EXCLUDED.author
        `
        if _, err := db.Exec(upsertSQL, title, content, author, e.Request.URL.String()); err != nil {
            log.Printf("Database error: %v", err)
        }
    })

    c.Visit("https://news-site.com")
}
Integrating with NoSQL Databases
MongoDB Integration
MongoDB integration using the official Go driver provides flexible document storage:
package main

import (
    "context"
    "log"
    "strings"
    "time"

    "github.com/gocolly/colly/v2"
    "go.mongodb.org/mongo-driver/mongo"
    "go.mongodb.org/mongo-driver/mongo/options"
)

type ScrapedData struct {
    Title       string    `bson:"title"`
    Description string    `bson:"description"`
    URL         string    `bson:"url"`
    ScrapedAt   time.Time `bson:"scraped_at"`
    Tags        []string  `bson:"tags"`
}

func main() {
    // Connect to MongoDB
    client, err := mongo.Connect(context.TODO(), options.Client().ApplyURI("mongodb://localhost:27017"))
    if err != nil {
        log.Fatal(err)
    }
    defer client.Disconnect(context.TODO())

    collection := client.Database("scraping_db").Collection("pages")

    c := colly.NewCollector()

    c.OnHTML("html", func(e *colly.HTMLElement) {
        data := ScrapedData{
            Title:       e.ChildText("title"),
            Description: e.ChildAttr("meta[name=description]", "content"),
            URL:         e.Request.URL.String(),
            ScrapedAt:   time.Now(),
            Tags:        []string{},
        }

        // Extract tags from the keywords meta tag
        e.ForEach("meta[name=keywords]", func(_ int, el *colly.HTMLElement) {
            if keywords := el.Attr("content"); keywords != "" {
                // Split the comma-separated keywords and add each one as a tag
                for _, kw := range strings.Split(keywords, ",") {
                    data.Tags = append(data.Tags, strings.TrimSpace(kw))
                }
            }
        })

        // Insert document
        if _, err := collection.InsertOne(context.TODO(), data); err != nil {
            log.Printf("MongoDB insert error: %v", err)
        }
    })

    c.Visit("https://example.com")
}
Advanced Database Integration Patterns
Batch Processing for Performance
For high-volume scraping, implement batch processing to improve database performance:
type BatchProcessor struct {
    db        *sql.DB
    batch     []Product
    batchSize int
}

func NewBatchProcessor(db *sql.DB, size int) *BatchProcessor {
    return &BatchProcessor{
        db:        db,
        batch:     make([]Product, 0, size),
        batchSize: size,
    }
}

func (bp *BatchProcessor) Add(product Product) error {
    bp.batch = append(bp.batch, product)
    if len(bp.batch) >= bp.batchSize {
        return bp.Flush()
    }
    return nil
}

func (bp *BatchProcessor) Flush() error {
    if len(bp.batch) == 0 {
        return nil
    }

    // Prepare batch insert
    valueStrings := make([]string, 0, len(bp.batch))
    valueArgs := make([]interface{}, 0, len(bp.batch)*3)
    for _, product := range bp.batch {
        valueStrings = append(valueStrings, "(?, ?, ?)")
        valueArgs = append(valueArgs, product.Name, product.Price, product.URL)
    }

    stmt := fmt.Sprintf("INSERT INTO products (name, price, url) VALUES %s",
        strings.Join(valueStrings, ","))
    if _, err := bp.db.Exec(stmt, valueArgs...); err != nil {
        return err
    }

    // Clear batch
    bp.batch = bp.batch[:0]
    return nil
}
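To hook the batch processor into a crawl, one option is to add products from the OnHTML callback and flush the remainder once Visit returns (the default collector is synchronous). A minimal sketch, assuming the Product and BatchProcessor types above plus an open *sql.DB named db; the selector and URL are placeholders:
c := colly.NewCollector()
bp := NewBatchProcessor(db, 100) // flush after every 100 products

c.OnHTML(".product", func(e *colly.HTMLElement) {
    p := Product{
        Name:  e.ChildText(".product-name"),
        Price: e.ChildText(".price"),
        URL:   e.Request.URL.String(),
    }
    if err := bp.Add(p); err != nil {
        log.Printf("Batch insert failed: %v", err)
    }
})

c.Visit("https://example-shop.com/products")

// Visit returns after the crawl on a synchronous collector,
// so flush whatever is left in the final partial batch.
if err := bp.Flush(); err != nil {
    log.Printf("Final flush failed: %v", err)
}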
Connection Pooling and Error Handling
Implement robust connection pooling and error handling for production environments:
func setupDatabase() *sql.DB {
    db, err := sql.Open("mysql", "user:password@tcp(localhost:3306)/scraping_db")
    if err != nil {
        log.Fatal(err)
    }

    // Configure connection pool
    db.SetMaxOpenConns(25)
    db.SetMaxIdleConns(25)
    db.SetConnMaxLifetime(5 * time.Minute)

    // Test connection
    if err := db.Ping(); err != nil {
        log.Fatal("Database connection failed:", err)
    }

    return db
}

func handleDatabaseError(err error, data interface{}) {
    log.Printf("Database operation failed: %v", err)

    // Implement retry logic or dead letter queue
    // Store failed data for later processing
    saveToRetryQueue(data)
}
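The saveToRetryQueue call above is a placeholder. One possible sketch, assuming the encoding/json, log, and os imports, is to append failed records as JSON lines to a local file and replay them later with a separate job:
// Hypothetical retry queue: append failed records as JSON lines to a file
// so they can be re-processed by a separate job.
func saveToRetryQueue(data interface{}) {
    f, err := os.OpenFile("retry_queue.jsonl", os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
    if err != nil {
        log.Printf("Could not open retry queue: %v", err)
        return
    }
    defer f.Close()

    line, err := json.Marshal(data)
    if err != nil {
        log.Printf("Could not serialize failed record: %v", err)
        return
    }
    if _, err := f.Write(append(line, '\n')); err != nil {
        log.Printf("Could not write to retry queue: %v", err)
    }
}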
Database Schema Design Best Practices
Optimized Table Structure
Design your database schema with scraping requirements in mind:
-- Optimized products table
CREATE TABLE products (
    id BIGINT PRIMARY KEY AUTO_INCREMENT,
    name VARCHAR(500) NOT NULL,
    price DECIMAL(10,2),
    currency VARCHAR(3),
    url VARCHAR(1000) NOT NULL,
    image_url VARCHAR(1000),
    category VARCHAR(255),
    brand VARCHAR(255),
    availability ENUM('in_stock', 'out_of_stock', 'discontinued'),
    scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    INDEX idx_url (url(255)),
    INDEX idx_category (category),
    INDEX idx_brand (brand),
    INDEX idx_scraped_at (scraped_at)
);

-- Price history tracking
CREATE TABLE price_history (
    id BIGINT PRIMARY KEY AUTO_INCREMENT,
    product_id BIGINT,
    price DECIMAL(10,2),
    recorded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (product_id) REFERENCES products(id),
    INDEX idx_product_date (product_id, recorded_at)
);
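Once both tables exist, a common pattern is to insert the product row and a matching price_history row in a single transaction, so every scrape adds a new price point. A minimal sketch, assuming db is a *sql.DB opened with the MySQL driver and that the scraped price string has already been parsed into a float64:
// Sketch: store a product and its current price atomically.
// The helper name and signature are illustrative, not part of Colly.
func storeProductWithHistory(db *sql.DB, name, url string, price float64) error {
    tx, err := db.Begin()
    if err != nil {
        return err
    }
    defer tx.Rollback() // no-op after a successful Commit

    res, err := tx.Exec(
        "INSERT INTO products (name, price, url) VALUES (?, ?, ?)",
        name, price, url)
    if err != nil {
        return err
    }
    productID, err := res.LastInsertId()
    if err != nil {
        return err
    }

    if _, err := tx.Exec(
        "INSERT INTO price_history (product_id, price) VALUES (?, ?)",
        productID, price); err != nil {
        return err
    }
    return tx.Commit()
}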
Monitoring and Maintenance
Database Health Monitoring
Implement monitoring to track database performance during scraping operations:
type DBMonitor struct {
    db *sql.DB
}

func (m *DBMonitor) LogStats() {
    stats := m.db.Stats()
    log.Printf("DB Stats - Open: %d, InUse: %d, Idle: %d",
        stats.OpenConnections, stats.InUse, stats.Idle)
}

func (m *DBMonitor) CheckHealth() error {
    ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
    defer cancel()
    return m.db.PingContext(ctx)
}
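One way to use this monitor during a long-running crawl is to log pool statistics from a background goroutine on a fixed interval; a brief sketch (the 30-second interval is arbitrary, and db is the *sql.DB used by the scraper):
// Sketch: periodically log pool statistics and check connectivity
// while the collector is running.
monitor := &DBMonitor{db: db}

ticker := time.NewTicker(30 * time.Second)
defer ticker.Stop()

go func() {
    for range ticker.C {
        monitor.LogStats()
        if err := monitor.CheckHealth(); err != nil {
            log.Printf("Database health check failed: %v", err)
        }
    }
}()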
Conclusion
Colly's flexible architecture makes database integration straightforward and powerful. Whether you're using SQL databases like MySQL and PostgreSQL or NoSQL solutions like MongoDB, Colly's callback system provides the perfect hooks for data persistence. By implementing proper error handling, connection pooling, and batch processing, you can build robust, scalable web scraping applications that efficiently store and manage large volumes of scraped data.
The key to successful database integration with Colly lies in understanding your data requirements, choosing the right database technology, and implementing proper performance optimization techniques. With these patterns and best practices, you can build production-ready scraping systems that scale with your data collection needs.