What is the Best Way to Store Scraped Data in Go Applications?
Storing scraped data efficiently is crucial for Go web scraping applications. The right storage method depends on factors such as data volume, query patterns, performance needs, and durability requirements. This guide explores the main storage options with practical implementations.
Storage Options Overview
1. Relational Databases (PostgreSQL, MySQL)
Relational databases are ideal for structured data with complex relationships and when you need ACID compliance.
PostgreSQL Implementation
package main
import (
"database/sql"
"encoding/json"
"time"
_ "github.com/lib/pq"
)
type ScrapedData struct {
ID int `json:"id"`
URL string `json:"url"`
Title string `json:"title"`
Content string `json:"content"`
Metadata map[string]interface{} `json:"metadata"` // stored as JSONB
ScrapedAt time.Time `json:"scraped_at"`
}
type PostgreSQLStorage struct {
db *sql.DB
}
func NewPostgreSQLStorage(connectionString string) (*PostgreSQLStorage, error) {
db, err := sql.Open("postgres", connectionString)
if err != nil {
return nil, err
}
// sql.Open only validates the DSN; Ping verifies the database is reachable.
if err := db.Ping(); err != nil {
return nil, err
}
storage := &PostgreSQLStorage{db: db}
if err := storage.createTables(); err != nil {
return nil, err
}
return storage, nil
}
func (p *PostgreSQLStorage) createTables() error {
query := `
CREATE TABLE IF NOT EXISTS scraped_data (
id SERIAL PRIMARY KEY,
url VARCHAR(2048) NOT NULL,
title TEXT,
content TEXT,
metadata JSONB,
scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(url)
);
CREATE INDEX IF NOT EXISTS idx_scraped_at ON scraped_data(scraped_at);
CREATE INDEX IF NOT EXISTS idx_url ON scraped_data(url);
`
_, err := p.db.Exec(query)
return err
}
func (p *PostgreSQLStorage) Store(data ScrapedData) error {
query := `
INSERT INTO scraped_data (url, title, content, metadata, scraped_at)
VALUES ($1, $2, $3, $4, $5)
ON CONFLICT (url) DO UPDATE SET
title = EXCLUDED.title,
content = EXCLUDED.content,
metadata = EXCLUDED.metadata,
scraped_at = EXCLUDED.scraped_at
`
metadataJSON, err := json.Marshal(data.Metadata)
if err != nil {
return err
}
_, err = p.db.Exec(query, data.URL, data.Title, data.Content,
string(metadataJSON), data.ScrapedAt)
return err
}
func (p *PostgreSQLStorage) GetByURL(url string) (*ScrapedData, error) {
query := `
SELECT id, url, title, content, metadata, scraped_at
FROM scraped_data WHERE url = $1
`
var data ScrapedData
var metadataJSON string
err := p.db.QueryRow(query, url).Scan(
&data.ID, &data.URL, &data.Title, &data.Content,
&metadataJSON, &data.ScrapedAt,
)
if err != nil {
return nil, err
}
if err := json.Unmarshal([]byte(metadataJSON), &data.Metadata); err != nil {
return nil, err
}
return &data, nil
}
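A minimal usage sketch, assuming the PostgreSQLStorage code above lives in the same package and a local PostgreSQL instance is available; the DSN and all values below are placeholders:

```go
package main

import (
    "log"
    "time"
)

func main() {
    // Placeholder DSN; adjust credentials, host, and database name for your setup.
    storage, err := NewPostgreSQLStorage("postgres://user:pass@localhost:5432/scraper?sslmode=disable")
    if err != nil {
        log.Fatal(err)
    }

    item := ScrapedData{
        URL:       "https://example.com/article/1",
        Title:     "Example Article",
        Content:   "Article body...",
        Metadata:  map[string]interface{}{"source": "example"},
        ScrapedAt: time.Now(),
    }
    if err := storage.Store(item); err != nil {
        log.Fatal(err)
    }

    saved, err := storage.GetByURL(item.URL)
    if err != nil {
        log.Fatal(err)
    }
    log.Printf("stored %q at %v", saved.Title, saved.ScrapedAt)
}
```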
2. NoSQL Databases (MongoDB)
MongoDB is excellent for flexible schema requirements and rapid development.
package main
import (
"context"
"time"
"go.mongodb.org/mongo-driver/bson"
"go.mongodb.org/mongo-driver/mongo"
"go.mongodb.org/mongo-driver/mongo/options"
)
type MongoStorage struct {
client *mongo.Client
collection *mongo.Collection
}
type MongoDocument struct {
ID interface{} `bson:"_id,omitempty"`
URL string `bson:"url"`
Title string `bson:"title"`
Content string `bson:"content"`
Tags []string `bson:"tags"`
Metadata interface{} `bson:"metadata"`
ScrapedAt time.Time `bson:"scraped_at"`
}
func NewMongoStorage(uri, database, collection string) (*MongoStorage, error) {
client, err := mongo.Connect(context.TODO(), options.Client().ApplyURI(uri))
if err != nil {
return nil, err
}
coll := client.Database(database).Collection(collection)
// Create indexes
indexModel := mongo.IndexModel{
Keys: bson.D{{"url", 1}},
Options: options.Index().SetUnique(true),
}
_, err = coll.Indexes().CreateOne(context.TODO(), indexModel)
if err != nil {
return nil, err
}
return &MongoStorage{
client: client,
collection: coll,
}, nil
}
func (m *MongoStorage) Store(doc MongoDocument) error {
filter := bson.D{{"url", doc.URL}}
update := bson.D{{"$set", doc}}
opts := options.Update().SetUpsert(true)
_, err := m.collection.UpdateOne(context.TODO(), filter, update, opts)
return err
}
func (m *MongoStorage) FindByTags(tags []string) ([]MongoDocument, error) {
filter := bson.D{{"tags", bson.D{{"$in", tags}}}}
cursor, err := m.collection.Find(context.TODO(), filter)
if err != nil {
return nil, err
}
defer cursor.Close(context.TODO())
var results []MongoDocument
if err = cursor.All(context.TODO(), &results); err != nil {
return nil, err
}
return results, nil
}
func (m *MongoStorage) BulkInsert(docs []MongoDocument) error {
var operations []mongo.WriteModel
for _, doc := range docs {
operation := mongo.NewUpdateOneModel()
operation.SetFilter(bson.D{{"url", doc.URL}})
operation.SetUpdate(bson.D{{"$set", doc}})
operation.SetUpsert(true)
operations = append(operations, operation)
}
_, err := m.collection.BulkWrite(context.TODO(), operations)
return err
}
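A brief usage sketch, assuming a local MongoDB instance and that the MongoStorage code above is in the same package; the URI, database, and collection names are placeholders:

```go
package main

import (
    "log"
    "time"
)

func main() {
    // Placeholder URI, database, and collection names.
    store, err := NewMongoStorage("mongodb://localhost:27017", "scraper", "pages")
    if err != nil {
        log.Fatal(err)
    }

    doc := MongoDocument{
        URL:       "https://example.com/article/1",
        Title:     "Example Article",
        Content:   "Article body...",
        Tags:      []string{"news", "go"},
        ScrapedAt: time.Now(),
    }
    if err := store.Store(doc); err != nil {
        log.Fatal(err)
    }

    matches, err := store.FindByTags([]string{"go"})
    if err != nil {
        log.Fatal(err)
    }
    log.Printf("found %d documents tagged %q", len(matches), "go")
}
```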
3. File-Based Storage
For simple applications or when you need human-readable output, file-based storage can be effective.
JSON File Storage
package main
import (
"encoding/csv"
"encoding/json"
"fmt"
"os"
"path/filepath"
"sort"
"sync"
"time"
)
type FileStorage struct {
basePath string
mutex sync.RWMutex
}
type FileData struct {
URL string `json:"url"`
Title string `json:"title"`
Content string `json:"content"`
Metadata map[string]interface{} `json:"metadata"`
ScrapedAt time.Time `json:"scraped_at"`
}
func NewFileStorage(basePath string) *FileStorage {
os.MkdirAll(basePath, 0755)
return &FileStorage{basePath: basePath}
}
func (f *FileStorage) Store(data FileData) error {
f.mutex.Lock()
defer f.mutex.Unlock()
// Build the filename from a timestamp plus a sanitized form of the URL
filename := fmt.Sprintf("%d_%s.json",
time.Now().Unix(),
sanitizeFilename(data.URL))
path := filepath.Join(f.basePath, filename) // avoid shadowing the filepath package
jsonData, err := json.MarshalIndent(data, "", " ")
if err != nil {
return err
}
return os.WriteFile(path, jsonData, 0644)
}
func (f *FileStorage) StoreCSV(data []map[string]string, filename string) error {
f.mutex.Lock()
defer f.mutex.Unlock()
if len(data) == 0 {
return nil
}
path := filepath.Join(f.basePath, filename)
file, err := os.Create(path)
if err != nil {
return err
}
defer file.Close()
writer := csv.NewWriter(file)
// Derive a stable header order from the first row's keys
var headers []string
for key := range data[0] {
headers = append(headers, key)
}
sort.Strings(headers)
if err := writer.Write(headers); err != nil {
return err
}
// encoding/csv quotes and escapes embedded commas and quotes correctly
for _, row := range data {
record := make([]string, len(headers))
for i, header := range headers {
record[i] = row[header]
}
if err := writer.Write(record); err != nil {
return err
}
}
writer.Flush()
return writer.Error()
}
func sanitizeFilename(url string) string {
// Simple sanitization - replace problematic characters
result := ""
for _, char := range url {
if (char >= 'a' && char <= 'z') ||
(char >= 'A' && char <= 'Z') ||
(char >= '0' && char <= '9') {
result += string(char)
} else {
result += "_"
}
}
if len(result) > 50 {
result = result[:50]
}
return result
}
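A short usage sketch; the output directory and row values are just examples:

```go
package main

import (
    "log"
    "time"
)

func main() {
    // Example output directory; change as needed.
    fs := NewFileStorage("./scraped_output")

    err := fs.Store(FileData{
        URL:       "https://example.com/article/1",
        Title:     "Example Article",
        Content:   "Article body...",
        Metadata:  map[string]interface{}{"source": "example"},
        ScrapedAt: time.Now(),
    })
    if err != nil {
        log.Fatal(err)
    }

    rows := []map[string]string{
        {"url": "https://example.com/a", "title": "Page A"},
        {"url": "https://example.com/b", "title": "Page B"},
    }
    if err := fs.StoreCSV(rows, "pages.csv"); err != nil {
        log.Fatal(err)
    }
}
```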
4. In-Memory Storage with Redis
Redis is perfect for caching and temporary storage of scraped data.
package main
import (
"context"
"encoding/json"
"time"
"github.com/go-redis/redis/v8"
)
type RedisStorage struct {
client *redis.Client
}
func NewRedisStorage(addr, password string, db int) *RedisStorage {
rdb := redis.NewClient(&redis.Options{
Addr: addr,
Password: password,
DB: db,
})
return &RedisStorage{client: rdb}
}
func (r *RedisStorage) StoreWithExpiration(key string, data interface{}, expiration time.Duration) error {
jsonData, err := json.Marshal(data)
if err != nil {
return err
}
return r.client.Set(context.Background(), key, jsonData, expiration).Err()
}
func (r *RedisStorage) Get(key string, dest interface{}) error {
val, err := r.client.Get(context.Background(), key).Result()
if err != nil {
return err
}
return json.Unmarshal([]byte(val), dest)
}
func (r *RedisStorage) AddToSet(setKey, member string) error {
return r.client.SAdd(context.Background(), setKey, member).Err()
}
func (r *RedisStorage) GetSetMembers(setKey string) ([]string, error) {
return r.client.SMembers(context.Background(), setKey).Result()
}
// Useful for deduplication
func (r *RedisStorage) IsURLProcessed(url string) (bool, error) {
exists, err := r.client.Exists(context.Background(), "processed:"+url).Result()
return exists > 0, err
}
func (r *RedisStorage) MarkURLProcessed(url string, expiration time.Duration) error {
return r.client.Set(context.Background(), "processed:"+url, "1", expiration).Err()
}
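The deduplication helpers slot naturally into a scrape loop. A sketch assuming a local Redis server on localhost:6379 and the RedisStorage code above in the same package:

```go
package main

import (
    "log"
    "time"
)

func main() {
    // Assumes a local Redis server with no password, DB 0.
    cache := NewRedisStorage("localhost:6379", "", 0)

    urls := []string{
        "https://example.com/a",
        "https://example.com/b",
        "https://example.com/a", // duplicate, skipped on the second pass
    }
    for _, url := range urls {
        seen, err := cache.IsURLProcessed(url)
        if err != nil {
            log.Printf("redis error: %v", err)
            continue
        }
        if seen {
            continue // already scraped within the expiration window
        }
        // ... fetch and parse the page here ...
        if err := cache.MarkURLProcessed(url, 24*time.Hour); err != nil {
            log.Printf("failed to mark %s: %v", url, err)
        }
    }
}
```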
Advanced Storage Patterns
1. Hybrid Storage Strategy
Combine multiple storage types for optimal performance:
type HybridStorage struct {
redis *RedisStorage
postgres *PostgreSQLStorage
files *FileStorage
}
func (h *HybridStorage) Store(data ScrapedData) error {
// Cache in Redis for quick access; a cache write failure is not fatal here
_ = h.redis.StoreWithExpiration(
fmt.Sprintf("cache:%s", data.URL),
data,
24*time.Hour,
)
// Store in PostgreSQL for persistence
if err := h.postgres.Store(data); err != nil {
return err
}
// Store critical data in files as backup
if data.Title != "" {
fileData := FileData{
URL: data.URL,
Title: data.Title,
Content: data.Content,
ScrapedAt: data.ScrapedAt,
}
_ = h.files.Store(fileData) // best-effort file backup
}
return nil
}
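A hypothetical constructor wiring the three backends together; all connection parameters are placeholders:

```go
// NewHybridStorage is an illustrative constructor; connection details are placeholders.
func NewHybridStorage(redisAddr, postgresDSN, filePath string) (*HybridStorage, error) {
    pg, err := NewPostgreSQLStorage(postgresDSN)
    if err != nil {
        return nil, err
    }
    return &HybridStorage{
        redis:    NewRedisStorage(redisAddr, "", 0),
        postgres: pg,
        files:    NewFileStorage(filePath),
    }, nil
}
```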
2. Batch Processing for Performance
type BatchProcessor struct {
storage *PostgreSQLStorage
batch []ScrapedData
batchSize int
mutex sync.Mutex
}
func NewBatchProcessor(storage *PostgreSQLStorage, batchSize int) *BatchProcessor {
return &BatchProcessor{
storage: storage,
batchSize: batchSize,
batch: make([]ScrapedData, 0, batchSize),
}
}
func (b *BatchProcessor) Add(data ScrapedData) error {
b.mutex.Lock()
defer b.mutex.Unlock()
b.batch = append(b.batch, data)
if len(b.batch) >= b.batchSize {
return b.flush()
}
return nil
}
func (b *BatchProcessor) flush() error {
if len(b.batch) == 0 {
return nil
}
tx, err := b.storage.db.Begin()
if err != nil {
return err
}
defer tx.Rollback()
stmt, err := tx.Prepare(`
INSERT INTO scraped_data (url, title, content, metadata, scraped_at)
VALUES ($1, $2, $3, $4, $5)
ON CONFLICT (url) DO UPDATE SET
title = EXCLUDED.title,
content = EXCLUDED.content,
metadata = EXCLUDED.metadata,
scraped_at = EXCLUDED.scraped_at
`)
if err != nil {
return err
}
defer stmt.Close()
for _, data := range b.batch {
metadataJSON, err := json.Marshal(data.Metadata)
if err != nil {
return err
}
if _, err := stmt.Exec(data.URL, data.Title, data.Content,
string(metadataJSON), data.ScrapedAt); err != nil {
return err
}
}
if err := tx.Commit(); err != nil {
return err
}
b.batch = b.batch[:0] // Clear batch
return nil
}
func (b *BatchProcessor) Flush() error {
b.mutex.Lock()
defer b.mutex.Unlock()
return b.flush()
}
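A usage sketch, assuming the PostgreSQLStorage and ScrapedData types above are in the same package; the DSN, URLs, and batch size are placeholders:

```go
package main

import (
    "log"
    "time"
)

func main() {
    // Placeholder DSN and batch size.
    storage, err := NewPostgreSQLStorage("postgres://user:pass@localhost:5432/scraper?sslmode=disable")
    if err != nil {
        log.Fatal(err)
    }

    processor := NewBatchProcessor(storage, 100)

    for _, url := range []string{"https://example.com/a", "https://example.com/b"} {
        item := ScrapedData{URL: url, Title: "Example", ScrapedAt: time.Now()}
        if err := processor.Add(item); err != nil {
            log.Printf("batch write failed: %v", err)
        }
    }

    // Flush whatever is left in the final partial batch.
    if err := processor.Flush(); err != nil {
        log.Fatal(err)
    }
}
```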
Best Practices
1. Data Validation and Sanitization
import (
"fmt"
"regexp"
"strings"
"unicode/utf8"
)
func ValidateAndSanitize(data *ScrapedData) error {
// Validate URL
if data.URL == "" {
return fmt.Errorf("URL cannot be empty")
}
// Sanitize and validate content
data.Title = strings.TrimSpace(data.Title)
data.Content = sanitizeContent(data.Content)
// Limit content size first, since a byte-level cut can split a multi-byte character
if len(data.Content) > 1000000 { // 1MB limit
data.Content = data.Content[:1000000]
}
// Ensure valid UTF-8
if !utf8.ValidString(data.Content) {
data.Content = strings.ToValidUTF8(data.Content, "")
}
return nil
}
func sanitizeContent(content string) string {
// Remove or replace problematic characters
reg := regexp.MustCompile(`[^\p{L}\p{N}\p{P}\p{Z}]`)
return reg.ReplaceAllString(content, "")
}
2. Error Handling and Retry Logic
import (
"fmt"
"time"
)
func (p *PostgreSQLStorage) StoreWithRetry(data ScrapedData, maxRetries int) error {
var lastErr error
for attempt := 0; attempt <= maxRetries; attempt++ {
if attempt > 0 {
// Exponential backoff: 2s, 4s, 8s, ...
delay := time.Duration(1<<attempt) * time.Second
time.Sleep(delay)
}
if err := ValidateAndSanitize(&data); err != nil {
return err // Don't retry validation errors
}
if err := p.Store(data); err != nil {
lastErr = err
continue
}
return nil // Success
}
return fmt.Errorf("failed after %d attempts: %w", maxRetries, lastErr)
}
Performance Optimization Tips
- Use Connection Pooling: Configure appropriate pool sizes for database connections (see the sketch after this list)
- Implement Indexing: Create indexes on frequently queried columns
- Batch Operations: Use batch inserts/updates when processing large datasets
- Cache Frequently Accessed Data: Use Redis or in-memory caching for hot data
- Compress Large Content: Store compressed data for large text content
- Partition Large Tables: Consider table partitioning for time-series data
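For the connection-pooling tip, Go's database/sql exposes pool settings directly on *sql.DB. A minimal sketch; the numbers are illustrative, not recommendations:

```go
package main

import (
    "database/sql"
    "time"
)

// configurePool applies illustrative pool limits; tune them for your workload.
func configurePool(db *sql.DB) {
    db.SetMaxOpenConns(25)                  // cap concurrent connections to the database
    db.SetMaxIdleConns(10)                  // keep a few connections warm for bursts
    db.SetConnMaxLifetime(30 * time.Minute) // recycle long-lived connections
}
```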
Storage Decision Matrix
| Storage Type | Best For | Pros | Cons |
|-------------|----------|------|------|
| PostgreSQL | Structured data, complex queries | ACID compliance, powerful queries | Setup complexity, resource usage |
| MongoDB | Flexible schemas, rapid development | Schema flexibility, easy scaling | Eventual consistency, memory usage |
| File System | Simple projects, human-readable output | Simple, portable, no dependencies | Limited query capabilities, concurrent access issues |
| Redis | Caching, temporary storage | Fast access, built-in data structures | Memory limitations, data volatility |
Conclusion
The best storage approach for Go web scraping applications depends on your specific requirements. For small-scale projects, file-based storage might suffice. For production applications requiring complex queries and ACID compliance, PostgreSQL is excellent. MongoDB works well for flexible schemas, while Redis is perfect for caching and temporary storage.
Consider implementing a hybrid approach that combines multiple storage types to leverage the strengths of each. Always include proper error handling, data validation, and performance optimization techniques in your storage implementation.
When building large-scale scraping applications, you might also want to consider implementing rate limiting in Go web scraping applications and handling concurrent web scraping in Go to ensure your data storage layer can handle the load efficiently.