What is the Best Way to Store Scraped Data in Go Applications?

Storing scraped data efficiently is crucial for Go web scraping applications. The right storage method depends on factors such as data volume, query patterns, performance needs, and durability requirements. This guide explores various storage options with practical implementations.

Storage Options Overview

1. Relational Databases (PostgreSQL, MySQL)

Relational databases are ideal for structured data with complex relationships and when you need ACID compliance.

PostgreSQL Implementation

package main

import (
    "database/sql"
    "time"

    _ "github.com/lib/pq"
)

type ScrapedData struct {
    ID          int       `json:"id"`
    URL         string    `json:"url"`
    Title       string    `json:"title"`
    Content     string    `json:"content"`
    Metadata    string    `json:"metadata"` // JSON string stored in the JSONB column
    ScrapedAt   time.Time `json:"scraped_at"`
}

type PostgreSQLStorage struct {
    db *sql.DB
}

func NewPostgreSQLStorage(connectionString string) (*PostgreSQLStorage, error) {
    db, err := sql.Open("postgres", connectionString)
    if err != nil {
        return nil, err
    }

    // sql.Open does not actually connect, so verify the connection explicitly
    if err := db.Ping(); err != nil {
        return nil, err
    }

    storage := &PostgreSQLStorage{db: db}
    if err := storage.createTables(); err != nil {
        return nil, err
    }

    return storage, nil
}

func (p *PostgreSQLStorage) createTables() error {
    query := `
    CREATE TABLE IF NOT EXISTS scraped_data (
        id SERIAL PRIMARY KEY,
        url VARCHAR(2048) NOT NULL,
        title TEXT,
        content TEXT,
        metadata JSONB,
        scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
        UNIQUE(url)
    );
    CREATE INDEX IF NOT EXISTS idx_scraped_at ON scraped_data(scraped_at);
    CREATE INDEX IF NOT EXISTS idx_url ON scraped_data(url);
    `

    _, err := p.db.Exec(query)
    return err
}

func (p *PostgreSQLStorage) Store(data ScrapedData) error {
    query := `
    INSERT INTO scraped_data (url, title, content, metadata, scraped_at)
    VALUES ($1, $2, $3, $4, $5)
    ON CONFLICT (url) DO UPDATE SET
        title = EXCLUDED.title,
        content = EXCLUDED.content,
        metadata = EXCLUDED.metadata,
        scraped_at = EXCLUDED.scraped_at
    `

    // Metadata already holds a JSON string; default to an empty object so
    // the JSONB column never receives invalid input
    metadata := data.Metadata
    if metadata == "" {
        metadata = "{}"
    }

    _, err := p.db.Exec(query, data.URL, data.Title, data.Content,
        metadata, data.ScrapedAt)
    return err
}

func (p *PostgreSQLStorage) GetByURL(url string) (*ScrapedData, error) {
    query := `
    SELECT id, url, title, content, metadata, scraped_at
    FROM scraped_data WHERE url = $1
    `

    var data ScrapedData
    err := p.db.QueryRow(query, url).Scan(
        &data.ID, &data.URL, &data.Title, &data.Content,
        &data.Metadata, &data.ScrapedAt,
    )
    if err != nil {
        return nil, err
    }

    return &data, nil
}
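
Here's a minimal usage sketch; the connection string and example values are placeholders, and it assumes the types above, the standard log package, and a running PostgreSQL instance:

storage, err := NewPostgreSQLStorage("postgres://user:pass@localhost:5432/scraper?sslmode=disable")
if err != nil {
    log.Fatal(err)
}

err = storage.Store(ScrapedData{
    URL:       "https://example.com",
    Title:     "Example Domain",
    Content:   "Example content",
    Metadata:  `{"source":"example"}`, // metadata is passed as a JSON string
    ScrapedAt: time.Now(),
})
if err != nil {
    log.Fatal(err)
}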

2. NoSQL Databases (MongoDB)

MongoDB is excellent for flexible schema requirements and rapid development.

package main

import (
    "context"
    "time"

    "go.mongodb.org/mongo-driver/bson"
    "go.mongodb.org/mongo-driver/mongo"
    "go.mongodb.org/mongo-driver/mongo/options"
)

type MongoStorage struct {
    client     *mongo.Client
    collection *mongo.Collection
}

type MongoDocument struct {
    ID        interface{} `bson:"_id,omitempty"`
    URL       string      `bson:"url"`
    Title     string      `bson:"title"`
    Content   string      `bson:"content"`
    Tags      []string    `bson:"tags"`
    Metadata  interface{} `bson:"metadata"`
    ScrapedAt time.Time   `bson:"scraped_at"`
}

func NewMongoStorage(uri, database, collection string) (*MongoStorage, error) {
    client, err := mongo.Connect(context.TODO(), options.Client().ApplyURI(uri))
    if err != nil {
        return nil, err
    }

    coll := client.Database(database).Collection(collection)

    // Create indexes
    indexModel := mongo.IndexModel{
        Keys: bson.D{{"url", 1}},
        Options: options.Index().SetUnique(true),
    }

    _, err = coll.Indexes().CreateOne(context.TODO(), indexModel)
    if err != nil {
        return nil, err
    }

    return &MongoStorage{
        client:     client,
        collection: coll,
    }, nil
}

func (m *MongoStorage) Store(doc MongoDocument) error {
    filter := bson.D{{"url", doc.URL}}
    update := bson.D{{"$set", doc}}
    opts := options.Update().SetUpsert(true)

    _, err := m.collection.UpdateOne(context.TODO(), filter, update, opts)
    return err
}

func (m *MongoStorage) FindByTags(tags []string) ([]MongoDocument, error) {
    filter := bson.D{{"tags", bson.D{{"$in", tags}}}}
    cursor, err := m.collection.Find(context.TODO(), filter)
    if err != nil {
        return nil, err
    }
    defer cursor.Close(context.TODO())

    var results []MongoDocument
    if err = cursor.All(context.TODO(), &results); err != nil {
        return nil, err
    }

    return results, nil
}

func (m *MongoStorage) BulkInsert(docs []MongoDocument) error {
    // BulkWrite rejects an empty operations slice, so return early
    if len(docs) == 0 {
        return nil
    }

    var operations []mongo.WriteModel

    for _, doc := range docs {
        operation := mongo.NewUpdateOneModel()
        operation.SetFilter(bson.D{{"url", doc.URL}})
        operation.SetUpdate(bson.D{{"$set", doc}})
        operation.SetUpsert(true)
        operations = append(operations, operation)
    }

    _, err := m.collection.BulkWrite(context.TODO(), operations)
    return err
}
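
A quick usage sketch, assuming a local MongoDB instance; the URI, database, and collection names are placeholders:

store, err := NewMongoStorage("mongodb://localhost:27017", "scraper", "pages")
if err != nil {
    log.Fatal(err)
}

err = store.Store(MongoDocument{
    URL:       "https://example.com",
    Title:     "Example Domain",
    Tags:      []string{"example"},
    ScrapedAt: time.Now(),
})
if err != nil {
    log.Fatal(err)
}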

3. File-Based Storage

For simple applications or when you need human-readable output, file-based storage can be effective.

JSON File Storage

package main

import (
    "encoding/csv"
    "encoding/json"
    "fmt"
    "os"
    "path/filepath"
    "sort"
    "sync"
    "time"
)

type FileStorage struct {
    basePath string
    mutex    sync.RWMutex
}

type FileData struct {
    URL       string                 `json:"url"`
    Title     string                 `json:"title"`
    Content   string                 `json:"content"`
    Metadata  map[string]interface{} `json:"metadata"`
    ScrapedAt time.Time             `json:"scraped_at"`
}

func NewFileStorage(basePath string) *FileStorage {
    os.MkdirAll(basePath, 0755)
    return &FileStorage{basePath: basePath}
}

func (f *FileStorage) Store(data FileData) error {
    f.mutex.Lock()
    defer f.mutex.Unlock()

    // Create a filename from the timestamp and a sanitized form of the URL
    filename := fmt.Sprintf("%d_%s.json",
        time.Now().Unix(),
        sanitizeFilename(data.URL))

    fullPath := filepath.Join(f.basePath, filename)

    jsonData, err := json.MarshalIndent(data, "", "  ")
    if err != nil {
        return err
    }

    return os.WriteFile(fullPath, jsonData, 0644)
}

func (f *FileStorage) StoreCSV(data []map[string]string, filename string) error {
    f.mutex.Lock()
    defer f.mutex.Unlock()

    if len(data) == 0 {
        return nil
    }

    fullPath := filepath.Join(f.basePath, filename)
    file, err := os.Create(fullPath)
    if err != nil {
        return err
    }
    defer file.Close()

    writer := csv.NewWriter(file)

    // Derive a stable header order from the keys of the first row
    var headers []string
    for key := range data[0] {
        headers = append(headers, key)
    }
    sort.Strings(headers)

    if err := writer.Write(headers); err != nil {
        return err
    }

    // encoding/csv handles quoting and escaping of commas, quotes and newlines
    for _, row := range data {
        record := make([]string, len(headers))
        for i, header := range headers {
            record[i] = row[header]
        }
        if err := writer.Write(record); err != nil {
            return err
        }
    }

    writer.Flush()
    return writer.Error()
}

func sanitizeFilename(url string) string {
    // Simple sanitization - replace problematic characters
    result := ""
    for _, char := range url {
        if (char >= 'a' && char <= 'z') || 
           (char >= 'A' && char <= 'Z') || 
           (char >= '0' && char <= '9') {
            result += string(char)
        } else {
            result += "_"
        }
    }
    if len(result) > 50 {
        result = result[:50]
    }
    return result
}
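
A short usage sketch for the CSV helper; the output directory and rows are just examples, and encoding/csv takes care of the quoting:

fs := NewFileStorage("output")

rows := []map[string]string{
    {"url": "https://example.com", "title": "Example, with a comma"},
    {"url": "https://example.org", "title": `A "quoted" title`},
}

if err := fs.StoreCSV(rows, "pages.csv"); err != nil {
    log.Fatal(err)
}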

4. In-Memory Storage with Redis

Redis is perfect for caching and temporary storage of scraped data.

package main

import (
    "context"
    "encoding/json"
    "time"

    "github.com/go-redis/redis/v8"
)

type RedisStorage struct {
    client *redis.Client
}

func NewRedisStorage(addr, password string, db int) *RedisStorage {
    rdb := redis.NewClient(&redis.Options{
        Addr:     addr,
        Password: password,
        DB:       db,
    })

    return &RedisStorage{client: rdb}
}

func (r *RedisStorage) StoreWithExpiration(key string, data interface{}, expiration time.Duration) error {
    jsonData, err := json.Marshal(data)
    if err != nil {
        return err
    }

    return r.client.Set(context.Background(), key, jsonData, expiration).Err()
}

func (r *RedisStorage) Get(key string, dest interface{}) error {
    val, err := r.client.Get(context.Background(), key).Result()
    if err != nil {
        return err
    }

    return json.Unmarshal([]byte(val), dest)
}

func (r *RedisStorage) AddToSet(setKey, member string) error {
    return r.client.SAdd(context.Background(), setKey, member).Err()
}

func (r *RedisStorage) GetSetMembers(setKey string) ([]string, error) {
    return r.client.SMembers(context.Background(), setKey).Result()
}

// Useful for deduplication
func (r *RedisStorage) IsURLProcessed(url string) (bool, error) {
    exists, err := r.client.Exists(context.Background(), "processed:"+url).Result()
    return exists > 0, err
}

func (r *RedisStorage) MarkURLProcessed(url string, expiration time.Duration) error {
    return r.client.Set(context.Background(), "processed:"+url, "1", expiration).Err()
}
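
A sketch of how the deduplication helpers fit into a scraping loop; the address, key TTL, and the scrape step itself are assumptions:

store := NewRedisStorage("localhost:6379", "", 0)

url := "https://example.com/page"
seen, err := store.IsURLProcessed(url)
if err != nil {
    log.Fatal(err)
}

if !seen {
    // ... scrape the page here ...
    if err := store.MarkURLProcessed(url, 7*24*time.Hour); err != nil {
        log.Fatal(err)
    }
}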

Advanced Storage Patterns

1. Hybrid Storage Strategy

Combine multiple storage types for optimal performance:

type HybridStorage struct {
    redis    *RedisStorage
    postgres *PostgreSQLStorage
    files    *FileStorage
}

func (h *HybridStorage) Store(data ScrapedData) error {
    // Cache in Redis for quick access; a cache failure is not fatal
    if err := h.redis.StoreWithExpiration(
        fmt.Sprintf("cache:%s", data.URL),
        data,
        24*time.Hour,
    ); err != nil {
        log.Printf("redis cache write failed: %v", err)
    }

    // PostgreSQL is the source of truth, so its errors are returned
    if err := h.postgres.Store(data); err != nil {
        return err
    }

    // Keep a file copy of pages that yielded a title, as a backup
    if data.Title != "" {
        fileData := FileData{
            URL:       data.URL,
            Title:     data.Title,
            Content:   data.Content,
            ScrapedAt: data.ScrapedAt,
        }
        if err := h.files.Store(fileData); err != nil {
            log.Printf("file backup failed: %v", err)
        }
    }

    return nil
}
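
Wiring the three stores together might look like the following sketch; the addresses and backup directory are placeholders, and pgStorage stands in for a value returned by NewPostgreSQLStorage:

hybrid := &HybridStorage{
    redis:    NewRedisStorage("localhost:6379", "", 0),
    postgres: pgStorage, // returned by NewPostgreSQLStorage(...)
    files:    NewFileStorage("backup"),
}

if err := hybrid.Store(data); err != nil {
    log.Printf("store failed for %s: %v", data.URL, err)
}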

2. Batch Processing for Performance

type BatchProcessor struct {
    storage   *PostgreSQLStorage
    batch     []ScrapedData
    batchSize int
    mutex     sync.Mutex
}

func NewBatchProcessor(storage *PostgreSQLStorage, batchSize int) *BatchProcessor {
    return &BatchProcessor{
        storage:   storage,
        batchSize: batchSize,
        batch:     make([]ScrapedData, 0, batchSize),
    }
}

func (b *BatchProcessor) Add(data ScrapedData) error {
    b.mutex.Lock()
    defer b.mutex.Unlock()

    b.batch = append(b.batch, data)

    if len(b.batch) >= b.batchSize {
        return b.flush()
    }

    return nil
}

func (b *BatchProcessor) flush() error {
    if len(b.batch) == 0 {
        return nil
    }

    tx, err := b.storage.db.Begin()
    if err != nil {
        return err
    }
    defer tx.Rollback()

    stmt, err := tx.Prepare(`
        INSERT INTO scraped_data (url, title, content, metadata, scraped_at)
        VALUES ($1, $2, $3, $4, $5)
        ON CONFLICT (url) DO UPDATE SET
            title = EXCLUDED.title,
            content = EXCLUDED.content,
            metadata = EXCLUDED.metadata,
            scraped_at = EXCLUDED.scraped_at
    `)
    if err != nil {
        return err
    }
    defer stmt.Close()

    for _, data := range b.batch {
        // Metadata is already a JSON string (see Store above)
        metadata := data.Metadata
        if metadata == "" {
            metadata = "{}"
        }
        _, err := stmt.Exec(data.URL, data.Title, data.Content,
            metadata, data.ScrapedAt)
        if err != nil {
            return err
        }
    }

    if err := tx.Commit(); err != nil {
        return err
    }

    b.batch = b.batch[:0] // Clear batch
    return nil
}

func (b *BatchProcessor) Flush() error {
    b.mutex.Lock()
    defer b.mutex.Unlock()
    return b.flush()
}
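
In a scraper, the processor is typically fed as results come in and flushed once at the end; scrapedItems here is a hypothetical []ScrapedData:

processor := NewBatchProcessor(storage, 100)

for _, item := range scrapedItems {
    if err := processor.Add(item); err != nil {
        log.Printf("batch insert failed: %v", err)
    }
}

// Write any remaining items that did not fill a complete batch
if err := processor.Flush(); err != nil {
    log.Fatal(err)
}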

Best Practices

1. Data Validation and Sanitization

import (
    "fmt"
    "regexp"
    "strings"
    "unicode/utf8"
)

func ValidateAndSanitize(data *ScrapedData) error {
    // Validate URL
    if data.URL == "" {
        return fmt.Errorf("URL cannot be empty")
    }

    // Sanitize and validate content
    data.Title = strings.TrimSpace(data.Title)
    data.Content = sanitizeContent(data.Content)

    // Limit content size; truncating by bytes may split a multi-byte rune,
    // so the UTF-8 check runs afterwards
    if len(data.Content) > 1000000 { // 1MB limit
        data.Content = data.Content[:1000000]
    }

    // Ensure valid UTF-8
    if !utf8.ValidString(data.Content) {
        data.Content = strings.ToValidUTF8(data.Content, "")
    }

    return nil
}

func sanitizeContent(content string) string {
    // Remove or replace problematic characters
    reg := regexp.MustCompile(`[^\p{L}\p{N}\p{P}\p{Z}]`)
    return reg.ReplaceAllString(content, "")
}

2. Error Handling and Retry Logic

import (
    "fmt"
    "time"
)

func (p *PostgreSQLStorage) StoreWithRetry(data ScrapedData, maxRetries int) error {
    // Validation errors are permanent, so don't retry them
    if err := ValidateAndSanitize(&data); err != nil {
        return err
    }

    var lastErr error

    for attempt := 0; attempt <= maxRetries; attempt++ {
        if attempt > 0 {
            // Exponential backoff: 2s, 4s, 8s, ...
            delay := time.Duration(1<<uint(attempt)) * time.Second
            time.Sleep(delay)
        }

        if err := p.Store(data); err != nil {
            lastErr = err
            continue
        }

        return nil // Success
    }

    return fmt.Errorf("failed after %d attempts: %w", maxRetries+1, lastErr)
}

Performance Optimization Tips

  1. Use Connection Pooling: Configure appropriate pool sizes for database connections (see the sketch after this list)
  2. Implement Indexing: Create indexes on frequently queried columns
  3. Batch Operations: Use batch inserts/updates when processing large datasets
  4. Cache Frequently Accessed Data: Use Redis or in-memory caching for hot data
  5. Compress Large Content: Store compressed data for large text content (also covered in the sketch below)
  6. Partition Large Tables: Consider table partitioning for time-series data
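
As a rough sketch of tips 1 and 5, the standard database/sql pool settings and compress/gzip cover both; the specific numbers are placeholders to tune for your workload:

import (
    "bytes"
    "compress/gzip"
    "database/sql"
    "io"
    "time"
)

// configurePool applies example pool settings (tip 1); tune for your workload
func configurePool(db *sql.DB) {
    db.SetMaxOpenConns(25)                  // cap concurrent connections
    db.SetMaxIdleConns(10)                  // keep some connections warm
    db.SetConnMaxLifetime(30 * time.Minute) // recycle long-lived connections
}

// compressContent gzips large text before storage (tip 5)
func compressContent(content string) ([]byte, error) {
    var buf bytes.Buffer
    gz := gzip.NewWriter(&buf)
    if _, err := gz.Write([]byte(content)); err != nil {
        return nil, err
    }
    if err := gz.Close(); err != nil {
        return nil, err
    }
    return buf.Bytes(), nil
}

// decompressContent reverses compressContent when reading the data back
func decompressContent(data []byte) (string, error) {
    gz, err := gzip.NewReader(bytes.NewReader(data))
    if err != nil {
        return "", err
    }
    defer gz.Close()

    out, err := io.ReadAll(gz)
    if err != nil {
        return "", err
    }
    return string(out), nil
}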

Storage Decision Matrix

| Storage Type | Best For | Pros | Cons |
|--------------|----------|------|------|
| PostgreSQL | Structured data, complex queries | ACID compliance, powerful queries | Setup complexity, resource usage |
| MongoDB | Flexible schemas, rapid development | Schema flexibility, easy scaling | Eventual consistency, memory usage |
| File System | Simple projects, human-readable output | Simple, portable, no dependencies | Limited query capabilities, concurrent access issues |
| Redis | Caching, temporary storage | Fast access, built-in data structures | Memory limitations, data volatility |

Conclusion

The best storage approach for Go web scraping applications depends on your specific requirements. For small-scale projects, file-based storage might suffice. For production applications requiring complex queries and ACID compliance, PostgreSQL is excellent. MongoDB works well for flexible schemas, while Redis is perfect for caching and temporary storage.

Consider implementing a hybrid approach that combines multiple storage types to leverage the strengths of each. Always include proper error handling, data validation, and performance optimization techniques in your storage implementation.

When building large-scale scraping applications, you might also want to consider implementing rate limiting in Go web scraping applications and handling concurrent web scraping in Go to ensure your data storage layer can handle the load efficiently.

Try WebScraping.AI for Your Web Scraping Needs

Looking for a powerful web scraping solution? WebScraping.AI provides an LLM-powered API that combines Chromium JavaScript rendering with rotating proxies for reliable data extraction.

Key Features:

  • AI-powered extraction: Ask questions about web pages or extract structured data fields
  • JavaScript rendering: Full Chromium browser support for dynamic content
  • Rotating proxies: Datacenter and residential proxies from multiple countries
  • Easy integration: Simple REST API with SDKs for Python, Ruby, PHP, and more
  • Reliable & scalable: Built for developers who need consistent results

Getting Started:

Get page content with AI analysis:

curl "https://api.webscraping.ai/ai/question?url=https://example.com&question=What is the main topic?&api_key=YOUR_API_KEY"

Extract structured data:

curl "https://api.webscraping.ai/ai/fields?url=https://example.com&fields[title]=Page title&fields[price]=Product price&api_key=YOUR_API_KEY"
