How do I handle file downloads in Go web scraping?

File downloads are a common requirement in web scraping, whether you're fetching images, documents, datasets, or other assets. Go's standard library provides excellent support for HTTP requests and file operations, making it well suited to building robust download functionality into your scraping projects.

Basic File Download with net/http

The simplest approach to downloading files in Go uses the standard net/http package. Here's a basic implementation:

package main

import (
    "fmt"
    "io"
    "net/http"
    "os"
)

func downloadFile(url, filename string) error {
    // Issue the GET request
    resp, err := http.Get(url)
    if err != nil {
        return fmt.Errorf("failed to make request: %w", err)
    }
    defer resp.Body.Close()

    // Check if the request was successful
    if resp.StatusCode != http.StatusOK {
        return fmt.Errorf("bad status: %s", resp.Status)
    }

    // Create the destination file
    out, err := os.Create(filename)
    if err != nil {
        return fmt.Errorf("failed to create file: %w", err)
    }
    defer out.Close()

    // Copy the response body to the file
    _, err = io.Copy(out, resp.Body)
    if err != nil {
        return fmt.Errorf("failed to write file: %w", err)
    }

    return nil
}

func main() {
    url := "https://example.com/document.pdf"
    filename := "downloaded_document.pdf"

    if err := downloadFile(url, filename); err != nil {
        fmt.Printf("Error downloading file: %v\n", err)
        return
    }

    fmt.Println("File downloaded successfully!")
}

Advanced File Download with Custom HTTP Client

For production applications, you'll want more control over the HTTP client configuration, including timeouts, headers, and connection pooling:

package main

import (
    "context"
    "fmt"
    "io"
    "net/http"
    "os"
    "path/filepath"
    "time"
)

type FileDownloader struct {
    client *http.Client
}

func NewFileDownloader() *FileDownloader {
    return &FileDownloader{
        client: &http.Client{
            Timeout: 30 * time.Second,
            Transport: &http.Transport{
                MaxIdleConns:        100,
                MaxIdleConnsPerHost: 10,
                IdleConnTimeout:     90 * time.Second,
            },
        },
    }
}

func (fd *FileDownloader) DownloadFile(ctx context.Context, url, destPath string) error {
    // Create request with context
    req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
    if err != nil {
        return fmt.Errorf("failed to create request: %w", err)
    }

    // Set user agent to avoid blocking
    req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; GoScraper/1.0)")

    // Execute request
    resp, err := fd.client.Do(req)
    if err != nil {
        return fmt.Errorf("failed to execute request: %w", err)
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return fmt.Errorf("unexpected status code: %d", resp.StatusCode)
    }

    // Create destination directory if it doesn't exist
    if err := os.MkdirAll(filepath.Dir(destPath), 0755); err != nil {
        return fmt.Errorf("failed to create directory: %w", err)
    }

    // Create destination file
    file, err := os.Create(destPath)
    if err != nil {
        return fmt.Errorf("failed to create file: %w", err)
    }
    defer file.Close()

    // Stream the response body to the file
    _, err = io.Copy(file, resp.Body)
    if err != nil {
        return fmt.Errorf("failed to copy data: %w", err)
    }

    return nil
}

func main() {
    downloader := NewFileDownloader()
    ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
    defer cancel()

    url := "https://example.com/large-file.zip"
    destPath := "./downloads/large-file.zip"

    if err := downloader.DownloadFile(ctx, url, destPath); err != nil {
        fmt.Printf("Download failed: %v\n", err)
        return
    }

    fmt.Println("Download completed successfully!")
}

Streaming Large Files with Progress Tracking

When downloading large files, it's important to implement streaming and progress tracking to avoid memory issues:

package main

import (
    "context"
    "fmt"
    "io"
    "net/http"
    "os"
    "strconv"
)

type ProgressReader struct {
    reader       io.Reader
    total        int64
    downloaded   int64
    onProgress   func(downloaded, total int64)
}

func (pr *ProgressReader) Read(p []byte) (int, error) {
    n, err := pr.reader.Read(p)
    pr.downloaded += int64(n)

    if pr.onProgress != nil {
        pr.onProgress(pr.downloaded, pr.total)
    }

    return n, err
}

func downloadWithProgress(ctx context.Context, url, filename string) error {
    req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
    if err != nil {
        return err
    }

    // Rely on the caller's context for cancellation; a client-wide Timeout
    // also covers reading the body and would cut off large downloads mid-stream.
    client := &http.Client{}
    resp, err := client.Do(req)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return fmt.Errorf("unexpected status: %s", resp.Status)
    }

    // Get file size from Content-Length header
    var totalSize int64
    if contentLength := resp.Header.Get("Content-Length"); contentLength != "" {
        totalSize, _ = strconv.ParseInt(contentLength, 10, 64)
    }

    file, err := os.Create(filename)
    if err != nil {
        return err
    }
    defer file.Close()

    // Create progress reader
    progressReader := &ProgressReader{
        reader: resp.Body,
        total:  totalSize,
        onProgress: func(downloaded, total int64) {
            if total > 0 {
                percentage := float64(downloaded) / float64(total) * 100
                fmt.Printf("\rProgress: %.2f%% (%d/%d bytes)", 
                    percentage, downloaded, total)
            } else {
                fmt.Printf("\rDownloaded: %d bytes", downloaded)
            }
        },
    }

    _, err = io.Copy(file, progressReader)
    if err != nil {
        return err
    }

    fmt.Println("\nDownload completed!")
    return nil
}

Handling Authentication and Headers

Many file downloads require authentication or specific headers. Here's how to handle various authentication scenarios:

package main

import (
    "context"
    "fmt"
    "io"
    "net/http"
    "os"
)

type AuthenticatedDownloader struct {
    client *http.Client
    token  string
}

func NewAuthenticatedDownloader(token string) *AuthenticatedDownloader {
    return &AuthenticatedDownloader{
        client: &http.Client{},
        token:  token,
    }
}

func (ad *AuthenticatedDownloader) DownloadWithAuth(ctx context.Context, url, filename string) error {
    req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
    if err != nil {
        return err
    }

    // Add authentication headers
    req.Header.Set("Authorization", "Bearer "+ad.token)
    req.Header.Set("User-Agent", "GoDownloader/1.0")
    req.Header.Set("Accept", "*/*")

    resp, err := ad.client.Do(req)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    if resp.StatusCode == http.StatusUnauthorized {
        return fmt.Errorf("authentication failed")
    }

    if resp.StatusCode != http.StatusOK {
        return fmt.Errorf("unexpected status: %s", resp.Status)
    }

    file, err := os.Create(filename)
    if err != nil {
        return err
    }
    defer file.Close()

    _, err = io.Copy(file, resp.Body)
    return err
}

// Download with basic authentication
func downloadWithBasicAuth(ctx context.Context, url, username, password, filename string) error {
    req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
    if err != nil {
        return err
    }

    req.SetBasicAuth(username, password)

    client := &http.Client{}
    resp, err := client.Do(req)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return fmt.Errorf("unexpected status: %s", resp.Status)
    }

    file, err := os.Create(filename)
    if err != nil {
        return err
    }
    defer file.Close()

    _, err = io.Copy(file, resp.Body)
    return err
}

Concurrent File Downloads

For downloading multiple files efficiently, implement concurrent downloads with proper error handling and rate limiting:

package main

import (
    "context"
    "fmt"
    "io"
    "net/http"
    "os"
    "path/filepath"
    "sync"
    "time"
)

type DownloadJob struct {
    URL      string
    Filename string
}

type DownloadResult struct {
    Job   DownloadJob
    Error error
}

type ConcurrentDownloader struct {
    client      *http.Client
    maxWorkers  int
    rateLimiter chan struct{}
}

func NewConcurrentDownloader(maxWorkers int, requestsPerSecond int) *ConcurrentDownloader {
    // Token-bucket rate limiter: a background goroutine adds one token per
    // tick, and each worker takes a token before starting a download.
    rateLimiter := make(chan struct{}, requestsPerSecond)

    go func() {
        ticker := time.NewTicker(time.Second / time.Duration(requestsPerSecond))
        defer ticker.Stop()

        for range ticker.C {
            select {
            case rateLimiter <- struct{}{}:
            default:
                // Bucket is full; drop the token
            }
        }
    }()

    return &ConcurrentDownloader{
        client: &http.Client{
            Timeout: 30 * time.Second,
        },
        maxWorkers:  maxWorkers,
        rateLimiter: rateLimiter,
    }
}

func (cd *ConcurrentDownloader) DownloadFiles(ctx context.Context, jobs []DownloadJob) []DownloadResult {
    jobChan := make(chan DownloadJob, len(jobs))
    resultChan := make(chan DownloadResult, len(jobs))

    // Start workers
    var wg sync.WaitGroup
    for i := 0; i < cd.maxWorkers; i++ {
        wg.Add(1)
        go cd.worker(ctx, &wg, jobChan, resultChan)
    }

    // Send jobs
    for _, job := range jobs {
        jobChan <- job
    }
    close(jobChan)

    // Wait for completion
    go func() {
        wg.Wait()
        close(resultChan)
    }()

    // Collect results
    var results []DownloadResult
    for result := range resultChan {
        results = append(results, result)
    }

    return results
}

func (cd *ConcurrentDownloader) worker(ctx context.Context, wg *sync.WaitGroup, jobs <-chan DownloadJob, results chan<- DownloadResult) {
    defer wg.Done()

    for job := range jobs {
        // Rate limiting
        <-cd.rateLimiter

        err := cd.downloadSingleFile(ctx, job.URL, job.Filename)
        results <- DownloadResult{Job: job, Error: err}
    }
}

func (cd *ConcurrentDownloader) downloadSingleFile(ctx context.Context, url, filename string) error {
    req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
    if err != nil {
        return err
    }

    resp, err := cd.client.Do(req)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return fmt.Errorf("unexpected status: %s", resp.Status)
    }

    // Create directory if needed
    if err := os.MkdirAll(filepath.Dir(filename), 0755); err != nil {
        return err
    }

    file, err := os.Create(filename)
    if err != nil {
        return err
    }
    defer file.Close()

    _, err = io.Copy(file, resp.Body)
    return err
}

func main() {
    downloader := NewConcurrentDownloader(5, 2) // 5 workers, 2 requests/second

    jobs := []DownloadJob{
        {URL: "https://example.com/file1.pdf", Filename: "./downloads/file1.pdf"},
        {URL: "https://example.com/file2.jpg", Filename: "./downloads/file2.jpg"},
        {URL: "https://example.com/file3.txt", Filename: "./downloads/file3.txt"},
    }

    ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
    defer cancel()

    results := downloader.DownloadFiles(ctx, jobs)

    for _, result := range results {
        if result.Error != nil {
            fmt.Printf("Failed to download %s: %v\n", result.Job.URL, result.Error)
        } else {
            fmt.Printf("Successfully downloaded %s\n", result.Job.Filename)
        }
    }
}

Error Handling and Retry Logic

Implement robust error handling with exponential backoff retry logic:

package main

import (
    "context"
    "fmt"
    "io"
    "math"
    "net/http"
    "os"
    "path/filepath"
    "time"
)

type RetryConfig struct {
    MaxRetries int
    BaseDelay  time.Duration
    MaxDelay   time.Duration
}

func downloadWithRetry(ctx context.Context, url, filename string, config RetryConfig) error {
    var lastErr error

    for attempt := 0; attempt <= config.MaxRetries; attempt++ {
        if attempt > 0 {
            // Calculate exponential backoff delay
            delay := time.Duration(math.Pow(2, float64(attempt-1))) * config.BaseDelay
            if delay > config.MaxDelay {
                delay = config.MaxDelay
            }

            fmt.Printf("Retrying in %v... (attempt %d/%d)\n", delay, attempt, config.MaxRetries)

            select {
            case <-time.After(delay):
            case <-ctx.Done():
                return ctx.Err()
            }
        }

        lastErr = downloadFileOnce(ctx, url, filename)
        if lastErr == nil {
            return nil // Success
        }

        fmt.Printf("Download attempt %d failed: %v\n", attempt+1, lastErr)
    }

    return fmt.Errorf("download failed after %d attempts: %w", config.MaxRetries+1, lastErr)
}

func downloadFileOnce(ctx context.Context, url, filename string) error {
    req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
    if err != nil {
        return err
    }

    client := &http.Client{Timeout: 30 * time.Second}
    resp, err := client.Do(req)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return fmt.Errorf("unexpected status: %s", resp.Status)
    }

    // Make sure the destination directory exists
    if err := os.MkdirAll(filepath.Dir(filename), 0755); err != nil {
        return err
    }

    file, err := os.Create(filename)
    if err != nil {
        return err
    }
    defer file.Close()

    _, err = io.Copy(file, resp.Body)
    return err
}

func main() {
    config := RetryConfig{
        MaxRetries: 3,
        BaseDelay:  time.Second,
        MaxDelay:   10 * time.Second,
    }

    ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
    defer cancel()

    url := "https://example.com/unreliable-file.pdf"
    filename := "./downloads/file.pdf"

    if err := downloadWithRetry(ctx, url, filename, config); err != nil {
        fmt.Printf("Final error: %v\n", err)
        return
    }

    fmt.Println("Download completed successfully!")
}

Best Practices for File Downloads in Go

  1. Always use context for cancellation: Implement proper context handling to allow for graceful cancellation of long-running downloads.

  2. Implement proper error handling: Check HTTP status codes and handle network errors appropriately.

  3. Use streaming for large files: Avoid loading entire files into memory by using io.Copy for streaming downloads.

  4. Set appropriate timeouts: Configure reasonable timeouts for both connection and overall request duration.

  5. Implement rate limiting: Respect server resources by limiting concurrent requests and implementing delays between requests.

  6. Validate file integrity: Consider implementing checksum validation for critical file downloads (see the checksum sketch after this list).

  7. Handle partial downloads: Implement resume functionality for large files that might be interrupted (see the Range-request sketch after this list).

  8. Use proper file permissions: Set appropriate file permissions when creating downloaded files.
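
For item 6, here's a minimal sketch of checksum validation: it streams the response to disk while feeding the same bytes into a SHA-256 hasher via io.TeeReader, then compares the digest against an expected value. The function name downloadWithChecksum and the expected-digest parameter are illustrative, not part of any library.

package main

import (
    "crypto/sha256"
    "encoding/hex"
    "fmt"
    "io"
    "net/http"
    "os"
)

// downloadWithChecksum streams the response body to disk while hashing it,
// then compares the result against an expected SHA-256 hex digest.
func downloadWithChecksum(url, filename, expectedSHA256 string) error {
    resp, err := http.Get(url)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return fmt.Errorf("unexpected status: %s", resp.Status)
    }

    file, err := os.Create(filename)
    if err != nil {
        return err
    }
    defer file.Close()

    // TeeReader forwards every byte written to the file into the hasher,
    // so the body is read from the network only once.
    hasher := sha256.New()
    if _, err := io.Copy(file, io.TeeReader(resp.Body, hasher)); err != nil {
        return err
    }

    actual := hex.EncodeToString(hasher.Sum(nil))
    if actual != expectedSHA256 {
        return fmt.Errorf("checksum mismatch: got %s, want %s", actual, expectedSHA256)
    }
    return nil
}

In practice the expected digest usually comes from the source site (for example, a published .sha256 file) or from a previous crawl.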
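
For item 7, here's a sketch of resuming an interrupted download with an HTTP Range request. It assumes the server supports range requests (replying 206 Partial Content) and falls back to a full download when it doesn't; it also covers item 8 by creating the file with explicit 0644 permissions. The function name resumeDownload is illustrative.

package main

import (
    "fmt"
    "io"
    "net/http"
    "os"
)

// resumeDownload continues a partial download by asking the server for the
// bytes after what is already on disk.
func resumeDownload(url, filename string) error {
    // How much do we already have?
    var offset int64
    if info, err := os.Stat(filename); err == nil {
        offset = info.Size()
    }

    req, err := http.NewRequest("GET", url, nil)
    if err != nil {
        return err
    }
    if offset > 0 {
        req.Header.Set("Range", fmt.Sprintf("bytes=%d-", offset))
    }

    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    switch resp.StatusCode {
    case http.StatusPartialContent:
        // Server honored the Range header; append to the existing file
    case http.StatusOK:
        // Server ignored the Range header; start over from the beginning
        offset = 0
    default:
        return fmt.Errorf("unexpected status: %s", resp.Status)
    }

    flags := os.O_CREATE | os.O_WRONLY
    if offset > 0 {
        flags |= os.O_APPEND
    } else {
        flags |= os.O_TRUNC
    }
    // Explicit 0644 permissions: readable by everyone, writable by the owner
    file, err := os.OpenFile(filename, flags, 0644)
    if err != nil {
        return err
    }
    defer file.Close()

    _, err = io.Copy(file, resp.Body)
    return err
}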

Integration with Web Scraping Workflows

File downloads often integrate with broader web scraping workflows. When building applications that need to download files discovered during scraping, consider using headless browser automation for JavaScript-heavy sites, or implementing proper authentication handling when files are behind login walls.

For complex scraping scenarios involving file downloads, you might also need to handle timeouts effectively to ensure your download operations complete successfully even when dealing with slow or unreliable sources.

By following these patterns and best practices, you can build robust file download functionality in your Go web scraping applications that handles edge cases gracefully and performs well at scale.

Try WebScraping.AI for Your Web Scraping Needs

Looking for a powerful web scraping solution? WebScraping.AI provides an LLM-powered API that combines Chromium JavaScript rendering with rotating proxies for reliable data extraction.

Key Features:

  • AI-powered extraction: Ask questions about web pages or extract structured data fields
  • JavaScript rendering: Full Chromium browser support for dynamic content
  • Rotating proxies: Datacenter and residential proxies from multiple countries
  • Easy integration: Simple REST API with SDKs for Python, Ruby, PHP, and more
  • Reliable & scalable: Built for developers who need consistent results

Getting Started:

Get page content with AI analysis:

curl "https://api.webscraping.ai/ai/question?url=https://example.com&question=What is the main topic?&api_key=YOUR_API_KEY"

Extract structured data:

curl "https://api.webscraping.ai/ai/fields?url=https://example.com&fields[title]=Page title&fields[price]=Product price&api_key=YOUR_API_KEY"
