How do I handle file downloads in Go web scraping?
File downloads are a common requirement in web scraping applications, whether you're downloading images, documents, datasets, or other media files. Go provides excellent built-in support for handling HTTP requests and file operations, making it well-suited for implementing robust file download functionality in your web scraping projects.
Basic File Download with net/http
The simplest approach to downloading files in Go uses the standard net/http package. Here's a basic implementation:
package main
import (
"fmt"
"io"
"net/http"
"os"
)
func downloadFile(url, filename string) error {
// Create the HTTP request
resp, err := http.Get(url)
if err != nil {
return fmt.Errorf("failed to make request: %w", err)
}
defer resp.Body.Close()
// Check if the request was successful
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("bad status: %s", resp.Status)
}
// Create the destination file
out, err := os.Create(filename)
if err != nil {
return fmt.Errorf("failed to create file: %w", err)
}
defer out.Close()
// Copy the response body to the file
_, err = io.Copy(out, resp.Body)
if err != nil {
return fmt.Errorf("failed to write file: %w", err)
}
return nil
}
func main() {
url := "https://example.com/document.pdf"
filename := "downloaded_document.pdf"
if err := downloadFile(url, filename); err != nil {
fmt.Printf("Error downloading file: %v\n", err)
return
}
fmt.Println("File downloaded successfully!")
}
Advanced File Download with Custom HTTP Client
For production applications, you'll want more control over the HTTP client configuration, including timeouts, headers, and connection pooling:
package main
import (
"context"
"fmt"
"io"
"net/http"
"os"
"path/filepath"
"time"
)
type FileDownloader struct {
client *http.Client
}
func NewFileDownloader() *FileDownloader {
return &FileDownloader{
client: &http.Client{
// Client.Timeout bounds the entire request, including reading the body,
// so keep it generous for large downloads (or rely on the request context)
Timeout: 10 * time.Minute,
Transport: &http.Transport{
MaxIdleConns: 100,
MaxIdleConnsPerHost: 10,
IdleConnTimeout: 90 * time.Second,
},
},
}
}
func (fd *FileDownloader) DownloadFile(ctx context.Context, url, destPath string) error {
// Create request with context
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
if err != nil {
return fmt.Errorf("failed to create request: %w", err)
}
// Set user agent to avoid blocking
req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; GoScraper/1.0)")
// Execute request
resp, err := fd.client.Do(req)
if err != nil {
return fmt.Errorf("failed to execute request: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("unexpected status code: %d", resp.StatusCode)
}
// Create destination directory if it doesn't exist
if err := os.MkdirAll(filepath.Dir(destPath), 0755); err != nil {
return fmt.Errorf("failed to create directory: %w", err)
}
// Create destination file
file, err := os.Create(destPath)
if err != nil {
return fmt.Errorf("failed to create file: %w", err)
}
defer file.Close()
// Stream the response body directly to the file
_, err = io.Copy(file, resp.Body)
if err != nil {
return fmt.Errorf("failed to copy data: %w", err)
}
return nil
}
func main() {
downloader := NewFileDownloader()
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
url := "https://example.com/large-file.zip"
destPath := "./downloads/large-file.zip"
if err := downloader.DownloadFile(ctx, url, destPath); err != nil {
fmt.Printf("Download failed: %v\n", err)
return
}
fmt.Println("Download completed successfully!")
}
Streaming Large Files with Progress Tracking
When downloading large files, it's important to implement streaming and progress tracking to avoid memory issues:
package main
import (
"context"
"fmt"
"io"
"net/http"
"os"
"strconv"
"time"
)
type ProgressReader struct {
reader io.Reader
total int64
downloaded int64
onProgress func(downloaded, total int64)
}
func (pr *ProgressReader) Read(p []byte) (int, error) {
n, err := pr.reader.Read(p)
pr.downloaded += int64(n)
if pr.onProgress != nil {
pr.onProgress(pr.downloaded, pr.total)
}
return n, err
}
func downloadWithProgress(ctx context.Context, url, filename string) error {
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
if err != nil {
return err
}
// Let the caller's context control the overall deadline; a short
// Client.Timeout would also cut off long-running body reads
client := &http.Client{}
resp, err := client.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("unexpected status: %s", resp.Status)
}
// Get file size from Content-Length header
var totalSize int64
if contentLength := resp.Header.Get("Content-Length"); contentLength != "" {
totalSize, _ = strconv.ParseInt(contentLength, 10, 64)
}
file, err := os.Create(filename)
if err != nil {
return err
}
defer file.Close()
// Create progress reader
progressReader := &ProgressReader{
reader: resp.Body,
total: totalSize,
onProgress: func(downloaded, total int64) {
if total > 0 {
percentage := float64(downloaded) / float64(total) * 100
fmt.Printf("\rProgress: %.2f%% (%d/%d bytes)",
percentage, downloaded, total)
} else {
fmt.Printf("\rDownloaded: %d bytes", downloaded)
}
},
}
_, err = io.Copy(file, progressReader)
if err != nil {
return err
}
fmt.Println("\nDownload completed!")
return nil
}
Handling Authentication and Headers
Many file downloads require authentication or specific headers. Here's how to handle various authentication scenarios:
package main
import (
"context"
"fmt"
"io"
"net/http"
"os"
)
type AuthenticatedDownloader struct {
client *http.Client
token string
}
func NewAuthenticatedDownloader(token string) *AuthenticatedDownloader {
return &AuthenticatedDownloader{
client: &http.Client{},
token: token,
}
}
func (ad *AuthenticatedDownloader) DownloadWithAuth(ctx context.Context, url, filename string) error {
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
if err != nil {
return err
}
// Add authentication headers
req.Header.Set("Authorization", "Bearer "+ad.token)
req.Header.Set("User-Agent", "GoDownloader/1.0")
req.Header.Set("Accept", "*/*")
resp, err := ad.client.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode == http.StatusUnauthorized {
return fmt.Errorf("authentication failed")
}
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("unexpected status: %s", resp.Status)
}
file, err := os.Create(filename)
if err != nil {
return err
}
defer file.Close()
_, err = io.Copy(file, resp.Body)
return err
}
// Download with basic authentication
func downloadWithBasicAuth(ctx context.Context, url, username, password, filename string) error {
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
if err != nil {
return err
}
req.SetBasicAuth(username, password)
client := &http.Client{}
resp, err := client.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("unexpected status: %s", resp.Status)
}
file, err := os.Create(filename)
if err != nil {
return err
}
defer file.Close()
_, err = io.Copy(file, resp.Body)
return err
}
Concurrent File Downloads
For downloading multiple files efficiently, implement concurrent downloads with proper error handling and rate limiting:
package main
import (
"context"
"fmt"
"io"
"net/http"
"os"
"path/filepath"
"sync"
"time"
)
type DownloadJob struct {
URL string
Filename string
}
type DownloadResult struct {
Job DownloadJob
Error error
}
type ConcurrentDownloader struct {
client *http.Client
maxWorkers int
rateLimiter chan struct{}
}
func NewConcurrentDownloader(maxWorkers int, requestsPerSecond int) *ConcurrentDownloader {
rateLimiter := make(chan struct{}, requestsPerSecond)
// Refill the token bucket: one token per tick, up to requestsPerSecond
// buffered tokens; workers block on the channel until a token is available
go func() {
ticker := time.NewTicker(time.Second / time.Duration(requestsPerSecond))
defer ticker.Stop()
for range ticker.C {
select {
case rateLimiter <- struct{}{}:
default:
// Bucket is full; drop the token
}
}
}()
return &ConcurrentDownloader{
client: &http.Client{
Timeout: 30 * time.Second,
},
maxWorkers: maxWorkers,
rateLimiter: rateLimiter,
}
}
func (cd *ConcurrentDownloader) DownloadFiles(ctx context.Context, jobs []DownloadJob) []DownloadResult {
jobChan := make(chan DownloadJob, len(jobs))
resultChan := make(chan DownloadResult, len(jobs))
// Start workers
var wg sync.WaitGroup
for i := 0; i < cd.maxWorkers; i++ {
wg.Add(1)
go cd.worker(ctx, &wg, jobChan, resultChan)
}
// Send jobs
for _, job := range jobs {
jobChan <- job
}
close(jobChan)
// Wait for completion
go func() {
wg.Wait()
close(resultChan)
}()
// Collect results
var results []DownloadResult
for result := range resultChan {
results = append(results, result)
}
return results
}
func (cd *ConcurrentDownloader) worker(ctx context.Context, wg *sync.WaitGroup, jobs <-chan DownloadJob, results chan<- DownloadResult) {
defer wg.Done()
for job := range jobs {
// Rate limiting
<-cd.rateLimiter
err := cd.downloadSingleFile(ctx, job.URL, job.Filename)
results <- DownloadResult{Job: job, Error: err}
}
}
func (cd *ConcurrentDownloader) downloadSingleFile(ctx context.Context, url, filename string) error {
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
if err != nil {
return err
}
resp, err := cd.client.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("unexpected status: %s", resp.Status)
}
// Create directory if needed
if err := os.MkdirAll(filepath.Dir(filename), 0755); err != nil {
return err
}
file, err := os.Create(filename)
if err != nil {
return err
}
defer file.Close()
_, err = io.Copy(file, resp.Body)
return err
}
func main() {
downloader := NewConcurrentDownloader(5, 2) // 5 workers, 2 requests/second
jobs := []DownloadJob{
{URL: "https://example.com/file1.pdf", Filename: "./downloads/file1.pdf"},
{URL: "https://example.com/file2.jpg", Filename: "./downloads/file2.jpg"},
{URL: "https://example.com/file3.txt", Filename: "./downloads/file3.txt"},
}
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
defer cancel()
results := downloader.DownloadFiles(ctx, jobs)
for _, result := range results {
if result.Error != nil {
fmt.Printf("Failed to download %s: %v\n", result.Job.URL, result.Error)
} else {
fmt.Printf("Successfully downloaded %s\n", result.Job.Filename)
}
}
}
Error Handling and Retry Logic
Implement robust error handling with exponential backoff retry logic:
package main
import (
"context"
"fmt"
"io"
"math"
"net/http"
"os"
"path/filepath"
"time"
)
type RetryConfig struct {
MaxRetries int
BaseDelay time.Duration
MaxDelay time.Duration
}
func downloadWithRetry(ctx context.Context, url, filename string, config RetryConfig) error {
var lastErr error
for attempt := 0; attempt <= config.MaxRetries; attempt++ {
if attempt > 0 {
// Calculate exponential backoff delay
delay := time.Duration(math.Pow(2, float64(attempt-1))) * config.BaseDelay
if delay > config.MaxDelay {
delay = config.MaxDelay
}
fmt.Printf("Retrying in %v... (attempt %d/%d)\n", delay, attempt, config.MaxRetries)
select {
case <-time.After(delay):
case <-ctx.Done():
return ctx.Err()
}
}
lastErr = downloadFileOnce(ctx, url, filename)
if lastErr == nil {
return nil // Success
}
fmt.Printf("Download attempt %d failed: %v\n", attempt+1, lastErr)
}
return fmt.Errorf("download failed after %d attempts: %w", config.MaxRetries+1, lastErr)
}
func downloadFileOnce(ctx context.Context, url, filename string) error {
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
if err != nil {
return err
}
client := &http.Client{Timeout: 30 * time.Second}
resp, err := client.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status)
}
// Make sure the destination directory exists (main writes into ./downloads)
if err := os.MkdirAll(filepath.Dir(filename), 0755); err != nil {
return err
}
file, err := os.Create(filename)
if err != nil {
return err
}
defer file.Close()
_, err = io.Copy(file, resp.Body)
return err
}
func main() {
config := RetryConfig{
MaxRetries: 3,
BaseDelay: time.Second,
MaxDelay: 10 * time.Second,
}
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
url := "https://example.com/unreliable-file.pdf"
filename := "./downloads/file.pdf"
if err := downloadWithRetry(ctx, url, filename, config); err != nil {
fmt.Printf("Final error: %v\n", err)
return
}
fmt.Println("Download completed successfully!")
}
Best Practices for File Downloads in Go
Always use context for cancellation: Implement proper context handling to allow for graceful cancellation of long-running downloads.
Implement proper error handling: Check HTTP status codes and handle network errors appropriately.
Use streaming for large files: Avoid loading entire files into memory by using io.Copy for streaming downloads.
Set appropriate timeouts: Configure reasonable timeouts for both connection and overall request duration.
Implement rate limiting: Respect server resources by limiting concurrent requests and implementing delays between requests.
Validate file integrity: Consider implementing checksum validation for critical file downloads; a sketch follows this list.
Handle partial downloads: Implement resume functionality for large files that might be interrupted (see the Range-request sketch below).
Use proper file permissions: Set appropriate file permissions when creating downloaded files.
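For checksum validation, one common approach is to hash the file while writing it to disk and compare the result against a known digest. A minimal sketch, assuming you already have the expected SHA-256 value as a hex string from the site or an API:

package main

import (
    "crypto/sha256"
    "encoding/hex"
    "fmt"
    "io"
    "net/http"
    "os"
)

// downloadAndVerify writes the response body to destPath while hashing it,
// then compares the SHA-256 digest against expectedSHA256 (hex-encoded).
func downloadAndVerify(url, destPath, expectedSHA256 string) error {
    resp, err := http.Get(url)
    if err != nil {
        return err
    }
    defer resp.Body.Close()
    if resp.StatusCode != http.StatusOK {
        return fmt.Errorf("unexpected status: %s", resp.Status)
    }

    file, err := os.Create(destPath)
    if err != nil {
        return err
    }
    defer file.Close()

    // MultiWriter sends every chunk to both the file and the hasher,
    // so the file never has to be read back just to compute the checksum
    hasher := sha256.New()
    if _, err := io.Copy(io.MultiWriter(file, hasher), resp.Body); err != nil {
        return err
    }

    actual := hex.EncodeToString(hasher.Sum(nil))
    if actual != expectedSHA256 {
        return fmt.Errorf("checksum mismatch: got %s, want %s", actual, expectedSHA256)
    }
    return nil
}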
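Resuming a partial download typically relies on HTTP Range requests, which the server must support (signalled by a 206 Partial Content response). A rough sketch, assuming the remote file does not change between attempts; note the explicit 0644 permission when opening the destination file:

package main

import (
    "fmt"
    "io"
    "net/http"
    "os"
)

// resumeDownload appends any missing bytes to destPath using a Range request.
func resumeDownload(url, destPath string) error {
    // Open (or create) the destination with explicit permissions and find
    // out how many bytes we already have
    file, err := os.OpenFile(destPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
    if err != nil {
        return err
    }
    defer file.Close()

    info, err := file.Stat()
    if err != nil {
        return err
    }
    offset := info.Size()

    req, err := http.NewRequest("GET", url, nil)
    if err != nil {
        return err
    }
    if offset > 0 {
        // Ask the server for the remainder of the file only
        req.Header.Set("Range", fmt.Sprintf("bytes=%d-", offset))
    }

    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    switch resp.StatusCode {
    case http.StatusPartialContent:
        // Server honored the Range header; append the remaining bytes
    case http.StatusOK:
        // Server ignored the Range header; start over from the beginning
        if err := file.Truncate(0); err != nil {
            return err
        }
    default:
        return fmt.Errorf("unexpected status: %s", resp.Status)
    }

    _, err = io.Copy(file, resp.Body)
    return err
}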
Integration with Web Scraping Workflows
File downloads often integrate with broader web scraping workflows. When building applications that need to download files discovered during scraping, consider using headless browser automation for JavaScript-heavy sites, or implementing proper authentication handling when files are behind login walls.
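As one concrete illustration, the sketch below (which assumes the golang.org/x/net/html module is available) fetches a placeholder page, walks the parsed document for links ending in .pdf, and resolves them to absolute URLs that can then be handed to any of the download helpers shown earlier:

package main

import (
    "fmt"
    "net/http"
    "net/url"
    "strings"

    "golang.org/x/net/html"
)

// collectPDFLinks walks the parsed HTML tree and gathers href values
// that point at .pdf files.
func collectPDFLinks(n *html.Node, links *[]string) {
    if n.Type == html.ElementNode && n.Data == "a" {
        for _, attr := range n.Attr {
            if attr.Key == "href" && strings.HasSuffix(attr.Val, ".pdf") {
                *links = append(*links, attr.Val)
            }
        }
    }
    for c := n.FirstChild; c != nil; c = c.NextSibling {
        collectPDFLinks(c, links)
    }
}

func main() {
    // Placeholder page URL
    pageURL := "https://example.com/reports"
    resp, err := http.Get(pageURL)
    if err != nil {
        fmt.Println("fetch failed:", err)
        return
    }
    defer resp.Body.Close()

    doc, err := html.Parse(resp.Body)
    if err != nil {
        fmt.Println("parse failed:", err)
        return
    }

    var links []string
    collectPDFLinks(doc, &links)

    base, _ := url.Parse(pageURL)
    for _, link := range links {
        // Resolve relative hrefs against the page URL
        u, err := base.Parse(link)
        if err != nil {
            continue
        }
        // Hand u.String() to whichever downloader you use, for example the
        // basic downloadFile helper from the first example
        fmt.Println("found:", u.String())
    }
}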
For complex scraping scenarios involving file downloads, you might also need to handle timeouts effectively to ensure your download operations complete successfully even when dealing with slow or unreliable sources.
By following these patterns and best practices, you can build robust file download functionality in your Go web scraping applications that handles edge cases gracefully and performs well at scale.