How do I handle HTTP/2 in Go web scraping?
HTTP/2 is the second major version of the HTTP protocol and offers significant performance improvements over HTTP/1.1, including multiplexing, server push, and header compression. Many modern websites serve HTTP/2, so Go web scrapers need to handle the protocol properly. This guide covers how to implement HTTP/2 support in your Go web scraping applications.
Understanding HTTP/2 Benefits for Web Scraping
HTTP/2 provides several advantages for web scraping applications (a quick protocol check follows this list):
- Multiplexing: Multiple requests can be in flight simultaneously over a single connection
- Header compression: Reduces bandwidth usage through HPACK compression
- Server push: Servers can proactively send resources to clients (though, as covered below, Go's client does not accept pushes)
- Binary protocol: More efficient parsing compared to HTTP/1.1's text-based format
- Stream prioritization: Allows prioritizing critical requests (not exposed by Go's client)
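To see which protocol a target actually negotiates, no special setup is needed: Go's default client already attempts HTTP/2 for HTTPS URLs. A minimal sketch (httpbin.org stands in for any HTTPS target):

package main

import (
	"fmt"
	"net/http"
)

// Quick protocol check: the default client negotiates HTTP/2 automatically
// over TLS, so resp.Proto reveals what the server agreed to.
func main() {
	resp, err := http.Get("https://httpbin.org/get")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.Proto) // "HTTP/2.0" if the server supports HTTP/2
}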
Basic HTTP/2 Client Setup in Go
Go's standard net/http package negotiates HTTP/2 automatically for HTTPS requests made through its default transport; for explicit control, the golang.org/x/net/http2 package provides a dedicated transport. Here's how to create an HTTP/2-enabled client:
package main
import (
"crypto/tls"
"fmt"
"golang.org/x/net/http2"
"io"
"net/http"
"time"
)
func createHTTP2Client() *http.Client {
// Create a custom transport with HTTP/2 support
transport := &http2.Transport{
TLSClientConfig: &tls.Config{
InsecureSkipVerify: false, // Set to true only for testing
},
// Optional: tune frame size and keep-alive health checks
MaxReadFrameSize: 1 << 20, // 1MB
ReadIdleTimeout: 30 * time.Second,
PingTimeout: 15 * time.Second,
}
return &http.Client{
Transport: transport,
Timeout: 30 * time.Second,
}
}
func main() {
client := createHTTP2Client()
resp, err := client.Get("https://httpbin.org/get")
if err != nil {
panic(err)
}
defer resp.Body.Close()
// Check the protocol version
fmt.Printf("Protocol: %s\n", resp.Proto)
body, err := io.ReadAll(resp.Body)
if err != nil {
panic(err)
}
fmt.Printf("Response: %s\n", string(body))
}
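Note that golang.org/x/net/http2 is not part of the standard library; install it with go get golang.org/x/net/http2. Also be aware that a bare http2.Transport speaks only HTTP/2: if the server does not negotiate h2 via ALPN, the request fails outright rather than falling back to HTTP/1.1 (fallback strategies are covered later in this guide).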
Forcing HTTP/2 Usage
To ensure your client uses HTTP/2, you can configure the transport more explicitly:
package main
import (
"crypto/tls"
"fmt"
"golang.org/x/net/http2"
"net/http"
"time"
)
func createForceHTTP2Client() *http.Client {
// Create base HTTP/1.1 transport
tr := &http.Transport{
TLSClientConfig: &tls.Config{
NextProtos: []string{"h2"}, // Force HTTP/2
},
MaxIdleConns: 100,
MaxIdleConnsPerHost: 10,
IdleConnTimeout: 90 * time.Second,
}
// Configure HTTP/2
if err := http2.ConfigureTransport(tr); err != nil {
panic(err)
}
return &http.Client{
Transport: tr,
Timeout: 30 * time.Second,
}
}
func scrapeWithHTTP2(url string) error {
client := createForceHTTP2Client()
resp, err := client.Get(url)
if err != nil {
return fmt.Errorf("failed to make request: %w", err)
}
defer resp.Body.Close()
// Verify we're using HTTP/2
if resp.ProtoMajor != 2 {
return fmt.Errorf("expected HTTP/2, got %s", resp.Proto)
}
fmt.Printf("Successfully used %s\n", resp.Proto)
return nil
}
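If you prefer to stay within the standard library, Go 1.14 added the ForceAttemptHTTP2 field on http.Transport. It re-enables the automatic HTTP/2 upgrade that supplying a custom TLSClientConfig or DialContext otherwise switches off. A minimal sketch:

package main

import (
	"crypto/tls"
	"fmt"
	"net/http"
	"time"
)

// ForceAttemptHTTP2 (Go 1.14+) restores automatic HTTP/2 negotiation even
// when a custom TLS config would normally disable it.
func main() {
	client := &http.Client{
		Transport: &http.Transport{
			TLSClientConfig:   &tls.Config{MinVersion: tls.VersionTLS12},
			ForceAttemptHTTP2: true,
		},
		Timeout: 30 * time.Second,
	}
	resp, err := client.Get("https://httpbin.org/get")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println("Protocol:", resp.Proto) // expect "HTTP/2.0"
}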
Concurrent Requests with HTTP/2 Multiplexing
HTTP/2's multiplexing allows multiple requests over a single connection. Here's how to leverage this for concurrent web scraping:
package main
import (
"context"
"fmt"
"golang.org/x/net/http2"
"net/http"
"sync"
"time"
)
type ScrapingResult struct {
URL string
Status int
Protocol string
Error error
}
func scrapeMultipleURLs(urls []string) []ScrapingResult {
// Create HTTP/2 client
client := &http.Client{
Transport: &http2.Transport{},
Timeout: 30 * time.Second,
}
results := make([]ScrapingResult, len(urls))
var wg sync.WaitGroup
// Process URLs concurrently
for i, url := range urls {
wg.Add(1)
go func(index int, targetURL string) {
defer wg.Done()
result := ScrapingResult{URL: targetURL}
resp, err := client.Get(targetURL)
if err != nil {
result.Error = err
results[index] = result
return
}
defer resp.Body.Close()
result.Status = resp.StatusCode
result.Protocol = resp.Proto
results[index] = result
}(i, url)
}
wg.Wait()
return results
}
func main() {
urls := []string{
"https://httpbin.org/get",
"https://httpbin.org/headers",
"https://httpbin.org/user-agent",
}
results := scrapeMultipleURLs(urls)
for _, result := range results {
if result.Error != nil {
fmt.Printf("Error for %s: %v\n", result.URL, result.Error)
} else {
fmt.Printf("URL: %s, Status: %d, Protocol: %s\n",
result.URL, result.Status, result.Protocol)
}
}
}
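HTTP/2 will happily multiplex many streams over one connection, but servers cap concurrent streams (SETTINGS_MAX_CONCURRENT_STREAMS) and unbounded goroutines are unkind to the target. Here is a sketch of the same pattern with a buffered-channel semaphore bounding in-flight requests; fetchAll and maxInFlight are illustrative names:

package main

import (
	"fmt"
	"net/http"
	"sync"
	"time"

	"golang.org/x/net/http2"
)

// A buffered channel acts as a semaphore: at most maxInFlight requests run
// at once, while all of them still share the client's HTTP/2 connections.
func fetchAll(urls []string, maxInFlight int) {
	client := &http.Client{
		Transport: &http2.Transport{},
		Timeout:   30 * time.Second,
	}
	sem := make(chan struct{}, maxInFlight)
	var wg sync.WaitGroup
	for _, url := range urls {
		wg.Add(1)
		sem <- struct{}{} // acquire a slot before starting the goroutine
		go func(u string) {
			defer wg.Done()
			defer func() { <-sem }() // release the slot when done
			resp, err := client.Get(u)
			if err != nil {
				fmt.Printf("%s: %v\n", u, err)
				return
			}
			resp.Body.Close()
			fmt.Printf("%s: %d via %s\n", u, resp.StatusCode, resp.Proto)
		}(url)
	}
	wg.Wait()
}

func main() {
	fetchAll([]string{
		"https://httpbin.org/get",
		"https://httpbin.org/headers",
		"https://httpbin.org/user-agent",
	}, 2)
}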
Advanced HTTP/2 Configuration
For production web scraping, you'll want more sophisticated configuration:
package main
import (
"context"
"crypto/tls"
"fmt"
"golang.org/x/net/http2"
"net"
"net/http"
"time"
)
type HTTP2Scraper struct {
client *http.Client
}
func NewHTTP2Scraper() *HTTP2Scraper {
// Custom dialer with connection pooling
dialer := &net.Dialer{
Timeout: 10 * time.Second,
KeepAlive: 30 * time.Second,
}
// Base transport configuration
transport := &http.Transport{
DialContext: dialer.DialContext,
MaxIdleConns: 100,
MaxIdleConnsPerHost: 10,
IdleConnTimeout: 90 * time.Second,
TLSHandshakeTimeout: 10 * time.Second,
ExpectContinueTimeout: 1 * time.Second,
TLSClientConfig: &tls.Config{
InsecureSkipVerify: false,
NextProtos: []string{"h2", "http/1.1"},
},
}
// Enable HTTP/2
if err := http2.ConfigureTransport(transport); err != nil {
panic(fmt.Sprintf("Failed to configure HTTP/2: %v", err))
}
client := &http.Client{
Transport: transport,
Timeout: 60 * time.Second,
CheckRedirect: func(req *http.Request, via []*http.Request) error {
// Limit redirects
if len(via) >= 10 {
return fmt.Errorf("too many redirects")
}
return nil
},
}
return &HTTP2Scraper{client: client}
}
func (s *HTTP2Scraper) Scrape(ctx context.Context, url string, headers map[string]string) (*http.Response, error) {
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
if err != nil {
return nil, fmt.Errorf("failed to create request: %w", err)
}
// Set custom headers
for key, value := range headers {
req.Header.Set(key, value)
}
// Set a realistic User-Agent
if req.Header.Get("User-Agent") == "" {
req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; GoScraper/1.0)")
}
resp, err := s.client.Do(req)
if err != nil {
return nil, fmt.Errorf("request failed: %w", err)
}
return resp, nil
}
func main() {
scraper := NewHTTP2Scraper()
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
headers := map[string]string{
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate, br",
}
resp, err := scraper.Scrape(ctx, "https://httpbin.org/get", headers)
if err != nil {
panic(err)
}
defer resp.Body.Close()
fmt.Printf("Protocol: %s, Status: %d\n", resp.Proto, resp.StatusCode)
}
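Because http.Client is safe for concurrent use, construct a single HTTP2Scraper and share it across goroutines; all requests to the same host then multiplex over the pooled connections instead of paying a fresh TLS handshake per worker.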
Handling Server Push
HTTP/2 server push lets a server send resources before the client requests them, but Go's HTTP/2 client does not support receiving pushes: the transport disables push when it establishes the connection, and golang.org/x/net/http2 exposes no client-side push API. In practice this costs scrapers little, since push saw limited adoption and major browsers have since dropped it; servers more commonly advertise related resources with Link preload response headers. As a substitute, you can read those hints and fetch the resources explicitly. A simplified sketch (the Link header parsing is deliberately naive, and fetchWithPreloadHints is an illustrative name):

package main

import (
	"fmt"
	"net/http"
	"strings"
)

// Go's client never receives pushed streams, so instead we inspect
// "Link: <...>; rel=preload" headers, which servers often send in place
// of push, and fetch the referenced resources ourselves.
func fetchWithPreloadHints(url string) error {
	resp, err := http.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	for _, link := range resp.Header.Values("Link") {
		if !strings.Contains(link, "rel=preload") {
			continue
		}
		// Naive parsing: take the first URL between "<" and ">". A real
		// Link header may carry several comma-separated entries.
		start := strings.Index(link, "<")
		end := strings.Index(link, ">")
		if start == -1 || end <= start+1 {
			continue
		}
		fmt.Printf("Server hinted resource: %s\n", link[start+1:end])
		// Fetch the hinted URL here like any other request; it shares
		// the existing HTTP/2 connection.
	}
	return nil
}

func main() {
	if err := fetchWithPreloadHints("https://httpbin.org/get"); err != nil {
		panic(err)
	}
}
Error Handling and Fallback
Implement robust error handling with HTTP/1.1 fallback:
package main
import (
"crypto/tls"
"fmt"
"golang.org/x/net/http2"
"net/http"
"strings"
"time"
)
func createAdaptiveClient() *http.Client {
transport := &http.Transport{
TLSClientConfig: &tls.Config{
NextProtos: []string{"h2", "http/1.1"},
},
MaxIdleConns: 100,
MaxIdleConnsPerHost: 10,
IdleConnTimeout: 90 * time.Second,
}
// Try to configure HTTP/2, fall back gracefully
if err := http2.ConfigureTransport(transport); err != nil {
fmt.Printf("Warning: HTTP/2 not available, using HTTP/1.1: %v\n", err)
}
return &http.Client{
Transport: transport,
Timeout: 30 * time.Second,
}
}
func scrapeWithFallback(url string) error {
client := createAdaptiveClient()
resp, err := client.Get(url)
if err != nil {
// Heuristic check for an HTTP/2-specific error (Go exposes no typed error to match on here)
if strings.Contains(err.Error(), "http2") {
fmt.Println("HTTP/2 failed, retrying with HTTP/1.1")
// Create HTTP/1.1 only client
http1Client := &http.Client{
Transport: &http.Transport{
TLSClientConfig: &tls.Config{
NextProtos: []string{"http/1.1"},
},
},
Timeout: 30 * time.Second,
}
resp, err = http1Client.Get(url)
if err != nil {
return fmt.Errorf("both HTTP/2 and HTTP/1.1 failed: %w", err)
}
} else {
return fmt.Errorf("request failed: %w", err)
}
}
defer resp.Body.Close()
fmt.Printf("Successfully connected using %s\n", resp.Proto)
return nil
}
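For the HTTP/1.1-only retry client, the net/http documentation offers a more direct switch: setting Transport.TLSNextProto to a non-nil, empty map disables HTTP/2 entirely. A sketch (newHTTP1OnlyClient is an illustrative name):

package main

import (
	"crypto/tls"
	"fmt"
	"net/http"
	"time"
)

// Per the net/http docs, a non-nil empty TLSNextProto map disables HTTP/2
// on the client, guaranteeing HTTP/1.1 without steering ALPN directly.
func newHTTP1OnlyClient() *http.Client {
	return &http.Client{
		Transport: &http.Transport{
			TLSNextProto: map[string]func(string, *tls.Conn) http.RoundTripper{},
		},
		Timeout: 30 * time.Second,
	}
}

func main() {
	resp, err := newHTTP1OnlyClient().Get("https://httpbin.org/get")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println("Protocol:", resp.Proto) // "HTTP/1.1"
}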
Performance Monitoring
Monitor HTTP/2 performance and connection reuse:
package main
import (
"fmt"
"golang.org/x/net/http2"
"net/http"
"net/http/httptrace"
"time"
)
func monitorHTTP2Performance(url string) {
var connReused bool
trace := &httptrace.ClientTrace{
GotConn: func(connInfo httptrace.GotConnInfo) {
connReused = connInfo.Reused
fmt.Printf("Connection reused: %t\n", connReused)
},
Got100Continue: func() {
fmt.Println("Received 100 Continue")
},
}
client := &http.Client{
Transport: &http2.Transport{},
Timeout: 30 * time.Second,
}
req, _ := http.NewRequest("GET", url, nil)
req = req.WithContext(httptrace.WithClientTrace(req.Context(), trace))
start := time.Now()
resp, err := client.Do(req)
duration := time.Since(start)
if err != nil {
panic(err)
}
defer resp.Body.Close()
fmt.Printf("Request completed in %v using %s\n", duration, resp.Proto)
}
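To confirm what ALPN negotiated before the response even arrives, httptrace also exposes a TLSHandshakeDone hook whose tls.ConnectionState reports "h2" when HTTP/2 was selected. A minimal sketch:

package main

import (
	"crypto/tls"
	"fmt"
	"net/http"
	"net/http/httptrace"
)

// TLSHandshakeDone fires as soon as the handshake completes, so the
// negotiated protocol is visible before any response body is read.
func main() {
	trace := &httptrace.ClientTrace{
		TLSHandshakeDone: func(state tls.ConnectionState, err error) {
			if err == nil {
				fmt.Printf("ALPN negotiated: %q\n", state.NegotiatedProtocol)
			}
		},
	}
	req, err := http.NewRequest("GET", "https://httpbin.org/get", nil)
	if err != nil {
		panic(err)
	}
	req = req.WithContext(httptrace.WithClientTrace(req.Context(), trace))
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println("Protocol:", resp.Proto)
}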
Best Practices for HTTP/2 Web Scraping
- Connection Reuse: HTTP/2 connections are expensive to establish but cheap to reuse, so share a single http.Client across your scraper
- Request Prioritization: Go's client does not expose HTTP/2 stream priorities, so order and throttle critical requests in application code instead
- Header Compression: Keep headers small and consistent across requests to get the most from HPACK compression
- Error Handling: Always implement fallback to HTTP/1.1
- Resource Management: Close every response body and set sensible timeouts
Common Issues and Solutions
TLS Configuration: In practice HTTP/2 is negotiated via ALPN over TLS, so a correct TLS setup is a prerequisite; cleartext h2c is rarely seen on the public web.
Connection Limits: Servers advertise SETTINGS_MAX_CONCURRENT_STREAMS; requests beyond that limit queue on the connection, so bound your concurrency accordingly.
Debugging: Use curl -v --http2 to test HTTP/2 connectivity, and run your scraper with GODEBUG=http2debug=1 (or http2debug=2 for frame-level detail) to log Go's HTTP/2 internals.
When building more complex scraping workflows that require JavaScript execution, consider how modern headless browser solutions handle HTTP/2 automatically, providing seamless protocol negotiation.
Conclusion
HTTP/2 support in Go web scraping applications provides significant performance benefits through connection multiplexing, header compression, and efficient resource loading. By implementing proper HTTP/2 configuration with fallback mechanisms, you can build robust scrapers that work efficiently with modern websites while maintaining compatibility with older servers. Remember to monitor performance metrics and implement appropriate error handling to ensure reliable scraping operations across different server configurations.