How Do I Handle robots.txt Files in Go Scraping?
Handling robots.txt files is a crucial aspect of ethical web scraping. The robots.txt file implements the Robots Exclusion Protocol, which websites use to tell crawlers and scrapers which parts of a site should or shouldn't be accessed. In Go, there are several ways to parse and respect these directives and keep your scraping compliant and respectful.
Understanding robots.txt Format
Before diving into implementation, it's important to understand the robots.txt format. The file is typically located at the root of a website (e.g., https://example.com/robots.txt) and contains directives like:
User-agent: *
Disallow: /private/
Disallow: /admin/
Allow: /public/
Crawl-delay: 1
User-agent: GoogleBot
Disallow: /temp/
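The robots.txt location can be derived from any page URL by keeping only the scheme and host. As a quick illustration, here is a small sketch using net/url (the helper name robotsURLFor is purely illustrative):
package main

import (
    "fmt"
    "net/url"
)

// robotsURLFor returns the robots.txt URL for the site that serves pageURL.
// Only the scheme and host are kept; the path and query are discarded.
func robotsURLFor(pageURL string) (string, error) {
    u, err := url.Parse(pageURL)
    if err != nil {
        return "", err
    }
    return u.Scheme + "://" + u.Host + "/robots.txt", nil
}

func main() {
    robots, err := robotsURLFor("https://example.com/products/page?sort=asc")
    if err != nil {
        panic(err)
    }
    fmt.Println(robots) // https://example.com/robots.txt
}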
Basic robots.txt Parser Implementation
Here's a simple Go implementation to fetch and parse robots.txt files:
package main
import (
"bufio"
"fmt"
"io"
"net/http"
"net/url"
"regexp"
"strconv"
"strings"
"time"
)
type RobotsRule struct {
UserAgent string
Disallowed []string
Allowed []string
CrawlDelay time.Duration
Sitemap []string
}
type RobotsParser struct {
Rules []RobotsRule
BaseURL *url.URL
}
func NewRobotsParser(baseURL string) (*RobotsParser, error) {
u, err := url.Parse(baseURL)
if err != nil {
return nil, err
}
return &RobotsParser{
BaseURL: u,
Rules: make([]RobotsRule, 0),
}, nil
}
func (rp *RobotsParser) Fetch() error {
robotsURL := rp.BaseURL.Scheme + "://" + rp.BaseURL.Host + "/robots.txt"
resp, err := http.Get(robotsURL)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode != 200 {
// No usable robots.txt (e.g. 404): assume everything is allowed.
// A stricter client would treat 5xx responses as "temporarily disallow all".
return nil
}
return rp.Parse(resp.Body)
}
func (rp *RobotsParser) Parse(body io.Reader) error {
scanner := bufio.NewScanner(body)
var currentRule *RobotsRule
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
// Skip comments and empty lines
if line == "" || strings.HasPrefix(line, "#") {
continue
}
parts := strings.SplitN(line, ":", 2)
if len(parts) != 2 {
continue
}
directive := strings.ToLower(strings.TrimSpace(parts[0]))
value := strings.TrimSpace(parts[1])
switch directive {
case "user-agent":
if currentRule != nil {
rp.Rules = append(rp.Rules, *currentRule)
}
currentRule = &RobotsRule{
UserAgent: value,
Disallowed: make([]string, 0),
Allowed: make([]string, 0),
Sitemap: make([]string, 0),
}
case "disallow":
if currentRule != nil && value != "" {
currentRule.Disallowed = append(currentRule.Disallowed, value)
}
case "allow":
if currentRule != nil && value != "" {
currentRule.Allowed = append(currentRule.Allowed, value)
}
case "crawl-delay":
if currentRule != nil {
if delay, err := strconv.Atoi(value); err == nil {
currentRule.CrawlDelay = time.Duration(delay) * time.Second
}
}
case "sitemap":
if currentRule != nil {
currentRule.Sitemap = append(currentRule.Sitemap, value)
}
}
}
// Add the last rule
if currentRule != nil {
rp.Rules = append(rp.Rules, *currentRule)
}
return scanner.Err()
}
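With the parser in place, fetching and inspecting a site's rules could look like the sketch below. It assumes it lives in the same package as the types above, and example.com is only a placeholder target:
func main() {
    parser, err := NewRobotsParser("https://example.com")
    if err != nil {
        panic(err)
    }
    // Download and parse https://example.com/robots.txt.
    if err := parser.Fetch(); err != nil {
        panic(err)
    }
    for _, rule := range parser.Rules {
        fmt.Printf("User-agent %q: %d disallowed, %d allowed, crawl delay %v\n",
            rule.UserAgent, len(rule.Disallowed), len(rule.Allowed), rule.CrawlDelay)
    }
}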
Checking URL Permissions
Once you've parsed the robots.txt file, you need to check if a specific URL is allowed to be scraped:
func (rp *RobotsParser) IsAllowed(userAgent, urlPath string) bool {
// Find the group that applies to this user agent: an exact (case-insensitive)
// match wins, otherwise fall back to the wildcard "*" group.
var applicableRule *RobotsRule
for i := range rp.Rules {
rule := &rp.Rules[i]
if strings.EqualFold(rule.UserAgent, userAgent) {
applicableRule = rule
break
}
if rule.UserAgent == "*" && applicableRule == nil {
applicableRule = rule
}
}
if applicableRule == nil {
return true // No rules found, assume allowed
}
// Check if explicitly allowed
for _, allowed := range applicableRule.Allowed {
if matchesPattern(allowed, urlPath) {
return true
}
}
// Check if disallowed
for _, disallowed := range applicableRule.Disallowed {
if matchesPattern(disallowed, urlPath) {
return false
}
}
return true // Not explicitly disallowed
}
func matchesPattern(pattern, path string) bool {
// robots.txt patterns are prefix matches; "*" matches any sequence of
// characters and a trailing "$" anchors the pattern to the end of the path.
anchored := strings.HasSuffix(pattern, "$")
if anchored {
pattern = strings.TrimSuffix(pattern, "$")
}
// Convert the robots.txt pattern to a regular expression.
re := regexp.QuoteMeta(pattern)
re = strings.ReplaceAll(re, "\\*", ".*")
re = "^" + re
if anchored {
re += "$"
}
matched, _ := regexp.MatchString(re, path)
return matched
}
func (rp *RobotsParser) GetCrawlDelay(userAgent string) time.Duration {
// Prefer the group that names this user agent; fall back to the "*" group.
var wildcardDelay time.Duration
for _, rule := range rp.Rules {
if strings.EqualFold(rule.UserAgent, userAgent) {
return rule.CrawlDelay
}
if rule.UserAgent == "*" {
wildcardDelay = rule.CrawlDelay
}
}
return wildcardDelay
}
Using Third-Party Libraries
For more robust robots.txt handling, consider using existing Go libraries:
Using the robotstxt Library
go get github.com/temoto/robotstxt
package main
import (
"fmt"
"net/http"
"net/url"
"github.com/temoto/robotstxt"
)
func main() {
// Fetch and parse robots.txt
resp, err := http.Get("https://example.com/robots.txt")
if err != nil {
panic(err)
}
defer resp.Body.Close()
robots, err := robotstxt.FromResponse(resp)
if err != nil {
panic(err)
}
// Check if a URL is allowed
testURL, _ := url.Parse("https://example.com/some/path")
allowed := robots.TestAgent(testURL.Path, "MyBot/1.0")
fmt.Printf("URL %s is allowed: %t\n", testURL.Path, allowed)
// Get crawl delay
group := robots.FindGroup("MyBot/1.0")
if group != nil {
fmt.Printf("Crawl delay: %v\n", group.CrawlDelay)
}
}
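If you already have the robots.txt body in memory (for example in tests), the library can also parse it directly; a brief sketch using its FromString helper:
package main

import (
    "fmt"

    "github.com/temoto/robotstxt"
)

func main() {
    // Parse a robots.txt body held in memory -- handy for unit tests.
    robots, err := robotstxt.FromString("User-agent: *\nDisallow: /private/\n")
    if err != nil {
        panic(err)
    }
    fmt.Println(robots.TestAgent("/private/page", "MyBot/1.0")) // false
    fmt.Println(robots.TestAgent("/public/page", "MyBot/1.0"))  // true
}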
Integrating with Web Scraping Logic
Here's how to integrate robots.txt checking into your web scraping workflow:
package main
import (
"fmt"
"net/http"
"net/url"
"time"
"github.com/temoto/robotstxt"
)
type EthicalScraper struct {
client *http.Client
userAgent string
robots *robotstxt.RobotsData
baseURL *url.URL
}
func NewEthicalScraper(baseURL, userAgent string) (*EthicalScraper, error) {
u, err := url.Parse(baseURL)
if err != nil {
return nil, err
}
scraper := &EthicalScraper{
client: &http.Client{Timeout: 30 * time.Second},
userAgent: userAgent,
baseURL: u,
}
// Fetch and parse robots.txt
if err := scraper.loadRobots(); err != nil {
fmt.Printf("Warning: Could not load robots.txt: %v\n", err)
}
return scraper, nil
}
func (es *EthicalScraper) loadRobots() error {
robotsURL := es.baseURL.Scheme + "://" + es.baseURL.Host + "/robots.txt"
resp, err := es.client.Get(robotsURL)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode == 200 {
robots, err := robotstxt.FromResponse(resp)
if err != nil {
return err
}
es.robots = robots
}
return nil
}
func (es *EthicalScraper) CanFetch(urlPath string) bool {
if es.robots == nil {
return true // No robots.txt found, assume allowed
}
return es.robots.TestAgent(urlPath, es.userAgent)
}
func (es *EthicalScraper) GetCrawlDelay() time.Duration {
if es.robots == nil {
return time.Second // Default delay
}
group := es.robots.FindGroup(es.userAgent)
if group != nil && group.CrawlDelay > 0 {
return group.CrawlDelay
}
return time.Second // Default delay
}
func (es *EthicalScraper) Fetch(urlPath string) (*http.Response, error) {
if !es.CanFetch(urlPath) {
return nil, fmt.Errorf("robots.txt disallows fetching %s", urlPath)
}
// Respect crawl delay
time.Sleep(es.GetCrawlDelay())
fullURL := es.baseURL.Scheme + "://" + es.baseURL.Host + urlPath
req, err := http.NewRequest("GET", fullURL, nil)
if err != nil {
return nil, err
}
req.Header.Set("User-Agent", es.userAgent)
return es.client.Do(req)
}
Best Practices for robots.txt Handling
1. Cache robots.txt Data
type RobotsCache struct {
cache map[string]*robotstxt.RobotsData
lastFetch map[string]time.Time
mutex sync.RWMutex
}
func (rc *RobotsCache) GetRobots(domain string) (*robotstxt.RobotsData, error) {
rc.mutex.RLock()
robots, exists := rc.cache[domain]
lastFetch := rc.lastFetch[domain]
rc.mutex.RUnlock()
// Refresh cache if it's older than 24 hours
if !exists || time.Since(lastFetch) > 24*time.Hour {
return rc.fetchAndCache(domain)
}
return robots, nil
}
2. Handle Different User Agents
func (es *EthicalScraper) SetUserAgent(userAgent string) {
es.userAgent = userAgent
// robots.txt is the same file for every client, so refreshing here is optional;
// it simply keeps the cached rules current when the agent changes.
es.loadRobots()
}
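To see why per-agent groups matter, note that the same path can be allowed for one user agent and disallowed for another. A quick sketch with the temoto/robotstxt library (the agent names are made up):
package main

import (
    "fmt"

    "github.com/temoto/robotstxt"
)

func main() {
    // "*" allows everything, but the group for BadBot disallows the whole site.
    robots, err := robotstxt.FromString("User-agent: *\nDisallow:\n\nUser-agent: BadBot\nDisallow: /\n")
    if err != nil {
        panic(err)
    }
    fmt.Println(robots.TestAgent("/page", "MyBot/1.0")) // true
    fmt.Println(robots.TestAgent("/page", "BadBot"))    // false
}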
3. Graceful Error Handling
func (es *EthicalScraper) loadRobots() error {
robotsURL := es.baseURL.Scheme + "://" + es.baseURL.Host + "/robots.txt"
resp, err := es.client.Get(robotsURL)
if err != nil {
// Network error - log and continue with permissive policy
fmt.Printf("Warning: Network error fetching robots.txt: %v\n", err)
return nil
}
defer resp.Body.Close()
switch resp.StatusCode {
case 200:
robots, err := robotstxt.FromResponse(resp)
if err != nil {
fmt.Printf("Warning: Error parsing robots.txt: %v\n", err)
return nil
}
es.robots = robots
case 404, 403:
// robots.txt missing or inaccessible - treat as a permissive policy
fmt.Printf("robots.txt not available (status %d), proceeding with default policy\n", resp.StatusCode)
default:
fmt.Printf("Unexpected status code %d for robots.txt\n", resp.StatusCode)
}
return nil
}
Testing Your Implementation
package main

import (
"testing"

"github.com/stretchr/testify/assert"
)

func TestRobotsCompliance(t *testing.T) {
scraper, err := NewEthicalScraper("https://example.com", "TestBot/1.0")
assert.NoError(t, err)
// Test allowed paths
assert.True(t, scraper.CanFetch("/public/page"))
// Test disallowed paths
assert.False(t, scraper.CanFetch("/admin/"))
assert.False(t, scraper.CanFetch("/private/data"))
// Test crawl delay
delay := scraper.GetCrawlDelay()
assert.True(t, delay > 0)
}
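The test above assumes the live target site actually disallows /admin/ and /private/. For a deterministic test, you can serve a known robots.txt from an httptest server instead; a sketch, assuming net/http, net/http/httptest, and fmt are imported in the test file:
func TestRobotsComplianceLocal(t *testing.T) {
    // Serve a fixed robots.txt so the test does not depend on a real website.
    mux := http.NewServeMux()
    mux.HandleFunc("/robots.txt", func(w http.ResponseWriter, r *http.Request) {
        fmt.Fprint(w, "User-agent: *\nDisallow: /private/\n")
    })
    server := httptest.NewServer(mux)
    defer server.Close()

    scraper, err := NewEthicalScraper(server.URL, "TestBot/1.0")
    if err != nil {
        t.Fatal(err)
    }
    if scraper.CanFetch("/private/data") {
        t.Error("expected /private/data to be disallowed")
    }
    if !scraper.CanFetch("/public/page") {
        t.Error("expected /public/page to be allowed")
    }
}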
Conclusion
Properly handling robots.txt files in Go scraping applications is essential for ethical web scraping. By implementing robust parsing logic, respecting crawl delays, and handling edge cases gracefully, you can keep your scrapers compliant with website policies. Whether you implement your own parser or use an existing library like temoto/robotstxt, the key is to make robots.txt checking an integral part of your scraping workflow.
Remember that respecting robots.txt is not merely a compliance formality; it's about being a good citizen of the web and maintaining sustainable scraping practices that don't burden target servers unnecessarily.