Total URLs: {self.total_urls}
Successful: {self.successful}
Failed: {self.failed}
Duration: {duration:.2f}s
Rate: {rate:.2f} pages/second
""")
# Usage
monitor = ScrapingMonitor()
monitor.total_urls = len(urls)

for url in urls:
    try:
        result = scrape_with_retry(url)
        monitor.log_success(url)
    except Exception as e:
        monitor.log_failure(url, str(e))

monitor.print_summary()
```
## Cost Optimization Strategies
When scraping at scale, API costs can add up quickly. Here are some strategies to keep them under control:
- Use the Right Format: Request only the formats you need (`markdown`, `html`, or `links`); see the request sketch after this list
- Filter Content: Use `onlyMainContent: true` to reduce token usage for AI features
- Smart Crawling: Use `includes` and `excludes` patterns to avoid unnecessary pages
- Cache Results: Store and reuse results when possible to avoid re-scraping
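The first three strategies translate directly into request options. The sketch below is illustrative only: it reuses the `firecrawl` client from the earlier snippets and the camelCase option names mentioned above (`formats`, `onlyMainContent`, `includes`, `excludes`), but the exact parameter shapes differ between Firecrawl SDK versions, so check them against the client you have installed.

```python
# Illustrative sketch: verify option names against your Firecrawl SDK version.

# Scrape a single page, requesting only markdown and the main content.
page = firecrawl.scrape_url(
    'https://example.com/blog/post',
    params={
        'formats': ['markdown'],      # skip html/links if you won't use them
        'onlyMainContent': True,      # drop nav, footer, and sidebar noise
    },
)

# Crawl a site while restricting which paths are visited.
crawl = firecrawl.crawl_url(
    'https://example.com',
    params={
        'includes': ['blog/*'],       # only crawl matching paths
        'excludes': ['blog/tag/*'],   # skip low-value archive pages
        'limit': 100,                 # hard cap on pages per crawl
    },
)
```

The fourth strategy, caching, is handled by the small `ResultCache` helper below.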
```python
import hashlib
import os
import json

class ResultCache:
    def __init__(self, cache_dir='cache'):
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)

    def get_cache_key(self, url):
        return hashlib.md5(url.encode()).hexdigest()

    def get(self, url):
        cache_file = os.path.join(self.cache_dir, self.get_cache_key(url) + '.json')
        if os.path.exists(cache_file):
            with open(cache_file, 'r') as f:
                return json.load(f)
        return None

    def set(self, url, result):
        cache_file = os.path.join(self.cache_dir, self.get_cache_key(url) + '.json')
        with open(cache_file, 'w') as f:
            json.dump(result, f)

# Usage with cache
cache = ResultCache()

def scrape_with_cache(url):
    cached = cache.get(url)
    if cached:
        logging.info(f"Cache hit for {url}")
        return cached
    result = firecrawl.scrape_url(url)
    cache.set(url, result)
    return result
```
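To see how the pieces fit together, here is a sketch that puts the cache in front of the retry and monitoring helpers from earlier in the post (it assumes `urls`, `scrape_with_retry`, and `ScrapingMonitor` are defined as shown above):

```python
# Sketch: cache-first loop reusing the monitor and retry helpers defined earlier.
monitor = ScrapingMonitor()
monitor.total_urls = len(urls)

for url in urls:
    try:
        result = cache.get(url)
        if result is None:
            result = scrape_with_retry(url)  # only hit the API on a cache miss
            cache.set(url, result)
        monitor.log_success(url)
    except Exception as e:
        monitor.log_failure(url, str(e))

monitor.print_summary()
```

On re-runs, previously scraped URLs are served from disk, so a failed batch can be retried without paying again for the pages that already succeeded.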
## Conclusion
Firecrawl is well-suited for large-scale web scraping operations, offering features like batch crawling, asynchronous processing, and robust error handling. By implementing proper concurrency control, error handling, monitoring, and caching strategies, you can efficiently scrape thousands of pages while maintaining reliability and controlling costs. Whether you're using the hosted Firecrawl API or deploying your own instance, following these best practices will help you build production-ready scraping pipelines that scale effectively.
For complex scenarios like scraping many pages in parallel or processing dynamic content, Firecrawl's architecture provides the flexibility and performance needed for enterprise-grade data extraction projects.