Beautiful Soup is inherently synchronous and has no native support for asynchronous operations. However, you can combine it effectively with asyncio by using an asynchronous HTTP client such as aiohttp for the network requests while keeping Beautiful Soup for the HTML parsing.
## Basic Setup
First, install the required packages:

```bash
pip install beautifulsoup4 aiohttp
```
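Optionally, you can also install lxml; Beautiful Soup will use it as a usually faster parser when you pass `"lxml"` instead of `"html.parser"`. The examples below stick with the built-in `"html.parser"`, so this step is not required.

```python
from bs4 import BeautifulSoup

# Requires `pip install lxml`; otherwise keep using the built-in 'html.parser'
soup = BeautifulSoup("<html><head><title>Hi</title></head></html>", "lxml")
print(soup.title.string)  # -> Hi
```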
## Simple Async Scraping Example
Here's a basic example combining aiohttp with Beautiful Soup:
```python
import aiohttp
import asyncio
from bs4 import BeautifulSoup


async def fetch_and_parse(session, url):
    """Fetch a URL and parse it with Beautiful Soup"""
    try:
        async with session.get(url) as response:
            html = await response.text()
            soup = BeautifulSoup(html, 'html.parser')

            # Extract data - example: all links
            links = [a.get('href') for a in soup.find_all('a', href=True)]
            return {
                'url': url,
                'links': links,
                'title': soup.title.string if soup.title else None
            }
    except Exception as e:
        return {'url': url, 'error': str(e)}


async def main():
    urls = [
        'https://example.com',
        'https://httpbin.org/html',
        'https://httpbin.org/json'
    ]

    async with aiohttp.ClientSession() as session:
        tasks = [fetch_and_parse(session, url) for url in urls]
        results = await asyncio.gather(*tasks)

        for result in results:
            print(f"URL: {result['url']}")
            if 'error' in result:
                print(f"Error: {result['error']}")
            else:
                print(f"Title: {result.get('title', 'No title')}")
                print(f"Links found: {len(result['links'])}")
            print("-" * 50)


if __name__ == "__main__":
    asyncio.run(main())
```
## Concurrent Scraping with Rate Limiting
For production scraping, you'll want to implement rate limiting and error handling:
```python
import asyncio
from asyncio import Semaphore

import aiohttp
from bs4 import BeautifulSoup


class AsyncScraper:
    def __init__(self, max_concurrent=10, delay=1):
        self.semaphore = Semaphore(max_concurrent)
        self.delay = delay

    async def fetch_with_rate_limit(self, session, url):
        """Fetch URL with rate limiting"""
        async with self.semaphore:
            try:
                await asyncio.sleep(self.delay)
                async with session.get(url, timeout=aiohttp.ClientTimeout(total=10)) as response:
                    if response.status == 200:
                        return await response.text()
                    else:
                        raise aiohttp.ClientError(f"HTTP {response.status}")
            except Exception as e:
                print(f"Error fetching {url}: {e}")
                return None

    def parse_html(self, html, url):
        """Parse HTML with Beautiful Soup"""
        if not html:
            return None

        soup = BeautifulSoup(html, 'html.parser')

        # Extract various data points
        data = {
            'url': url,
            # get_text() avoids an AttributeError when <title> contains nested tags
            'title': soup.title.get_text(strip=True) if soup.title else None,
            'meta_description': None,
            'headings': [],
            'links': [],
            'images': []
        }

        # Meta description
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if meta_desc:
            data['meta_description'] = meta_desc.get('content')

        # Headings (h1-h6)
        for i in range(1, 7):
            headings = soup.find_all(f'h{i}')
            data['headings'].extend([h.get_text().strip() for h in headings])

        # Links
        links = soup.find_all('a', href=True)
        data['links'] = [link.get('href') for link in links]

        # Images
        images = soup.find_all('img', src=True)
        data['images'] = [img.get('src') for img in images]

        return data

    async def scrape_urls(self, urls):
        """Scrape multiple URLs concurrently"""
        connector = aiohttp.TCPConnector(limit=100, limit_per_host=10)

        async with aiohttp.ClientSession(connector=connector) as session:
            tasks = [self.fetch_and_parse_url(session, url) for url in urls]
            results = await asyncio.gather(*tasks, return_exceptions=True)

        return [r for r in results if not isinstance(r, Exception)]

    async def fetch_and_parse_url(self, session, url):
        """Fetch and parse a single URL"""
        html = await self.fetch_with_rate_limit(session, url)
        return self.parse_html(html, url)


# Usage
async def main():
    scraper = AsyncScraper(max_concurrent=5, delay=0.5)

    urls = [
        'https://example.com',
        'https://httpbin.org/html',
        'https://python.org',
        'https://docs.python.org/3/',
    ]

    results = await scraper.scrape_urls(urls)

    for result in results:
        if result:
            print(f"Title: {result['title']}")
            print(f"URL: {result['url']}")
            print(f"Headings: {len(result['headings'])}")
            print(f"Links: {len(result['links'])}")
            print("-" * 50)


if __name__ == "__main__":
    asyncio.run(main())
```
## Using ThreadPoolExecutor for CPU-Intensive Parsing
When dealing with large HTML documents, you can offload Beautiful Soup parsing to a thread pool:
```python
import asyncio
import functools
from concurrent.futures import ThreadPoolExecutor

import aiohttp
from bs4 import BeautifulSoup


def parse_html_sync(html, url):
    """Synchronous HTML parsing function"""
    soup = BeautifulSoup(html, 'html.parser')

    # CPU-intensive parsing operations
    data = {
        'url': url,
        'title': soup.title.string if soup.title else None,
        'text_content': soup.get_text()[:1000],  # First 1000 chars
        'link_count': len(soup.find_all('a')),
        'image_count': len(soup.find_all('img')),
        'word_count': len(soup.get_text().split())
    }
    return data


async def fetch_and_parse_threaded(session, url, executor):
    """Fetch URL and parse in thread pool"""
    try:
        async with session.get(url) as response:
            html = await response.text()

            # Run parsing in thread pool to avoid blocking the event loop
            loop = asyncio.get_running_loop()
            parse_func = functools.partial(parse_html_sync, html, url)
            result = await loop.run_in_executor(executor, parse_func)
            return result
    except Exception as e:
        return {'url': url, 'error': str(e)}


async def main():
    urls = [
        'https://en.wikipedia.org/wiki/Python_(programming_language)',
        'https://docs.python.org/3/library/asyncio.html',
        'https://realpython.com/python-web-scraping-practical-introduction/',
    ]

    # Create a thread pool for the CPU-intensive parsing
    with ThreadPoolExecutor(max_workers=4) as executor:
        async with aiohttp.ClientSession() as session:
            tasks = [
                fetch_and_parse_threaded(session, url, executor)
                for url in urls
            ]
            results = await asyncio.gather(*tasks)

            for result in results:
                if 'error' not in result:
                    print(f"URL: {result['url']}")
                    print(f"Title: {result['title']}")
                    print(f"Word count: {result['word_count']}")
                    print(f"Links: {result['link_count']}")
                    print("-" * 50)


if __name__ == "__main__":
    asyncio.run(main())
```
## Best Practices
- **Error Handling**: Always wrap network requests in try-except blocks
- **Rate Limiting**: Use semaphores to control concurrent requests
- **Timeouts**: Set appropriate timeouts for HTTP requests
- **Session Reuse**: Use a single aiohttp session for multiple requests
- **Resource Cleanup**: Use context managers for proper resource management
- **Thread Pool**: For CPU-intensive parsing, use `run_in_executor` (see the sketch after this list)
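If you're on Python 3.9 or newer, `asyncio.to_thread` is a lighter-weight alternative to managing a `ThreadPoolExecutor` yourself: it runs a blocking function on the default thread pool. Here's a minimal sketch; `parse_title` and `fetch_title` are illustrative helpers invented for this example, not functions used above.

```python
import asyncio

import aiohttp
from bs4 import BeautifulSoup


def parse_title(html):
    """Blocking parse step that we push off the event loop."""
    soup = BeautifulSoup(html, 'html.parser')
    return soup.title.get_text(strip=True) if soup.title else None


async def fetch_title(session, url):
    async with session.get(url) as response:
        html = await response.text()
    # asyncio.to_thread (Python 3.9+) runs the blocking call in the default thread pool
    return await asyncio.to_thread(parse_title, html)


async def main():
    async with aiohttp.ClientSession() as session:
        print(await fetch_title(session, 'https://example.com'))


if __name__ == "__main__":
    asyncio.run(main())
```

Because `asyncio.to_thread` always uses the default executor, a dedicated `ThreadPoolExecutor` (as in the section above) remains the better choice when you want explicit control over the number of parser threads.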
## Key Considerations
- **Beautiful Soup remains synchronous**: Only the HTTP requests are asynchronous
- **Network I/O is the bottleneck**: Parsing is usually fast compared to network requests
- **Memory usage**: Be mindful of memory when processing many large documents
- **Respect robots.txt**: Always check website scraping policies (a minimal check is sketched after this list)
- **Rate limiting**: Implement delays to avoid overwhelming target servers
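To act on the robots.txt point above, the standard library's `urllib.robotparser` can check whether a URL may be fetched at all. This is a rough, synchronous sketch; the user agent string is a placeholder you would replace with your own.

```python
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser


def allowed_by_robots(url, user_agent="MyScraperBot"):
    """Return True if robots.txt permits fetching this URL for the given agent."""
    parts = urlparse(url)
    robots_url = f"{parts.scheme}://{parts.netloc}/robots.txt"

    parser = RobotFileParser()
    parser.set_url(robots_url)
    parser.read()  # note: this is a blocking HTTP request
    return parser.can_fetch(user_agent, url)


if __name__ == "__main__":
    print(allowed_by_robots("https://example.com/some/page"))
```

Since `RobotFileParser.read()` blocks, in a fully asynchronous pipeline you could instead fetch robots.txt with aiohttp and feed its lines to `RobotFileParser.parse()`.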
This approach allows you to efficiently scrape multiple websites concurrently while leveraging Beautiful Soup's powerful HTML parsing capabilities.