Beautiful Soup provides multiple powerful methods to filter and find specific HTML elements. You can use tag names, attributes, CSS selectors, or custom functions to locate exactly the elements you need from parsed HTML.
Setting Up Beautiful Soup
First, install the required packages:
pip install beautifulsoup4 requests lxml
Create a soup object from HTML:
from bs4 import BeautifulSoup
import requests
# From a URL
response = requests.get('https://example.com')
soup = BeautifulSoup(response.content, 'html.parser')
# From HTML string
html = '<div class="content"><p>Hello World</p></div>'
soup = BeautifulSoup(html, 'html.parser')
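Since lxml is installed above, it can also be passed as the parser; it is generally faster than the built-in html.parser. Calling prettify() is a quick way to confirm what was parsed:
soup = BeautifulSoup(html, 'lxml')
print(soup.prettify())  # Inspect the parsed tree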
Basic Filtering Methods
1. Find by Tag Name
# Find first occurrence
first_paragraph = soup.find('p')
# Find all occurrences
all_paragraphs = soup.find_all('p')
all_divs = soup.find_all('div')
# Find multiple tag types
headers = soup.find_all(['h1', 'h2', 'h3'])
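find() returns a single Tag (or None), while find_all() returns a list-like ResultSet, so you can iterate the matches directly:
for heading in headers:
    print(heading.name, heading.get_text(strip=True))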
2. Find by Attributes
# By class (note: class_ with underscore)
articles = soup.find_all('div', class_='article')
buttons = soup.find_all('button', class_='btn-primary')
# By ID
header = soup.find('div', id='main-header')
# By any attribute
images = soup.find_all('img', alt='product image')
links = soup.find_all('a', href='https://example.com')
# Multiple attributes
forms = soup.find_all('form', {'method': 'post', 'class': 'contact-form'})
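You can also pass True as an attribute value to match elements that merely have the attribute, regardless of its value:
# Any image that declares an alt attribute
images_with_alt = soup.find_all('img', alt=True)
# Any anchor that has an href
linked_anchors = soup.find_all('a', href=True)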
3. Find by Multiple Classes
# Matches elements with ANY of the listed classes (OR logic, not AND)
elements = soup.find_all('div', class_=['btn', 'btn-large'])
# Element with all specified classes (using CSS selector)
elements = soup.select('div.btn.btn-large')
CSS Selector Filtering
The select() method uses CSS selectors for more advanced filtering:
# Class selector
nav_items = soup.select('.navigation-item')
# ID selector
content = soup.select('#main-content')
# Descendant selector
nav_links = soup.select('.navigation a')
# Direct child selector
menu_items = soup.select('.menu > li')
# Attribute selectors
external_links = soup.select('a[href^="http"]')
required_inputs = soup.select('input[required]')
# Pseudo-selectors
first_paragraphs = soup.select('p:first-child')  # select() always returns a list
even_rows = soup.select('tr:nth-child(even)')
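When you want just the first match, select_one() returns a single element (or None) instead of a list:
main_content = soup.select_one('#main-content')
if main_content:
    print(main_content.get_text(strip=True))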
Advanced Filtering with Functions
Create custom filter functions for complex conditions:
def has_class_and_text(tag):
    """Find div tags with the 'product' class that contain text"""
    return (tag.name == 'div' and
            tag.has_attr('class') and
            'product' in tag['class'] and
            bool(tag.get_text().strip()))

products = soup.find_all(has_class_and_text)
def is_external_link(tag):
    """Find external links"""
    return (tag.name == 'a' and
            tag.has_attr('href') and
            tag['href'].startswith('http') and
            'example.com' not in tag['href'])

external_links = soup.find_all(is_external_link)
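As with any find_all() call, the result is iterable, so you can process the matched links directly:
for link in external_links:
    print(link.get_text(strip=True), '->', link['href'])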
Text-based Filtering
import re
# Find by exact text
specific_link = soup.find('a', string='Contact Us')
# Find by text pattern
phone_links = soup.find_all('a', string=re.compile(r'\d{3}-\d{3}-\d{4}'))
# Find elements containing specific text
def contains_price(tag):
    return tag.name and '$' in tag.get_text()

price_elements = soup.find_all(contains_price)
Limiting and Controlling Results
# Limit number of results
first_three_paragraphs = soup.find_all('p', limit=3)
# Search within specific elements
content_div = soup.find('div', id='content')
content_links = content_div.find_all('a') if content_div else []
# Non-recursive search: recursive=False restricts find_all() to the
# direct children of the element it is called on, instead of all descendants
direct_children = soup.find_all('div', recursive=False)
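Called on the soup object itself, this only inspects the document's top-level tags, so recursive=False is most useful on a specific element. A short sketch, assuming a hypothetical container with id='page':
page = soup.find('div', id='page')  # hypothetical container id
if page:
    # Only divs that are immediate children of the container
    top_level_divs = page.find_all('div', recursive=False)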
Practical Examples
Example 1: Scraping Product Information
# HTML structure:
# <div class="product-card">
#     <h3 class="product-title">Product Name</h3>
#     <span class="price">$29.99</span>
#     <div class="rating" data-rating="4.5">★★★★☆</div>
# </div>
products = soup.find_all('div', class_='product-card')
for product in products:
    title = product.find('h3', class_='product-title')
    price = product.find('span', class_='price')
    rating = product.find('div', class_='rating')

    if title and price:
        print(f"Product: {title.get_text().strip()}")
        print(f"Price: {price.get_text().strip()}")
        if rating and rating.has_attr('data-rating'):
            print(f"Rating: {rating['data-rating']}")
        print("-" * 30)
Example 2: Extracting Form Fields
# Find all input fields in a form
form = soup.find('form', id='contact-form')
if form:
    # Text inputs
    text_inputs = form.find_all('input', type='text')

    # Required fields
    required_fields = form.find_all(['input', 'select', 'textarea'], required=True)

    # All form controls
    form_controls = form.select('input, select, textarea')

    for field in required_fields:
        field_name = field.get('name', 'unnamed')
        field_type = field.name
        print(f"Required {field_type}: {field_name}")
Example 3: Finding Nested Elements
# Find articles with specific structure
def is_complete_article(tag):
    if tag.name != 'article':
        return False
    # Must have title, content, and author
    has_title = tag.find('h2') or tag.find('h3')
    has_content = tag.find('div', class_='content') or tag.find('p')
    has_author = tag.find(class_='author')
    return has_title and has_content and has_author

complete_articles = soup.find_all(is_complete_article)
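Filter functions combine naturally with further navigation, for example to print each matched article's heading:
for article in complete_articles:
    heading = article.find(['h2', 'h3'])
    print(heading.get_text(strip=True))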
Performance Tips
# Use find() instead of find_all() when you only need the first match
first_match = soup.find('div', class_='target')
# Use CSS selectors for complex queries
complex_elements = soup.select('div.container > .row .col-md-6:first-child')
# Limit search scope when possible
content_section = soup.find('section', id='content')
if content_section:
    paragraphs = content_section.find_all('p')  # Faster than searching the entire document
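When you know in advance which tags you need, a SoupStrainer can restrict parsing itself to those tags, saving time and memory on large documents; a minimal sketch using the html string from earlier:
from bs4 import BeautifulSoup, SoupStrainer

# Parse only <a> tags; everything else is skipped during parsing
only_links = SoupStrainer('a')
link_soup = BeautifulSoup(html, 'html.parser', parse_only=only_links)
all_links = link_soup.find_all('a')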
Error Handling
def safe_extract_text(element, selector):
    """Safely extract text from an element matched by a CSS selector"""
    try:
        # select_one() accepts CSS selectors like 'h3' or '.price'
        found = element.select_one(selector)
        return found.get_text().strip() if found else None
    except AttributeError:
        return None
# Usage
for product in soup.find_all('div', class_='product'):
    title = safe_extract_text(product, 'h3')
    price = safe_extract_text(product, '.price')
    if title and price:
        print(f"{title}: {price}")
Beautiful Soup's filtering capabilities allow you to precisely target the HTML elements you need for web scraping. Combine different methods and use custom functions for complex filtering requirements.