Beautiful Soup provides multiple ways to access HTML element attributes in Python. Elements behave like dictionaries, making attribute access intuitive and flexible.
Quick Example
from bs4 import BeautifulSoup
html = '<a href="https://example.com" id="main-link" target="_blank">Click here</a>'
soup = BeautifulSoup(html, 'html.parser')
link = soup.find('a')
# Three ways to access attributes
href = link['href'] # Dictionary-style
target = link.get('target') # Safe method
all_attrs = link.attrs # All attributes
Methods for Accessing Attributes
1. Dictionary-Style Access
Access attributes directly using square brackets:
from bs4 import BeautifulSoup
html = '''
<img src="image.jpg" alt="Description" width="300" height="200">
<form action="/submit" method="POST" enctype="multipart/form-data">
'''
soup = BeautifulSoup(html, 'html.parser')
# Access image attributes
img = soup.find('img')
src = img['src'] # "image.jpg"
alt = img['alt'] # "Description"
width = img['width'] # "300"
# Access form attributes
form = soup.find('form')
action = form['action'] # "/submit"
method = form['method'] # "POST"
2. Safe Access with .get() Method
Use .get() to avoid KeyError exceptions:
# Safe attribute access
img = soup.find('img')
# Returns attribute value or None if not found
title = img.get('title') # None (doesn't exist)
src = img.get('src') # "image.jpg"
# Provide default value
title = img.get('title', 'No title') # "No title"
3. Access All Attributes with .attrs
Get all attributes as a dictionary:
img = soup.find('img')
all_attributes = img.attrs
print(all_attributes)
# Output: {'src': 'image.jpg', 'alt': 'Description', 'width': '300', 'height': '200'}
# Iterate through all attributes
for attr_name, attr_value in img.attrs.items():
print(f"{attr_name}: {attr_value}")
Working with Complex Attributes
Multi-Value Attributes (like class)
Some attributes can have multiple values:
html = '<div class="main content highlighted" data-tags="python web-scraping">'
soup = BeautifulSoup(html, 'html.parser')
div = soup.find('div')
# Class attribute returns a list
classes = div['class']
print(classes) # ['main', 'content', 'highlighted']
# Join into string if needed
class_string = ' '.join(div['class'])
print(class_string) # "main content highlighted"
# Data attributes return as strings
tags = div['data-tags']
print(tags) # "python web-scraping"
Boolean Attributes
HTML boolean attributes (like disabled, checked) are handled specially:
html = '''
<input type="checkbox" checked>
<button disabled>Click me</button>
<input type="text" required readonly>
'''
soup = BeautifulSoup(html, 'html.parser')
checkbox = soup.find('input', {'type': 'checkbox'})
button = soup.find('button')
text_input = soup.find('input', {'type': 'text'})
# Boolean (valueless) attributes are stored with an empty string value
# when parsed with html.parser or lxml
print(checkbox.get('checked')) # ""
print(button.get('disabled')) # ""
print(text_input.get('required')) # ""
print(text_input.get('readonly')) # ""
# Check if boolean attribute exists
has_checked = 'checked' in checkbox.attrs # True
has_disabled = 'disabled' in button.attrs # True
Error Handling
KeyError Prevention
from bs4 import BeautifulSoup
html = '<p>Simple paragraph</p>'
soup = BeautifulSoup(html, 'html.parser')
p = soup.find('p')
# This will raise KeyError
try:
title = p['title']
except KeyError:
print("Attribute 'title' not found")
# This is safe
title = p.get('title') # Returns None
if title:
print(f"Title: {title}")
else:
print("No title attribute")
Practical Examples
Extracting Links and Their Properties
html = '''
<a href="/internal-link" class="nav-link">Home</a>
<a href="https://external.com" target="_blank" rel="nofollow">External</a>
<a href="mailto:contact@example.com">Email</a>
'''
soup = BeautifulSoup(html, 'html.parser')
for link in soup.find_all('a'):
href = link.get('href', 'No href')
target = link.get('target', 'Same window')
rel = link.get('rel', 'No rel')  # note: 'rel' is multi-valued, so this returns a list, e.g. ['nofollow']
text = link.get_text()
print(f"Text: {text}")
print(f"URL: {href}")
print(f"Target: {target}")
print(f"Rel: {rel}")
print("---")
Processing Form Elements
html = '''
<form id="user-form" method="POST" action="/users">
<input type="text" name="username" placeholder="Enter username" required>
<input type="email" name="email" value="user@example.com">
<select name="country" multiple>
<option value="us" selected>United States</option>
<option value="ca">Canada</option>
</select>
</form>
'''
soup = BeautifulSoup(html, 'html.parser')
# Form attributes
form = soup.find('form')
print(f"Form ID: {form.get('id')}")
print(f"Method: {form.get('method')}")
print(f"Action: {form.get('action')}")
# Input attributes
for input_elem in form.find_all('input'):
name = input_elem.get('name')
input_type = input_elem.get('type')
value = input_elem.get('value', 'No default value')
placeholder = input_elem.get('placeholder', 'No placeholder')
required = 'required' in input_elem.attrs
print(f"{name} ({input_type}): {value}")
print(f" Placeholder: {placeholder}")
print(f" Required: {required}")
Best Practices
- Use .get() for optional attributes to avoid KeyError exceptions
- Check attribute existence before processing: if 'href' in element.attrs:
- Handle multi-value attributes (like class) appropriately — they return lists
- Provide default values when using the .get() method
- Use .attrs when you need all attributes for processing
These methods give you complete control over HTML attribute access in Beautiful Soup, making your web scraping code more robust and flexible.