Beautiful Soup provides powerful methods to update and replace content within HTML/XML parse trees. You can modify text, attributes, entire tags, and manipulate the document structure dynamically.
Modifying Text Content
Replacing Simple Text
Use the .string attribute to replace the text of an element that has a single text child:
from bs4 import BeautifulSoup
html_doc = '<p id="my_paragraph">Old text</p>'
soup = BeautifulSoup(html_doc, 'html.parser')
# Find and replace text
p_tag = soup.find('p', id='my_paragraph')
p_tag.string = 'New text'
print(soup.prettify())
# Output (prettify() puts each tag and string on its own line):
# <p id="my_paragraph">
#  New text
# </p>
Replacing Complex Content
For elements with multiple children, use .clear() and .append():
from bs4 import BeautifulSoup, NavigableString
html_doc = '<div><p>Keep this</p><span>Remove this</span></div>'
soup = BeautifulSoup(html_doc, 'html.parser')
div_tag = soup.find('div')
# Clear all content and add new
div_tag.clear()
div_tag.append(NavigableString('New content'))
print(soup)
# Output: <div>New content</div>
Using .get_text() and .string Safely
from bs4 import BeautifulSoup
html_doc = '<p>Text with <strong>bold</strong> content</p>'
soup = BeautifulSoup(html_doc, 'html.parser')
p_tag = soup.find('p')
# Get all text content
all_text = p_tag.get_text()
print(f"Full text: {all_text}") # "Text with bold content"
# Replace only if single string child
if p_tag.string:
p_tag.string = 'New text'
else:
# Handle mixed content differently
p_tag.clear()
p_tag.string = 'New text'
Managing Attributes
Adding and Modifying Attributes
Treat tags like dictionaries to manipulate attributes:
from bs4 import BeautifulSoup
html_doc = '<img src="old.jpg" alt="Old image">'
soup = BeautifulSoup(html_doc, 'html.parser')
img_tag = soup.find('img')
# Modify existing attributes
img_tag['src'] = 'new.jpg'
img_tag['alt'] = 'New image'
# Add new attributes
img_tag['class'] = ['responsive', 'centered']
img_tag['data-lazy'] = 'true'
print(soup)
# Output: <img alt="New image" class="responsive centered" data-lazy="true" src="new.jpg"/>
Working with Multi-Value Attributes
Some attributes, like class, can have multiple values:
from bs4 import BeautifulSoup
html_doc = '<div class="container fluid"></div>'
soup = BeautifulSoup(html_doc, 'html.parser')
div_tag = soup.find('div')
# Access as list
classes = div_tag.get('class', [])
print(classes) # ['container', 'fluid']
# Modify class list
classes.append('dark-mode')
div_tag['class'] = classes
# Or set directly
div_tag['class'] = ['new-class', 'another-class']
Removing Attributes
from bs4 import BeautifulSoup
html_doc = '<p id="temp" class="old-style" data-temp="remove">Content</p>'
soup = BeautifulSoup(html_doc, 'html.parser')
p_tag = soup.find('p')
# Remove single attribute
del p_tag['class']
# Remove multiple attributes
for attr in ['id', 'data-temp']:
if attr in p_tag.attrs:
del p_tag[attr]
print(soup)
# Output: <p>Content</p>
Replacing and Manipulating Tags
Complete Tag Replacement
Use .replace_with() to swap entire elements:
from bs4 import BeautifulSoup
html_doc = '''
<div>
<p class="old">Old paragraph</p>
<span>Keep this</span>
</div>
'''
soup = BeautifulSoup(html_doc, 'html.parser')
old_p = soup.find('p', class_='old')
# Create new tag with attributes
new_tag = soup.new_tag('article', **{'class': 'new', 'data-type': 'content'})
new_tag.string = 'New article content'
# Replace old with new
old_p.replace_with(new_tag)
print(soup.prettify())
Creating Complex New Tags
from bs4 import BeautifulSoup
soup = BeautifulSoup('<div></div>', 'html.parser')
# Create nested structure
article = soup.new_tag('article')
header = soup.new_tag('header')
title = soup.new_tag('h1')
title.string = 'Article Title'
header.append(title)
article.append(header)
# Add content
content = soup.new_tag('div', **{'class': 'content'})
content.string = 'Article content here'
article.append(content)
soup.div.replace_with(article)
print(soup.prettify())
Advanced Content Manipulation
Inserting Content at Specific Positions
from bs4 import BeautifulSoup
html_doc = '<ul><li>Item 1</li><li>Item 3</li></ul>'
soup = BeautifulSoup(html_doc, 'html.parser')
ul_tag = soup.find('ul')
# Insert at specific position
new_li = soup.new_tag('li')
new_li.string = 'Item 2'
ul_tag.insert(1, new_li) # Insert at index 1
# Insert at beginning
first_li = soup.new_tag('li')
first_li.string = 'Item 0'
ul_tag.insert(0, first_li)
print(soup.prettify())
Wrapping and Unwrapping Elements
from bs4 import BeautifulSoup
html_doc = '<p>Text to wrap</p>'
soup = BeautifulSoup(html_doc, 'html.parser')
p_tag = soup.find('p')
# Wrap with new tag
wrapper = soup.new_tag('div', **{'class': 'wrapper'})
p_tag.wrap(wrapper)
print(soup)
# Output: <div class="wrapper"><p>Text to wrap</p></div>
# Unwrap (remove wrapper, keep content)
wrapper.unwrap()
print(soup)
# Output: <p>Text to wrap</p>
Removing Content
Different Removal Methods
from bs4 import BeautifulSoup
html_doc = '''
<div>
<p class="remove">Remove completely</p>
<p class="extract">Extract this</p>
<p class="clear">Clear content</p>
</div>
'''
soup = BeautifulSoup(html_doc, 'html.parser')
# decompose() - completely destroys element
soup.find('p', class_='remove').decompose()
# extract() - removes but keeps in memory
extracted = soup.find('p', class_='extract').extract()
print(f"Extracted: {extracted}")
# clear() - removes content but keeps tag
soup.find('p', class_='clear').clear()
print(soup.prettify())
Practical Examples
Updating Links in HTML
from bs4 import BeautifulSoup
import re
html_doc = '''
<div>
<a href="http://old-domain.com/page1">Link 1</a>
<a href="http://old-domain.com/page2">Link 2</a>
<a href="http://other-site.com">External</a>
</div>
'''
soup = BeautifulSoup(html_doc, 'html.parser')
# Update specific domain links
for link in soup.find_all('a', href=re.compile(r'old-domain\.com')):
old_href = link['href']
new_href = old_href.replace('old-domain.com', 'new-domain.com')
link['href'] = new_href
# Add migration indicator
link['data-migrated'] = 'true'
print(soup.prettify())
Converting Table Structure
from bs4 import BeautifulSoup
html_doc = '''
<table>
<tr><td>Name</td><td>Age</td></tr>
<tr><td>John</td><td>30</td></tr>
</table>
'''
soup = BeautifulSoup(html_doc, 'html.parser')
table = soup.find('table')
# Convert to div-based layout
div_container = soup.new_tag('div', **{'class': 'table-container'})
for row in table.find_all('tr'):
row_div = soup.new_tag('div', **{'class': 'table-row'})
for cell in row.find_all('td'):
cell_div = soup.new_tag('div', **{'class': 'table-cell'})
cell_div.string = cell.get_text()
row_div.append(cell_div)
div_container.append(row_div)
table.replace_with(div_container)
print(soup.prettify())
Best Practices
- Always check if elements exist before modifying:
element = soup.find('target')
if element:
element.string = 'New content'
- Use appropriate removal methods:
  - .decompose() for permanent removal
  - .extract() when you might reuse the element
  - .clear() to empty content while keeping the tag
- Handle encoding properly:
# Convert back to string when needed
modified_html = str(soup)
# Or encode to bytes with a specific encoding
modified_html = soup.encode('utf-8')
- Preserve document structure when making bulk changes by working with copies when necessary.
Beautiful Soup's modification capabilities make it an excellent choice for HTML/XML document transformation and content management tasks.