How do I update or replace content in a Beautiful Soup parse tree?

Beautiful Soup provides powerful methods to update and replace content within HTML/XML parse trees. You can modify text, attributes, and entire tags, and you can manipulate the document structure dynamically.

Modifying Text Content

Replacing Simple Text

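Use the .string attribute to replace the text of an element that has a single text child. Note that assigning to .string replaces all of the tag's existing contents, so any child tags are discarded: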

from bs4 import BeautifulSoup

html_doc = '<p id="my_paragraph">Old text</p>'
soup = BeautifulSoup(html_doc, 'html.parser')

# Find and replace text
p_tag = soup.find('p', id='my_paragraph')
p_tag.string = 'New text'

print(soup.prettify())
# Output (prettify() puts the tag and its text on separate lines):
# <p id="my_paragraph">
#  New text
# </p>

Replacing Complex Content

For elements with multiple children, use .clear() and .append():

from bs4 import BeautifulSoup, NavigableString

html_doc = '<div><p>Keep this</p><span>Remove this</span></div>'
soup = BeautifulSoup(html_doc, 'html.parser')

div_tag = soup.find('div')
# Clear all content and add new
div_tag.clear()
div_tag.append(NavigableString('New content'))

print(soup)
# Output: <div>New content</div>

Using .get_text() and .string Safely

from bs4 import BeautifulSoup

html_doc = '<p>Text with <strong>bold</strong> content</p>'
soup = BeautifulSoup(html_doc, 'html.parser')

p_tag = soup.find('p')
# Get all text content
all_text = p_tag.get_text()
print(f"Full text: {all_text}")  # "Text with bold content"

# Replace only if single string child
if p_tag.string:
    p_tag.string = 'New text'
else:
    # Handle mixed content differently
    p_tag.clear()
    p_tag.string = 'New text'

Managing Attributes

Adding and Modifying Attributes

Treat tags like dictionaries to manipulate attributes:

from bs4 import BeautifulSoup

html_doc = '<img src="old.jpg" alt="Old image">'
soup = BeautifulSoup(html_doc, 'html.parser')

img_tag = soup.find('img')

# Modify existing attributes
img_tag['src'] = 'new.jpg'
img_tag['alt'] = 'New image'

# Add new attributes
img_tag['class'] = ['responsive', 'centered']
img_tag['data-lazy'] = 'true'

print(soup)
# Output: <img alt="New image" class="responsive centered" data-lazy="true" src="new.jpg"/>

Working with Multi-Value Attributes

Some attributes, like class, can have multiple values; Beautiful Soup exposes them as a list:

from bs4 import BeautifulSoup

html_doc = '<div class="container fluid"></div>'
soup = BeautifulSoup(html_doc, 'html.parser')

div_tag = soup.find('div')

# Access as list
classes = div_tag.get('class', [])
print(classes)  # ['container', 'fluid']

# Modify class list
classes.append('dark-mode')
div_tag['class'] = classes

# Or set directly
div_tag['class'] = ['new-class', 'another-class']

Removing Attributes

from bs4 import BeautifulSoup

html_doc = '<p id="temp" class="old-style" data-temp="remove">Content</p>'
soup = BeautifulSoup(html_doc, 'html.parser')

p_tag = soup.find('p')

# Remove single attribute
del p_tag['class']

# Remove multiple attributes
for attr in ['id', 'data-temp']:
    if attr in p_tag.attrs:
        del p_tag[attr]

print(soup)
# Output: <p>Content</p>

Replacing and Manipulating Tags

Complete Tag Replacement

Use .replace_with() to swap entire elements:

from bs4 import BeautifulSoup

html_doc = '''
<div>
    <p class="old">Old paragraph</p>
    <span>Keep this</span>
</div>
'''
soup = BeautifulSoup(html_doc, 'html.parser')

old_p = soup.find('p', class_='old')

# Create new tag with attributes
new_tag = soup.new_tag('article', **{'class': 'new', 'data-type': 'content'})
new_tag.string = 'New article content'

# Replace old with new
old_p.replace_with(new_tag)

print(soup.prettify())

Creating Complex New Tags

from bs4 import BeautifulSoup

soup = BeautifulSoup('<div></div>', 'html.parser')

# Create nested structure
article = soup.new_tag('article')
header = soup.new_tag('header')
title = soup.new_tag('h1')
title.string = 'Article Title'

header.append(title)
article.append(header)

# Add content
content = soup.new_tag('div', **{'class': 'content'})
content.string = 'Article content here'
article.append(content)

soup.div.replace_with(article)
print(soup.prettify())

Advanced Content Manipulation

Inserting Content at Specific Positions

from bs4 import BeautifulSoup

html_doc = '<ul><li>Item 1</li><li>Item 3</li></ul>'
soup = BeautifulSoup(html_doc, 'html.parser')

ul_tag = soup.find('ul')

# Insert at specific position
new_li = soup.new_tag('li')
new_li.string = 'Item 2'
ul_tag.insert(1, new_li)  # Insert at index 1

# Insert at beginning
first_li = soup.new_tag('li')
first_li.string = 'Item 0'
ul_tag.insert(0, first_li)

print(soup.prettify())

Wrapping and Unwrapping Elements

from bs4 import BeautifulSoup

html_doc = '<p>Text to wrap</p>'
soup = BeautifulSoup(html_doc, 'html.parser')

p_tag = soup.find('p')

# Wrap with new tag
wrapper = soup.new_tag('div', **{'class': 'wrapper'})
p_tag.wrap(wrapper)

print(soup)
# Output: <div class="wrapper"><p>Text to wrap</p></div>

# Unwrap (remove wrapper, keep content)
wrapper.unwrap()
print(soup)
# Output: <p>Text to wrap</p>

Removing Content

Different Removal Methods

from bs4 import BeautifulSoup

html_doc = '''
<div>
    <p class="remove">Remove completely</p>
    <p class="extract">Extract this</p>
    <p class="clear">Clear content</p>
</div>
'''
soup = BeautifulSoup(html_doc, 'html.parser')

# decompose() - completely destroys element
soup.find('p', class_='remove').decompose()

# extract() - removes but keeps in memory
extracted = soup.find('p', class_='extract').extract()
print(f"Extracted: {extracted}")

# clear() - removes content but keeps tag
soup.find('p', class_='clear').clear()

print(soup.prettify())

Practical Examples

Updating Links in HTML

from bs4 import BeautifulSoup
import re

html_doc = '''
<div>
    <a href="http://old-domain.com/page1">Link 1</a>
    <a href="http://old-domain.com/page2">Link 2</a>
    <a href="http://other-site.com">External</a>
</div>
'''
soup = BeautifulSoup(html_doc, 'html.parser')

# Update specific domain links
for link in soup.find_all('a', href=re.compile(r'old-domain\.com')):
    old_href = link['href']
    new_href = old_href.replace('old-domain.com', 'new-domain.com')
    link['href'] = new_href

    # Add migration indicator
    link['data-migrated'] = 'true'

print(soup.prettify())

Converting Table Structure

from bs4 import BeautifulSoup

html_doc = '''
<table>
    <tr><td>Name</td><td>Age</td></tr>
    <tr><td>John</td><td>30</td></tr>
</table>
'''
soup = BeautifulSoup(html_doc, 'html.parser')

table = soup.find('table')

# Convert to div-based layout
div_container = soup.new_tag('div', **{'class': 'table-container'})

for row in table.find_all('tr'):
    row_div = soup.new_tag('div', **{'class': 'table-row'})

    for cell in row.find_all('td'):
        cell_div = soup.new_tag('div', **{'class': 'table-cell'})
        cell_div.string = cell.get_text()
        row_div.append(cell_div)

    div_container.append(row_div)

table.replace_with(div_container)
print(soup.prettify())

Best Practices

  1. Always check if elements exist before modifying:
   element = soup.find('target')
   if element:
       element.string = 'New content'
  2. Use appropriate removal methods:

    • .decompose() for permanent removal
    • .extract() when you might reuse the element
    • .clear() to empty content while keeping the tag
  3. Handle encoding properly:

   # Convert back to string when needed
   modified_html = str(soup)

   # Or encode to bytes with a specific encoding
   modified_html = soup.encode('utf-8')
  4. Preserve document structure when making bulk changes by working with copies when necessary.

Beautiful Soup's modification capabilities make it an excellent choice for HTML/XML document transformation and content management tasks.

Related Questions

Get Started Now

WebScraping.AI provides rotating proxies, Chromium rendering, and a built-in HTML parser for web scraping.
Icon