After scraping data with Beautiful Soup, you'll often want to save the extracted information to a file for analysis or storage. Here are comprehensive examples for the most common file formats.
CSV Files - Best for Tabular Data
CSV (Comma-Separated Values) is ideal for structured data that can be opened in Excel or imported into databases.
Basic CSV Example
import csv
from bs4 import BeautifulSoup
import requests
# Scrape a table from a webpage
url = 'https://example.com/table-page'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# Extract table data
data = []
table = soup.find('table')
if table:
    rows = table.find_all('tr')
    for row in rows:
        cols = row.find_all(['td', 'th'])
        row_data = [col.get_text(strip=True) for col in cols]
        data.append(row_data)
# Save to CSV
with open('scraped_table.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(data)
Advanced CSV with Headers
import csv
from bs4 import BeautifulSoup
import requests
# Scrape product information
url = 'https://example.com/products'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# Extract structured data
products = []
for product in soup.find_all('div', class_='product'):
    name = product.find('h3').get_text(strip=True) if product.find('h3') else 'N/A'
    price = product.find('span', class_='price').get_text(strip=True) if product.find('span', class_='price') else 'N/A'
    rating = product.find('div', class_='rating')['data-rating'] if product.find('div', class_='rating') else 'N/A'
    products.append([name, price, rating])
# Save with headers
with open('products.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Product Name', 'Price', 'Rating'])  # Headers
    writer.writerows(products)
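If you collect each product as a dictionary instead of a positional list, csv.DictWriter writes the same file; this is a minimal sketch that assumes a products list of dicts keyed by the column names.
import csv

# Assumes products like [{'Product Name': 'Widget', 'Price': '$9.99', 'Rating': '4.5'}, ...]
fieldnames = ['Product Name', 'Price', 'Rating']

with open('products.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()        # Header row comes from fieldnames
    writer.writerows(products)  # Each dict becomes one row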
JSON Files - Best for Structured Data
JSON is perfect for nested data structures and is easily readable by web applications.
Basic JSON Example
import json
from bs4 import BeautifulSoup
import requests
# Scrape article data
url = 'https://example.com/articles'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# Extract articles as a list of dictionaries
articles = []
for article in soup.find_all('article'):
    title = article.find('h2').get_text(strip=True) if article.find('h2') else None
    author = article.find('span', class_='author').get_text(strip=True) if article.find('span', class_='author') else None
    date = article.find('time')['datetime'] if article.find('time') else None
    content = article.find('p').get_text(strip=True) if article.find('p') else None
    articles.append({
        'title': title,
        'author': author,
        'date': date,
        'content': content
    })
# Save to JSON
with open('articles.json', 'w', encoding='utf-8') as jsonfile:
    json.dump(articles, jsonfile, ensure_ascii=False, indent=2)
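To sanity-check the output (or hand it to another script), the saved file loads straight back into Python objects with json.load:
import json

with open('articles.json', 'r', encoding='utf-8') as jsonfile:
    loaded = json.load(jsonfile)

print(f"Loaded {len(loaded)} articles")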
Nested JSON Structure
import json
from bs4 import BeautifulSoup
import requests
from datetime import datetime
# Scrape nested data (e.g., categories with products)
url = 'https://example.com/categories'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
data = {
    'scrape_date': datetime.now().strftime('%Y-%m-%d'),  # When the data was scraped
    'categories': []
}
for category in soup.find_all('div', class_='category'):
    category_name = category.find('h2').get_text(strip=True)
    products = []
    for product in category.find_all('div', class_='product'):
        products.append({
            'name': product.find('h3').get_text(strip=True),
            'price': product.find('span', class_='price').get_text(strip=True),
            'url': product.find('a')['href'] if product.find('a') else None
        })
    data['categories'].append({
        'name': category_name,
        'products': products
    })
# Save nested structure
with open('categories.json', 'w', encoding='utf-8') as jsonfile:
    json.dump(data, jsonfile, ensure_ascii=False, indent=2)
Text Files - Best for Simple Content
Text files are useful for saving scraped content, logs, or simple lists.
Basic Text Example
from bs4 import BeautifulSoup
import requests
# Scrape article content
url = 'https://example.com/article'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# Extract text content
title = soup.find('h1').get_text(strip=True) if soup.find('h1') else 'No Title'
paragraphs = soup.find_all('p')
content = '\n'.join([p.get_text(strip=True) for p in paragraphs])
# Save to text file
with open('article.txt', 'w', encoding='utf-8') as textfile:
textfile.write(f"Title: {title}\n")
textfile.write("=" * 50 + "\n\n")
textfile.write(content)
Formatted Text Output
from bs4 import BeautifulSoup
import requests
from datetime import datetime
# Scrape multiple articles
url = 'https://example.com/news'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
with open('news_articles.txt', 'w', encoding='utf-8') as textfile:
textfile.write(f"News Articles - Scraped on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
textfile.write("=" * 80 + "\n\n")
for i, article in enumerate(soup.find_all('article'), 1):
title = article.find('h2').get_text(strip=True) if article.find('h2') else 'No Title'
summary = article.find('p').get_text(strip=True) if article.find('p') else 'No Summary'
textfile.write(f"{i}. {title}\n")
textfile.write("-" * 40 + "\n")
textfile.write(f"{summary}\n\n")
Excel Files - Best for Business Use
For Excel compatibility, use the pandas library or openpyxl. A pandas example is shown first, followed by a brief openpyxl sketch.
Using Pandas for Excel
import pandas as pd
from bs4 import BeautifulSoup
import requests
# Scrape data
url = 'https://example.com/data'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# Extract data into a list of dictionaries
data = []
for item in soup.find_all('div', class_='item'):
    data.append({
        'Name': item.find('h3').get_text(strip=True) if item.find('h3') else 'N/A',
        'Price': item.find('span', class_='price').get_text(strip=True) if item.find('span', class_='price') else 'N/A',
        'Description': item.find('p').get_text(strip=True) if item.find('p') else 'N/A'
    })
# Create DataFrame and save to Excel
df = pd.DataFrame(data)
df.to_excel('scraped_data.xlsx', index=False)
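Using openpyxl Directly
If you prefer not to pull in pandas, openpyxl can write the same rows directly; this is a minimal sketch reusing the data list of dictionaries built above.
from openpyxl import Workbook

# Build a workbook and write the scraped rows manually
wb = Workbook()
ws = wb.active
ws.title = 'Scraped Data'

ws.append(['Name', 'Price', 'Description'])  # Header row
for item in data:
    ws.append([item['Name'], item['Price'], item['Description']])

wb.save('scraped_data.xlsx')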
Best Practices
Error Handling
import csv
from bs4 import BeautifulSoup
import requests
import logging
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def scrape_and_save(url, filename):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        data = []
        for item in soup.find_all('div', class_='item'):
            try:
                name = item.find('h3').get_text(strip=True)
                data.append([name])
            except AttributeError:
                logger.warning("Skipping item due to missing data")
                continue

        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['Name'])
            writer.writerows(data)

        logger.info(f"Successfully saved {len(data)} items to {filename}")
    except requests.RequestException as e:
        logger.error(f"Request failed: {e}")
    except IOError as e:
        logger.error(f"File operation failed: {e}")
# Usage
scrape_and_save('https://example.com', 'output.csv')
Appending to Existing Files
import csv
import os

# Append new data to an existing CSV, writing headers only when the file is new
def append_to_csv(filename, new_data, headers=None):
    file_exists = os.path.exists(filename)
    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        if headers and not file_exists:
            writer.writerow(headers)  # Headers written only once
        writer.writerows(new_data)
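Usage with a couple of hypothetical rows (the names and prices here are placeholders):
# Hypothetical rows from a later scraping run
new_rows = [['Widget', '$9.99'], ['Gadget', '$14.50']]
append_to_csv('data.csv', new_rows, headers=['Name', 'Price'])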
Key Points to Remember
- Always use UTF-8 encoding to handle international characters
- Include error handling for robust scraping
- Validate data before saving to avoid empty or malformed entries
- Use appropriate file formats: CSV for tabular data, JSON for structured data, text for simple content
- Add timestamps to track when data was scraped
- Consider file size - split large datasets into multiple files if needed (see the sketch below)
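As a rough sketch of the last two points, the snippet below writes rows into timestamped CSV files capped at 1,000 rows each; the rows list, the headers, and the chunk size are placeholder assumptions.
import csv
from datetime import datetime

def save_in_chunks(rows, prefix='scraped', chunk_size=1000):
    """Write rows into numbered CSV files, each capped at chunk_size rows."""
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    for part, start in enumerate(range(0, len(rows), chunk_size), 1):
        filename = f"{prefix}_{stamp}_part{part}.csv"
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['Name', 'Price'])  # Adjust headers to your data
            writer.writerows(rows[start:start + chunk_size])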