Web scraping and saving data to CSV files is a common task in data extraction and analysis. Python provides excellent tools for this purpose: the requests library for HTTP, BeautifulSoup for HTML parsing, and the built-in csv module for writing the output file.
Prerequisites and Setup
Required Libraries
Install the necessary packages using pip:
pip install requests beautifulsoup4 lxml pandas
- requests: HTTP library for making web requests
- beautifulsoup4: HTML/XML parser for extracting data
- lxml: Fast XML and HTML parser (optional but recommended)
- pandas: Data manipulation library (an alternative to the csv module for writing CSVs; it can also read HTML tables directly, as sketched below)
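If the target page exposes its data as a plain HTML table, pandas can parse it without any manual BeautifulSoup work. The following is a minimal sketch, assuming lxml is installed and that the placeholder URL actually serves at least one <table> element; read_html returns one DataFrame per table it finds:
import pandas as pd
import requests
from io import StringIO
url = 'https://example.com/data-table'  # placeholder URL; point this at a page that contains an HTML table
response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
response.raise_for_status()
tables = pd.read_html(StringIO(response.text))  # one DataFrame per <table> in the page
tables[0].to_csv('table.csv', index=False)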
Basic Web Scraping to CSV
Method 1: Using csv Module (Lightweight)
import csv
import requests
from bs4 import BeautifulSoup
def scrape_to_csv(url, output_file):
    try:
        # Add headers to avoid being blocked
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        # Make HTTP request with timeout
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raises exception for bad status codes
        # Parse HTML content
        soup = BeautifulSoup(response.content, 'lxml')
        # Example: Scraping a table
        table = soup.find('table', {'class': 'data-table'})
        if not table:
            print("No table found with the specified selector")
            return
        rows = table.find_all('tr')
        # Write to CSV file
        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            # Extract and write the column headers from the first row
            if rows:
                column_headers = [th.get_text(strip=True) for th in rows[0].find_all(['th', 'td'])]
                writer.writerow(column_headers)
                # Extract and write data rows
                for row in rows[1:]:
                    data = [td.get_text(strip=True) for td in row.find_all(['td', 'th'])]
                    writer.writerow(data)
        print(f"Data successfully scraped and saved to {output_file}")
    except requests.RequestException as e:
        print(f"Request failed: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")
# Usage
scrape_to_csv('https://example.com/data-table', 'scraped_data.csv')
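The data-table class in the example is only a placeholder; real pages name their tables differently, or not at all. A quick way to pick the right selector is to list every table on the page first. A small exploratory sketch, reusing the placeholder URL:
import requests
from bs4 import BeautifulSoup
def list_tables(url):
    """Print the index, class, and id of every <table> on a page to help choose a selector."""
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')
    for i, table in enumerate(soup.find_all('table')):
        print(i, 'class:', table.get('class'), 'id:', table.get('id'))
list_tables('https://example.com/data-table')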
Method 2: Using Pandas (More Features)
import pandas as pd
import requests
from bs4 import BeautifulSoup
def scrape_with_pandas(url, output_file):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')
        # Extract data into a list of dictionaries
        data = []
        # Example: Scraping product information
        products = soup.find_all('div', {'class': 'product-item'})
        for product in products:
            # Look up each element once, then fall back to 'N/A' when it is missing
            name_el = product.find('h3', {'class': 'product-name'})
            price_el = product.find('span', {'class': 'price'})
            rating_el = product.find('div', {'class': 'rating'})
            item = {
                'name': name_el.get_text(strip=True) if name_el else 'N/A',
                'price': price_el.get_text(strip=True) if price_el else 'N/A',
                'rating': rating_el.get_text(strip=True) if rating_el else 'N/A'
            }
            data.append(item)
        # Create DataFrame and save to CSV
        df = pd.DataFrame(data)
        df.to_csv(output_file, index=False, encoding='utf-8')
        print(f"Data scraped and saved to {output_file}")
        print(f"Total records: {len(data)}")
    except Exception as e:
        print(f"Error: {e}")
# Usage
scrape_with_pandas('https://example.com/products', 'products.csv')
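One advantage of the pandas route is that the data can be cleaned after scraping. Prices scraped as text usually look like '$19.99'; the sketch below assumes the products.csv produced above with a text column literally named price, strips currency symbols, and coerces anything unparseable to NaN:
import pandas as pd
def clean_prices(input_file, output_file):
    # Load the scraped CSV, keep only digits and dots in the price column, and convert to numbers
    df = pd.read_csv(input_file)
    df['price'] = pd.to_numeric(df['price'].str.replace(r'[^0-9.]', '', regex=True), errors='coerce')
    df.sort_values('price').to_csv(output_file, index=False)
clean_prices('products.csv', 'products_clean.csv')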
Advanced Scraping Techniques
Handling Multiple Pages
import csv
import requests
from bs4 import BeautifulSoup
import time
def scrape_multiple_pages(base_url, max_pages, output_file):
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        headers_written = False
        for page in range(1, max_pages + 1):
            url = f"{base_url}?page={page}"
            print(f"Scraping page {page}...")
            try:
                response = requests.get(url, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'lxml')
                rows = soup.find_all('tr', {'class': 'data-row'})
                if not rows:
                    print(f"No data found on page {page}")
                    break
                # Write headers only once
                if not headers_written:
                    headers = ['Column1', 'Column2', 'Column3']  # Replace with your actual column names
                    writer.writerow(headers)
                    headers_written = True
                # Extract and write data
                for row in rows:
                    data = [cell.get_text(strip=True) for cell in row.find_all('td')]
                    writer.writerow(data)
                # Be respectful - add delay between requests
                time.sleep(1)
            except Exception as e:
                print(f"Error on page {page}: {e}")
                continue
    print(f"Scraping completed. Data saved to {output_file}")
# Usage
scrape_multiple_pages('https://example.com/data', 5, 'multi_page_data.csv')
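The ?page=N pattern only works when the site numbers its pages and you know how many exist. Many sites instead expose a "next" link; the variant below follows it until it disappears. The a.next-page selector and the URL are assumptions, so adjust them to the markup you actually see:
import csv
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
def scrape_following_next(start_url, output_file, max_pages=50):
    url = start_url
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        for _ in range(max_pages):  # hard cap so a bad selector cannot loop forever
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'lxml')
            for row in soup.find_all('tr', {'class': 'data-row'}):
                writer.writerow([cell.get_text(strip=True) for cell in row.find_all('td')])
            next_link = soup.select_one('a.next-page')  # hypothetical selector for the "next" link
            if not next_link or not next_link.get('href'):
                break
            url = urljoin(url, next_link['href'])  # resolve relative links against the current page
            time.sleep(1)  # be polite between pages
scrape_following_next('https://example.com/data', 'all_pages.csv')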
Handling Forms and POST Requests
import csv
import requests
from bs4 import BeautifulSoup
def scrape_with_form_data(url, form_data, output_file):
    session = requests.Session()
    try:
        # First, get the page to extract any CSRF tokens or hidden fields
        response = session.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')
        # Extract CSRF token if present
        csrf_token = soup.find('input', {'name': 'csrf_token'})
        if csrf_token:
            form_data['csrf_token'] = csrf_token.get('value')
        # Submit form data
        post_response = session.post(url, data=form_data, timeout=10)
        post_response.raise_for_status()
        # Parse results
        result_soup = BeautifulSoup(post_response.content, 'lxml')
        # Extract data and save to CSV
        results = result_soup.find_all('div', {'class': 'search-result'})
        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['Title', 'Description', 'URL'])
            for result in results:
                title = result.find('h3').get_text(strip=True) if result.find('h3') else 'N/A'
                desc = result.find('p').get_text(strip=True) if result.find('p') else 'N/A'
                link = result.find('a').get('href', 'N/A') if result.find('a') else 'N/A'
                writer.writerow([title, desc, link])
        print(f"Form data scraped and saved to {output_file}")
    except Exception as e:
        print(f"Error: {e}")
# Usage
form_data = {
    'search_query': 'python web scraping',
    'category': 'programming'
}
scrape_with_form_data('https://example.com/search', form_data, 'search_results.csv')
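The CSRF token above is just one hidden field; many forms include several, and omitting any of them can make the POST fail. A more general approach is to copy every hidden input into the payload before overlaying your own values. A sketch, assuming the page's first <form> is the one being submitted:
def build_form_payload(soup, user_fields):
    """Collect all hidden inputs from the first <form>, then overlay user-supplied fields."""
    payload = {}
    form = soup.find('form')
    if form:
        for hidden in form.find_all('input', {'type': 'hidden'}):
            name = hidden.get('name')
            if name:
                payload[name] = hidden.get('value', '')
    payload.update(user_fields)
    return payload
# Usage inside scrape_with_form_data: payload = build_form_payload(soup, form_data)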
Best Practices and Error Handling
Robust Scraping Function
import csv
import requests
from bs4 import BeautifulSoup
import time
import random
class WebScraper:
    def __init__(self, delay_range=(1, 3)):
        self.session = requests.Session()
        self.delay_range = delay_range
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
    def scrape_to_csv(self, url, selectors, output_file, max_retries=3):
        """
        Scrape data using CSS selectors and save to CSV
        Args:
            url: Target URL
            selectors: Dict mapping column names to CSS selectors
            output_file: Output CSV filename
            max_retries: Number of retry attempts
        """
        for attempt in range(max_retries):
            try:
                response = self.session.get(url, timeout=15)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'lxml')
                # Extract data using selectors
                data = []
                # Use select() so the container can be a CSS selector (e.g., 'div.product')
                items = soup.select(selectors.get('container', 'div'))
                for item in items:
                    row = {}
                    for column, selector in selectors.items():
                        if column == 'container':
                            continue
                        element = item.select_one(selector)
                        row[column] = element.get_text(strip=True) if element else 'N/A'
                    data.append(row)
                # Save to CSV
                if data:
                    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
                        writer = csv.DictWriter(csvfile, fieldnames=data[0].keys())
                        writer.writeheader()
                        writer.writerows(data)
                    print(f"Successfully scraped {len(data)} items to {output_file}")
                    return True
                else:
                    print("No data found")
                    return False
            except requests.RequestException as e:
                print(f"Attempt {attempt + 1} failed: {e}")
                if attempt < max_retries - 1:
                    time.sleep(random.uniform(*self.delay_range))
        print(f"Failed to scrape after {max_retries} attempts")
        return False
    def random_delay(self):
        """Add random delay between requests"""
        time.sleep(random.uniform(*self.delay_range))
# Usage example
scraper = WebScraper()
selectors = {
    'container': 'div.product',
    'name': 'h3.product-title',
    'price': 'span.price',
    'rating': 'div.rating span'
}
scraper.scrape_to_csv('https://example.com/products', selectors, 'products.csv')
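The random_delay helper exists for exactly this situation: running the scraper against several URLs in a row without hammering the server. A short usage sketch with placeholder URLs, reusing the scraper and selectors defined above:
urls = [
    'https://example.com/products?page=1',
    'https://example.com/products?page=2',
    'https://example.com/products?page=3',
]
for page_number, url in enumerate(urls, start=1):
    # One CSV per page, with a random pause between requests
    scraper.scrape_to_csv(url, selectors, f'products_page_{page_number}.csv')
    scraper.random_delay()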
Common Issues and Solutions
1. JavaScript-Rendered Content
For sites that load content dynamically with JavaScript, use Selenium:
from selenium import webdriver
from selenium.webdriver.common.by import By
import csv
import time
def scrape_js_content(url, output_file):
    # Setup Chrome driver (install chromedriver first)
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run in background
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        time.sleep(5)  # Wait for JS to load
        # Find elements after JS execution
        items = driver.find_elements(By.CLASS_NAME, 'dynamic-item')
        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['Title', 'Content'])
            for item in items:
                title = item.find_element(By.TAG_NAME, 'h3').text
                content = item.find_element(By.TAG_NAME, 'p').text
                writer.writerow([title, content])
    finally:
        driver.quit()
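The fixed time.sleep(5) either wastes time or fires too early on slow pages. Selenium's explicit waits block only until the elements are actually present; a sketch that could replace the sleep and find_elements call, keeping the hypothetical dynamic-item class from above:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def wait_for_items(driver, timeout=15):
    """Return the dynamic items once they appear, instead of sleeping for a fixed interval."""
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, 'dynamic-item'))
    )
# Inside scrape_js_content: items = wait_for_items(driver)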
2. Rate Limiting and Politeness
import time
import random
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
def create_session_with_retries():
    session = requests.Session()
    # Configure retry strategy
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session
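The retrying session handles transient failures, but pacing is still your job. A usage sketch combining the two, with placeholder URLs and a random 1 to 3 second pause between requests:
session = create_session_with_retries()
urls = ['https://example.com/data?page=1', 'https://example.com/data?page=2']
for url in urls:
    response = session.get(url, timeout=10)
    response.raise_for_status()
    # ... parse the page and write rows to CSV here ...
    time.sleep(random.uniform(1, 3))  # polite pause between requests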
Legal and Ethical Considerations
- Check robots.txt: Always review the site's robots.txt file (e.g., https://example.com/robots.txt); a programmatic check is sketched after this list
- Respect rate limits: Add delays between requests
- Terms of Service: Read and comply with website terms
- Personal data: Be careful with personally identifiable information
- API alternatives: Check if the site offers an API
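The robots.txt check in the first bullet can be automated with the standard library. A minimal sketch using urllib.robotparser; the URL and user agent string are placeholders:
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser
def is_allowed(url, user_agent='MyScraperBot'):
    """Return True if the site's robots.txt permits this user agent to fetch the URL."""
    parts = urlparse(url)
    parser = RobotFileParser()
    parser.set_url(f"{parts.scheme}://{parts.netloc}/robots.txt")
    parser.read()
    return parser.can_fetch(user_agent, url)
if is_allowed('https://example.com/data-table'):
    print('OK to scrape')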
Summary
Web scraping to CSV in Python involves:
- Making HTTP requests with proper headers and error handling
- Parsing HTML content using BeautifulSoup or similar libraries
- Extracting data with CSS selectors or XPath
- Writing to CSV using the csv module or pandas
- Handling edge cases like JavaScript content and rate limiting
Choose the method that best fits your needs: the csv module for simple tasks, and pandas for more complex data manipulation requirements.