How do I use Beautiful Soup to extract all image sources from a webpage?

Beautiful Soup is a Python library that makes web scraping straightforward by providing intuitive methods to parse and navigate HTML/XML documents. Extracting image sources is one of the most common web scraping tasks.

Quick Answer

To extract all image sources from a webpage using Beautiful Soup:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = 'https://example.com'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Extract all image URLs
image_urls = []
for img in soup.find_all('img'):
    src = img.get('src')
    if src:
        image_urls.append(urljoin(url, src))

print(image_urls)

Step-by-Step Guide

1. Installation

Install the required packages:

pip install beautifulsoup4 requests lxml
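
The examples in this guide use Python's built-in html.parser. Since lxml is installed above, you can optionally pass 'lxml' as the parser, which is typically faster on large pages. A quick comparison on a trivial snippet:

from bs4 import BeautifulSoup

html = "<html><body><img src='/a.png'></body></html>"

# Built-in parser: no extra dependency required
soup_builtin = BeautifulSoup(html, 'html.parser')

# lxml parser: uses the lxml package installed above, usually faster
soup_lxml = BeautifulSoup(html, 'lxml')

print(soup_builtin.img['src'], soup_lxml.img['src'])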

2. Basic Image Extraction

Here's a more complete function that adds request headers, a timeout, and error handling:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def extract_image_urls(url, include_data_urls=False):
    """
    Extract all image URLs from a webpage

    Args:
        url (str): The webpage URL to scrape
        include_data_urls (bool): Whether to include data: URLs

    Returns:
        list: List of image URLs
    """
    try:
        # Add headers to avoid being blocked
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        image_urls = []

        # Find all img tags
        for img in soup.find_all('img'):
            # Check both src and data-src attributes
            src = img.get('src') or img.get('data-src')

            if src:
                # Skip data URLs unless requested
                if src.startswith('data:') and not include_data_urls:
                    continue

                # Convert relative URLs to absolute
                if not src.startswith(('http://', 'https://', 'data:')):
                    src = urljoin(url, src)

                image_urls.append(src)

        return image_urls

    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []

# Usage
url = 'https://example.com'
images = extract_image_urls(url)
print(f"Found {len(images)} images:")
for img_url in images:
    print(img_url)

3. Advanced Image Extraction with Filtering

Extract images with additional metadata and filtering options:

def extract_images_with_details(url, min_size=None, file_types=None):
    """
    Extract images with additional details and filtering

    Args:
        url (str): The webpage URL
        min_size (tuple): Minimum (width, height) in pixels
        file_types (list): List of allowed file extensions

    Returns:
        list: List of dictionaries with image details
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        images = []

        for img in soup.find_all('img'):
            src = img.get('src') or img.get('data-src')
            if not src:
                continue

            # Convert relative URLs
            if not src.startswith(('http://', 'https://', 'data:')):
                src = urljoin(url, src)

            # Filter by file type
            if file_types:
                file_ext = src.lower().split('.')[-1].split('?')[0]
                if file_ext not in file_types:
                    continue

            # Get image details
            image_info = {
                'url': src,
                'alt': img.get('alt', ''),
                'title': img.get('title', ''),
                'width': img.get('width'),
                'height': img.get('height'),
                'class': img.get('class', []),
                'id': img.get('id', '')
            }

            # Filter by minimum size (only applies when width/height attributes are declared)
            if min_size and image_info['width'] and image_info['height']:
                try:
                    width = int(image_info['width'])
                    height = int(image_info['height'])
                    if width < min_size[0] or height < min_size[1]:
                        continue
                except ValueError:
                    pass

            images.append(image_info)

        return images

    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []

# Usage examples
url = 'https://example.com'

# Get all images
all_images = extract_images_with_details(url)

# Get only large JPG/PNG images
large_images = extract_images_with_details(
    url, 
    min_size=(200, 200), 
    file_types=['jpg', 'jpeg', 'png']
)

print(f"Found {len(large_images)} large images")
for img in large_images:
    print(f"URL: {img['url']}")
    print(f"Alt text: {img['alt']}")
    print(f"Dimensions: {img['width']}x{img['height']}")
    print("---")

4. Handling Lazy-Loaded Images

Many modern websites lazy-load images, keeping the real URL in attributes like data-src until JavaScript swaps it into src. Since requests does not execute JavaScript, the practical approach is to read those attributes directly. Here's how to handle common patterns:

def extract_lazy_loaded_images(url):
    """Extract images including lazy-loaded ones"""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        image_urls = set()  # Use set to avoid duplicates

        # Common lazy loading attributes
        lazy_attrs = ['data-src', 'data-lazy', 'data-original', 'data-url']

        for img in soup.find_all('img'):
            # Check standard src first
            src = img.get('src')
            if src and not src.startswith('data:image'):
                image_urls.add(urljoin(url, src))

            # Check lazy loading attributes
            for attr in lazy_attrs:
                lazy_src = img.get(attr)
                if lazy_src and not lazy_src.startswith('data:image'):
                    image_urls.add(urljoin(url, lazy_src))

        # Also check for images in srcset attributes
        for img in soup.find_all('img'):
            srcset = img.get('srcset')
            if srcset:
                # Parse srcset (format: "url1 1x, url2 2x" or "url1 400w, url2 800w")
                for src_item in srcset.split(','):
                    parts = src_item.strip().split()
                    if parts:  # Skip empty entries, e.g. from a trailing comma
                        image_urls.add(urljoin(url, parts[0]))

        return list(image_urls)

    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []

# Usage
lazy_images = extract_lazy_loaded_images('https://example.com')
print(f"Found {len(lazy_images)} images (including lazy-loaded)")

5. Extracting Background Images from CSS

Some images are applied as CSS background images rather than <img> tags. The helper below scans inline style attributes for them:

import re

def extract_background_images(url):
    """Extract background images from CSS"""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        background_images = []

        # Extract from inline styles
        elements_with_style = soup.find_all(attrs={'style': True})

        for element in elements_with_style:
            style = element.get('style', '')
            # Find background-image (and background shorthand) URLs
            bg_urls = re.findall(r'background(?:-image)?\s*:\s*url\(["\']?([^"\')]+)["\']?\)', style)

            for bg_url in bg_urls:
                if not bg_url.startswith(('http://', 'https://')):
                    bg_url = urljoin(url, bg_url)
                background_images.append(bg_url)

        return background_images

    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []
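
Usage follows the same pattern as the earlier helpers. Note that the function above only inspects inline style attributes; a hypothetical extension that also scans <style> blocks on the same page (reusing the regex approach and imports from above) could look like this:

# Usage
bg_images = extract_background_images('https://example.com')
print(f"Found {len(bg_images)} background images")

def extract_style_block_images(soup, base_url):
    """Scan <style> tags for url(...) references (sketch; soup and base_url come from the caller)"""
    urls = []
    for style_tag in soup.find_all('style'):
        css = style_tag.get_text()
        for match in re.findall(r'url\(["\']?([^"\')]+)["\']?\)', css):
            if not match.startswith('data:'):
                urls.append(urljoin(base_url, match))
    return urls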

Common Issues and Solutions

1. Handling Relative URLs

Always use urllib.parse.urljoin() to convert relative URLs to absolute ones:

from urllib.parse import urljoin

# Wrong
image_url = '/images/photo.jpg'

# Correct
base_url = 'https://example.com'
image_url = urljoin(base_url, '/images/photo.jpg')
# Result: 'https://example.com/images/photo.jpg'

2. Adding Request Headers

Some websites block requests without proper headers:

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
}

response = requests.get(url, headers=headers)

3. Error Handling

Always include proper error handling:

try:
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # Raises an HTTPError for 4xx/5xx responses
except requests.exceptions.RequestException as e:
    print(f"Error: {e}")
    return []  # This snippet assumes it runs inside a function; return a safe default

Best Practices

  1. Respect robots.txt: Check the website's robots.txt file before scraping
  2. Add delays: Use time.sleep() between requests to avoid overwhelming servers
  3. Handle timeouts: Set reasonable timeout values for requests
  4. Use sessions: For multiple requests to the same domain, use requests.Session() (see the combined sketch after this list)
  5. Validate URLs: Check if extracted URLs are valid before using them
  6. Consider legal implications: Ensure you have permission to scrape and use the images
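
Several of these practices fit together naturally. Below is a minimal sketch combining a robots.txt check (via urllib.robotparser), a shared requests.Session(), a timeout, a delay between requests, and a basic URL validity check; the base URL and paths are placeholders:

import time
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser

import requests

base_url = 'https://example.com'  # placeholder

# 1. Respect robots.txt
robots = RobotFileParser()
robots.set_url(urljoin(base_url, '/robots.txt'))
robots.read()

# 4. Reuse one session for connection pooling and shared headers
session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
})

def is_valid_url(candidate):
    """5. Basic URL validation before fetching"""
    parsed = urlparse(candidate)
    return parsed.scheme in ('http', 'https') and bool(parsed.netloc)

for path in ['/page-1', '/page-2']:  # placeholder paths
    page_url = urljoin(base_url, path)
    if not is_valid_url(page_url) or not robots.can_fetch('*', page_url):
        continue
    response = session.get(page_url, timeout=10)  # 3. Handle timeouts
    time.sleep(1)  # 2. Add delays between requests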

Example: Complete Image Scraper Script

#!/usr/bin/env python3

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os
import time

def download_images(url, download_dir='images', delay=1):
    """
    Complete example: Extract and download images from a webpage
    """
    # Create download directory
    os.makedirs(download_dir, exist_ok=True)

    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        image_urls = []

        # Extract image URLs
        for img in soup.find_all('img'):
            src = img.get('src') or img.get('data-src')
            if src and not src.startswith('data:'):
                full_url = urljoin(url, src)
                image_urls.append(full_url)

        print(f"Found {len(image_urls)} images")

        # Download images
        for i, img_url in enumerate(image_urls):
            try:
                img_response = requests.get(img_url, headers=headers, timeout=10)
                img_response.raise_for_status()

                # Get filename from URL
                filename = os.path.basename(urlparse(img_url).path)
                if not filename or '.' not in filename:
                    filename = f"image_{i+1}.jpg"

                filepath = os.path.join(download_dir, filename)

                with open(filepath, 'wb') as f:
                    f.write(img_response.content)

                print(f"Downloaded: {filename}")
                time.sleep(delay)  # Be respectful

            except Exception as e:
                print(f"Failed to download {img_url}: {e}")

    except requests.RequestException as e:
        print(f"Error fetching webpage: {e}")

# Usage
if __name__ == "__main__":
    url = "https://example.com"
    download_images(url)

This comprehensive guide covers all the essential techniques for extracting image sources from webpages using Beautiful Soup, including handling modern web development patterns like lazy loading and CSS background images.

Try WebScraping.AI for Your Web Scraping Needs

Looking for a powerful web scraping solution? WebScraping.AI provides an LLM-powered API that combines Chromium JavaScript rendering with rotating proxies for reliable data extraction.

Key Features:

  • AI-powered extraction: Ask questions about web pages or extract structured data fields
  • JavaScript rendering: Full Chromium browser support for dynamic content
  • Rotating proxies: Datacenter and residential proxies from multiple countries
  • Easy integration: Simple REST API with SDKs for Python, Ruby, PHP, and more
  • Reliable & scalable: Built for developers who need consistent results

Getting Started:

Get page content with AI analysis:

curl "https://api.webscraping.ai/ai/question?url=https://example.com&question=What is the main topic?&api_key=YOUR_API_KEY"

Extract structured data:

curl "https://api.webscraping.ai/ai/fields?url=https://example.com&fields[title]=Page title&fields[price]=Product price&api_key=YOUR_API_KEY"

Try in request builder
