Beautiful Soup is a Python library that makes web scraping straightforward by providing intuitive methods to parse and navigate HTML/XML documents. Extracting image sources is one of the most common web scraping tasks.
Quick Answer
To extract all image sources from a webpage using Beautiful Soup:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
url = 'https://example.com'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# Extract all image URLs
image_urls = []
for img in soup.find_all('img'):
    src = img.get('src')
    if src:
        image_urls.append(urljoin(url, src))
print(image_urls)
Step-by-Step Guide
1. Installation
Install the required packages:
pip install beautifulsoup4 requests lxml
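The lxml package is optional: every example below uses the built-in html.parser, but if lxml is installed you can pass 'lxml' as the second argument to BeautifulSoup for faster, more lenient parsing. A minimal illustration:
from bs4 import BeautifulSoup
html = '<div><img src="/images/logo.png" alt="Logo"></div>'
# 'lxml' and 'html.parser' expose the same API; 'lxml' is usually faster on large pages
soup = BeautifulSoup(html, 'lxml')
print(soup.img['src'])  # /images/logo.png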
2. Basic Image Extraction
Here's a comprehensive function to extract image URLs:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
def extract_image_urls(url, include_data_urls=False):
    """
    Extract all image URLs from a webpage
    Args:
        url (str): The webpage URL to scrape
        include_data_urls (bool): Whether to include data: URLs
    Returns:
        list: List of image URLs
    """
    try:
        # Add headers to avoid being blocked
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        image_urls = []
        # Find all img tags
        for img in soup.find_all('img'):
            # Check both src and data-src attributes
            src = img.get('src') or img.get('data-src')
            if src:
                # Skip data URLs unless requested
                if src.startswith('data:') and not include_data_urls:
                    continue
                # Convert relative URLs to absolute
                if not src.startswith(('http://', 'https://', 'data:')):
                    src = urljoin(url, src)
                image_urls.append(src)
        return image_urls
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []
# Usage
url = 'https://example.com'
images = extract_image_urls(url)
print(f"Found {len(images)} images:")
for img_url in images:
    print(img_url)
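If you prefer CSS selectors, soup.select() gives an equivalent, more compact extraction; the selector img[src] only matches tags that actually carry a src attribute. A short sketch, reusing the same example URL:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
url = 'https://example.com'
soup = BeautifulSoup(requests.get(url, timeout=10).text, 'html.parser')
# img[src] skips <img> tags that have no src attribute at all
image_urls = [urljoin(url, img['src'])
              for img in soup.select('img[src]')
              if not img['src'].startswith('data:')]
print(image_urls)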
3. Advanced Image Extraction with Filtering
Extract images with additional metadata and filtering options:
def extract_images_with_details(url, min_size=None, file_types=None):
    """
    Extract images with additional details and filtering
    Args:
        url (str): The webpage URL
        min_size (tuple): Minimum (width, height) in pixels
        file_types (list): List of allowed file extensions
    Returns:
        list: List of dictionaries with image details
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        images = []
        for img in soup.find_all('img'):
            src = img.get('src') or img.get('data-src')
            if not src:
                continue
            # Convert relative URLs
            if not src.startswith(('http://', 'https://', 'data:')):
                src = urljoin(url, src)
            # Filter by file type (use the URL path so query strings and fragments don't interfere)
            if file_types:
                file_ext = urlparse(src).path.lower().rsplit('.', 1)[-1]
                if file_ext not in file_types:
                    continue
            # Get image details
            image_info = {
                'url': src,
                'alt': img.get('alt', ''),
                'title': img.get('title', ''),
                'width': img.get('width'),
                'height': img.get('height'),
                'class': img.get('class', []),
                'id': img.get('id', '')
            }
            # Filter by minimum size
            if min_size and image_info['width'] and image_info['height']:
                try:
                    width = int(image_info['width'])
                    height = int(image_info['height'])
                    if width < min_size[0] or height < min_size[1]:
                        continue
                except ValueError:
                    pass
            images.append(image_info)
        return images
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []
# Usage examples
url = 'https://example.com'
# Get all images
all_images = extract_images_with_details(url)
# Get only large JPG/PNG images
large_images = extract_images_with_details(
    url, 
    min_size=(200, 200), 
    file_types=['jpg', 'jpeg', 'png']
)
print(f"Found {len(large_images)} large images")
for img in large_images:
    print(f"URL: {img['url']}")
    print(f"Alt text: {img['alt']}")
    print(f"Dimensions: {img['width']}x{img['height']}")
    print("---")
4. Handling Lazy-Loaded Images
Many modern websites lazy-load images: the real URL sits in a data-* attribute while src holds a small placeholder until JavaScript swaps it in. Here's how to handle the common patterns:
def extract_lazy_loaded_images(url):
    """Extract images including lazy-loaded ones"""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        image_urls = set()  # Use set to avoid duplicates
        # Common lazy loading attributes
        lazy_attrs = ['data-src', 'data-lazy', 'data-original', 'data-url']
        for img in soup.find_all('img'):
            # Check standard src first
            src = img.get('src')
            if src and not src.startswith('data:image'):
                image_urls.add(urljoin(url, src))
            # Check lazy loading attributes
            for attr in lazy_attrs:
                lazy_src = img.get(attr)
                if lazy_src and not lazy_src.startswith('data:image'):
                    image_urls.add(urljoin(url, lazy_src))
        # Also check for images in srcset attributes
        for img in soup.find_all('img'):
            srcset = img.get('srcset')
            if srcset:
                # Parse srcset (format: "url1 1x, url2 2x" or "url1 400w, url2 800w")
                for src_item in srcset.split(','):
                    parts = src_item.strip().split()
                    if parts:  # guard against empty entries from trailing commas
                        image_urls.add(urljoin(url, parts[0]))
        return list(image_urls)
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []
# Usage
lazy_images = extract_lazy_loaded_images('https://example.com')
print(f"Found {len(lazy_images)} images (including lazy-loaded)")
5. Extracting Background Images from CSS
Some images are set as CSS background images:
import re
def extract_background_images(url):
    """Extract background images from CSS"""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        background_images = []
        # Extract from inline styles
        elements_with_style = soup.find_all(attrs={'style': True})
        for element in elements_with_style:
            style = element.get('style', '')
            # Find background / background-image URLs (the capture excludes quotes and the closing paren)
            bg_urls = re.findall(r'background(?:-image)?\s*:\s*url\(\s*["\']?([^"\')]+?)["\']?\s*\)', style)
            for bg_url in bg_urls:
                if not bg_url.startswith(('http://', 'https://')):
                    bg_url = urljoin(url, bg_url)
                background_images.append(bg_url)
        return background_images
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []
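The function above only inspects inline style attributes. Background images also appear inside <style> blocks and external stylesheets; the sketch below applies the same kind of url(...) regex to both (note that it will also pick up non-image resources such as fonts):
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
CSS_URL_RE = re.compile(r'url\(\s*["\']?([^"\')]+?)["\']?\s*\)')
def extract_css_image_urls(url, headers=None, timeout=10):
    """Extract url(...) references from <style> blocks and linked stylesheets."""
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    found = set()
    # Inline <style> blocks
    for style_tag in soup.find_all('style'):
        for match in CSS_URL_RE.findall(style_tag.get_text()):
            found.add(urljoin(url, match))
    # External stylesheets referenced by <link rel="stylesheet">
    for link in soup.select('link[rel~="stylesheet"]'):
        href = link.get('href')
        if not href:
            continue
        css_url = urljoin(url, href)
        try:
            css_text = requests.get(css_url, headers=headers, timeout=timeout).text
        except requests.RequestException:
            continue
        for match in CSS_URL_RE.findall(css_text):
            # Relative paths inside a stylesheet resolve against the stylesheet URL
            found.add(urljoin(css_url, match))
    return list(found)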
Common Issues and Solutions
1. Handling Relative URLs
Always use urllib.parse.urljoin() to convert relative URLs to absolute ones:
from urllib.parse import urljoin
# A relative URL taken straight from the page can't be fetched on its own
image_url = '/images/photo.jpg'
# Resolve it against the page URL to get an absolute URL
base_url = 'https://example.com'
image_url = urljoin(base_url, '/images/photo.jpg')
# Result: 'https://example.com/images/photo.jpg'
2. Adding Request Headers
Some websites block requests without proper headers:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
}
response = requests.get(url, headers=headers)
3. Error Handling
Always include proper error handling:
try:
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # Raises an HTTPError for bad responses
except requests.exceptions.RequestException as e:
    print(f"Error: {e}")
    # Handle the failure here, e.g. return an empty list from the calling function
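For transient failures you can also retry automatically. This sketch uses the Retry helper from urllib3 (a dependency of requests); the retry counts and status codes are illustrative:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
session = requests.Session()
# Retry up to 3 times on common transient errors, backing off between attempts
retries = Retry(total=3, backoff_factor=0.5,
                status_forcelist=[429, 500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retries))
session.mount('http://', HTTPAdapter(max_retries=retries))
response = session.get('https://example.com', timeout=10)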
Best Practices
- Respect robots.txt: Check the website's robots.txt file before scraping (see the sketch after this list)
- Add delays: Use time.sleep() between requests to avoid overwhelming servers
- Handle timeouts: Set reasonable timeout values for requests
- Use sessions: For multiple requests to the same domain, use requests.Session()
- Validate URLs: Check if extracted URLs are valid before using them
- Consider legal implications: Ensure you have permission to scrape and use the images
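Here is a rough sketch that ties the first few points together: it checks robots.txt with the standard library's urllib.robotparser, reuses a single requests.Session, and waits between requests. The user agent string, paths, and delay are placeholders to adapt:
import time
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser
import requests
USER_AGENT = 'MyImageScraper/1.0'  # placeholder; identify your own scraper
BASE_URL = 'https://example.com'
# Check robots.txt before fetching anything
robots = RobotFileParser()
robots.set_url(urljoin(BASE_URL, '/robots.txt'))
robots.read()
session = requests.Session()
session.headers.update({'User-Agent': USER_AGENT})
for path in ['/', '/gallery']:  # hypothetical pages to scrape
    page_url = urljoin(BASE_URL, path)
    if not robots.can_fetch(USER_AGENT, page_url):
        print(f"Skipping {page_url} (disallowed by robots.txt)")
        continue
    response = session.get(page_url, timeout=10)
    # ... extract image URLs from response.text as shown earlier ...
    time.sleep(1)  # polite delay between requests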
Example: Complete Image Scraper Script
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os
import time
def download_images(url, download_dir='images', delay=1):
    """
    Complete example: Extract and download images from a webpage
    """
    # Create download directory
    os.makedirs(download_dir, exist_ok=True)
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        image_urls = []
        # Extract image URLs
        for img in soup.find_all('img'):
            src = img.get('src') or img.get('data-src')
            if src and not src.startswith('data:'):
                full_url = urljoin(url, src)
                image_urls.append(full_url)
        print(f"Found {len(image_urls)} images")
        # Download images
        for i, img_url in enumerate(image_urls):
            try:
                img_response = requests.get(img_url, headers=headers, timeout=10)
                img_response.raise_for_status()
                # Get filename from URL
                filename = os.path.basename(urlparse(img_url).path)
                if not filename or '.' not in filename:
                    filename = f"image_{i+1}.jpg"
                filepath = os.path.join(download_dir, filename)
                with open(filepath, 'wb') as f:
                    f.write(img_response.content)
                print(f"Downloaded: {filename}")
                time.sleep(delay)  # Be respectful
            except Exception as e:
                print(f"Failed to download {img_url}: {e}")
    except requests.RequestException as e:
        print(f"Error fetching webpage: {e}")
# Usage
if __name__ == "__main__":
    url = "https://example.com"
    download_images(url)
This guide covers the essential techniques for extracting image sources from webpages with Beautiful Soup, including modern patterns such as lazy loading, srcset attributes, and CSS background images.