Beautiful Soup is a Python library that makes web scraping straightforward by providing intuitive methods to parse and navigate HTML/XML documents. Extracting image sources is one of the most common web scraping tasks.
Quick Answer
To extract all image sources from a webpage using Beautiful Soup:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
url = 'https://example.com'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# Extract all image URLs
image_urls = []
for img in soup.find_all('img'):
    src = img.get('src')
    if src:
        image_urls.append(urljoin(url, src))
print(image_urls)
Step-by-Step Guide
1. Installation
Install the required packages:
pip install beautifulsoup4 requests lxml
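The lxml package is optional: the examples in this guide use Python's built-in html.parser, but if lxml is installed you can pass it to BeautifulSoup for generally faster parsing. A minimal sketch:
from bs4 import BeautifulSoup

html = "<html><body><img src='/images/logo.png'></body></html>"

# 'html.parser' ships with Python; 'lxml' requires the package installed above
soup = BeautifulSoup(html, 'lxml')
print(soup.find('img')['src'])  # /images/logo.png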
2. Basic Image Extraction
Here's a comprehensive function to extract image URLs:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
def extract_image_urls(url, include_data_urls=False):
"""
Extract all image URLs from a webpage
Args:
url (str): The webpage URL to scrape
include_data_urls (bool): Whether to include data: URLs
Returns:
list: List of image URLs
"""
try:
# Add headers to avoid being blocked
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
image_urls = []
# Find all img tags
for img in soup.find_all('img'):
# Check both src and data-src attributes
src = img.get('src') or img.get('data-src')
if src:
# Skip data URLs unless requested
if src.startswith('data:') and not include_data_urls:
continue
# Convert relative URLs to absolute
if not src.startswith(('http://', 'https://', 'data:')):
src = urljoin(url, src)
image_urls.append(src)
return image_urls
except requests.RequestException as e:
print(f"Error fetching {url}: {e}")
return []
# Usage
url = 'https://example.com'
images = extract_image_urls(url)
print(f"Found {len(images)} images:")
for img_url in images:
    print(img_url)
3. Advanced Image Extraction with Filtering
Extract images with additional metadata and filtering options:
def extract_images_with_details(url, min_size=None, file_types=None):
"""
Extract images with additional details and filtering
Args:
url (str): The webpage URL
min_size (tuple): Minimum (width, height) in pixels
file_types (list): List of allowed file extensions
Returns:
list: List of dictionaries with image details
"""
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
images = []
for img in soup.find_all('img'):
src = img.get('src') or img.get('data-src')
if not src:
continue
# Convert relative URLs
if not src.startswith(('http://', 'https://', 'data:')):
src = urljoin(url, src)
# Filter by file type
if file_types:
file_ext = src.lower().split('.')[-1].split('?')[0]
if file_ext not in file_types:
continue
# Get image details
image_info = {
'url': src,
'alt': img.get('alt', ''),
'title': img.get('title', ''),
'width': img.get('width'),
'height': img.get('height'),
'class': img.get('class', []),
'id': img.get('id', '')
}
# Filter by minimum size
if min_size and image_info['width'] and image_info['height']:
try:
width = int(image_info['width'])
height = int(image_info['height'])
if width < min_size[0] or height < min_size[1]:
continue
except ValueError:
pass
images.append(image_info)
return images
except requests.RequestException as e:
print(f"Error fetching {url}: {e}")
return []
# Usage examples
url = 'https://example.com'
# Get all images
all_images = extract_images_with_details(url)
# Get only large JPG/PNG images
large_images = extract_images_with_details(
    url,
    min_size=(200, 200),
    file_types=['jpg', 'jpeg', 'png']
)

print(f"Found {len(large_images)} large images")
for img in large_images:
    print(f"URL: {img['url']}")
    print(f"Alt text: {img['alt']}")
    print(f"Dimensions: {img['width']}x{img['height']}")
    print("---")
4. Handling Lazy-Loaded Images
Many modern websites use lazy loading. Here's how to handle common patterns:
def extract_lazy_loaded_images(url):
"""Extract images including lazy-loaded ones"""
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
image_urls = set() # Use set to avoid duplicates
# Common lazy loading attributes
lazy_attrs = ['data-src', 'data-lazy', 'data-original', 'data-url']
for img in soup.find_all('img'):
# Check standard src first
src = img.get('src')
if src and not src.startswith('data:image'):
image_urls.add(urljoin(url, src))
# Check lazy loading attributes
for attr in lazy_attrs:
lazy_src = img.get(attr)
if lazy_src and not lazy_src.startswith('data:image'):
image_urls.add(urljoin(url, lazy_src))
# Also check for images in srcset attributes
for img in soup.find_all('img'):
srcset = img.get('srcset')
if srcset:
# Parse srcset (format: "url1 1x, url2 2x" or "url1 400w, url2 800w")
for src_item in srcset.split(','):
src_url = src_item.strip().split()[0]
image_urls.add(urljoin(url, src_url))
return list(image_urls)
except requests.RequestException as e:
print(f"Error fetching {url}: {e}")
return []
# Usage
lazy_images = extract_lazy_loaded_images('https://example.com')
print(f"Found {len(lazy_images)} images (including lazy-loaded)")
5. Extracting Background Images from CSS
Some images are set as CSS background images:
import re
def extract_background_images(url):
"""Extract background images from CSS"""
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
background_images = []
# Extract from inline styles
elements_with_style = soup.find_all(attrs={'style': True})
for element in elements_with_style:
style = element.get('style', '')
# Find background-image URLs
bg_urls = re.findall(r'background-image:\s*url\(["\']?([^"\']+)["\']?\)', style)
for bg_url in bg_urls:
if not bg_url.startswith(('http://', 'https://')):
bg_url = urljoin(url, bg_url)
background_images.append(bg_url)
return background_images
except requests.RequestException as e:
print(f"Error fetching {url}: {e}")
return []
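The function above only looks at inline style attributes. As a sketch of one possible extension (the helper name extract_style_block_images is illustrative), you can run the same regex over <style> tags as well:
# Assumes the imports from the sections above (re, requests, BeautifulSoup, urljoin)
def extract_style_block_images(url):
    """Illustrative helper: pull background-image URLs out of <style> tags."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    pattern = r'background-image:\s*url\(["\']?([^"\']+)["\']?\)'

    urls = []
    for style_tag in soup.find_all('style'):
        css_text = style_tag.get_text()
        for bg_url in re.findall(pattern, css_text):
            urls.append(urljoin(url, bg_url))
    return urls

# Usage: combine inline-style and <style>-block results
page = 'https://example.com'
all_backgrounds = extract_background_images(page) + extract_style_block_images(page)
print(f"Found {len(all_backgrounds)} background images")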
Common Issues and Solutions
1. Handling Relative URLs
Always use urllib.parse.urljoin() to convert relative URLs to absolute ones:
from urllib.parse import urljoin
# Wrong
image_url = '/images/photo.jpg'
# Correct
base_url = 'https://example.com'
image_url = urljoin(base_url, '/images/photo.jpg')
# Result: 'https://example.com/images/photo.jpg'
2. Adding Request Headers
Some websites block requests without proper headers:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
}
response = requests.get(url, headers=headers)
3. Error Handling
Always include proper error handling:
try:
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # Raises an HTTPError for bad responses
except requests.exceptions.RequestException as e:
    print(f"Error: {e}")
    return []  # This snippet runs inside a scraping function, so it returns an empty result
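Each function in this guide repeats the same fetch-and-parse boilerplate. One way to keep that in a single place is a small helper; this is just a sketch, and the name fetch_soup is illustrative:
import requests
from bs4 import BeautifulSoup

DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

def fetch_soup(url, timeout=10):
    """Fetch a page and return a BeautifulSoup object, or None on any request error."""
    try:
        response = requests.get(url, headers=DEFAULT_HEADERS, timeout=timeout)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

# Usage: callers check for None instead of repeating try/except everywhere
soup = fetch_soup('https://example.com')
if soup is not None:
    print(f"Found {len(soup.find_all('img'))} img tags")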
Best Practices
- Respect robots.txt: Check the website's robots.txt file before scraping (see the sketch after this list, which also covers delays and sessions)
- Add delays: Use time.sleep() between requests to avoid overwhelming servers
- Handle timeouts: Set reasonable timeout values for requests
- Use sessions: For multiple requests to the same domain, use requests.Session()
- Validate URLs: Check if extracted URLs are valid before using them
- Consider legal implications: Ensure you have permission to scrape and use the images
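As a minimal sketch of the robots.txt, delay, and session points above (the page URLs are placeholders):
import time
import requests
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

def is_allowed(url, user_agent='*'):
    """Check robots.txt before scraping; treat an unreadable robots.txt as permissive."""
    parsed = urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    parser = RobotFileParser()
    try:
        parser.set_url(robots_url)
        parser.read()
        return parser.can_fetch(user_agent, url)
    except Exception:
        return True

# Reuse one session for connection pooling and shared headers
session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
})

for page_url in ['https://example.com/page1', 'https://example.com/page2']:
    if not is_allowed(page_url):
        print(f"Skipping {page_url} (disallowed by robots.txt)")
        continue
    response = session.get(page_url, timeout=10)
    print(page_url, response.status_code)
    time.sleep(1)  # Pause between requests to avoid overwhelming the server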
Example: Complete Image Scraper Script
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os
import time
def download_images(url, download_dir='images', delay=1):
"""
Complete example: Extract and download images from a webpage
"""
# Create download directory
os.makedirs(download_dir, exist_ok=True)
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
image_urls = []
# Extract image URLs
for img in soup.find_all('img'):
src = img.get('src') or img.get('data-src')
if src and not src.startswith('data:'):
full_url = urljoin(url, src)
image_urls.append(full_url)
print(f"Found {len(image_urls)} images")
# Download images
for i, img_url in enumerate(image_urls):
try:
img_response = requests.get(img_url, headers=headers, timeout=10)
img_response.raise_for_status()
# Get filename from URL
filename = os.path.basename(urlparse(img_url).path)
if not filename or '.' not in filename:
filename = f"image_{i+1}.jpg"
filepath = os.path.join(download_dir, filename)
with open(filepath, 'wb') as f:
f.write(img_response.content)
print(f"Downloaded: {filename}")
time.sleep(delay) # Be respectful
except Exception as e:
print(f"Failed to download {img_url}: {e}")
except requests.RequestException as e:
print(f"Error fetching webpage: {e}")
# Usage
if __name__ == "__main__":
url = "https://example.com"
download_images(url)
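For large image files, one variation (a sketch, not part of the script above) is to stream the download so the whole file is never held in memory:
import requests

def download_image_streamed(img_url, filepath, chunk_size=8192):
    """Stream an image to disk in chunks instead of reading it all at once."""
    with requests.get(img_url, stream=True, timeout=10) as response:
        response.raise_for_status()
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                f.write(chunk)

# Usage (the URL and filename are placeholders)
download_image_streamed('https://example.com/images/photo.jpg', 'photo.jpg')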
This comprehensive guide covers all the essential techniques for extracting image sources from webpages using Beautiful Soup, including handling modern web development patterns like lazy loading and CSS background images.