Selenium is powerful for extracting images from websites, especially those that load content dynamically with JavaScript. This guide shows you how to extract image URLs and download images using Python and JavaScript.
Python Implementation
Method 1: Using Selenium Only
The simplest approach uses Selenium's built-in element finding methods:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from urllib.parse import urljoin

# Configure Chrome options (headless mode for production)
chrome_options = Options()
chrome_options.add_argument("--headless")  # Remove for debugging

# Initialize the driver
driver = webdriver.Chrome(options=chrome_options)

try:
    # Navigate to the target website
    driver.get("https://example.com")

    # Set an implicit wait so element lookups retry while content loads
    driver.implicitly_wait(10)

    # Find all image elements
    img_elements = driver.find_elements(By.TAG_NAME, "img")

    # Extract image URLs
    img_urls = []
    for img in img_elements:
        src = img.get_attribute("src")
        if src:
            # Convert relative URLs to absolute URLs
            absolute_url = urljoin(driver.current_url, src)
            img_urls.append(absolute_url)
            print(f"Found image: {absolute_url}")
finally:
    driver.quit()
Method 2: Using Selenium with BeautifulSoup
For more complex HTML parsing, combine Selenium with BeautifulSoup:
from selenium import webdriver
from bs4 import BeautifulSoup
from urllib.parse import urljoin

driver = webdriver.Chrome()

try:
    driver.get("https://example.com")

    # Get page source after JavaScript execution
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Find all img tags
    img_tags = soup.find_all('img')

    # Extract image URLs with additional attributes
    for img in img_tags:
        src = img.get('src')
        alt_text = img.get('alt', 'No alt text')
        if src:
            absolute_url = urljoin(driver.current_url, src)
            print(f"Image: {absolute_url} | Alt: {alt_text}")
finally:
    driver.quit()
Method 3: Complete Image Download Example
Here's a comprehensive example that downloads images to your local machine:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import requests
from urllib.parse import urljoin, urlparse
import os
import time

def download_images_from_website(url, download_folder="images"):
    # Create download folder
    os.makedirs(download_folder, exist_ok=True)

    # Configure Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")

    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get(url)
        time.sleep(3)  # Wait for dynamic content

        # Find all image elements
        img_elements = driver.find_elements(By.TAG_NAME, "img")
        print(f"Found {len(img_elements)} images")

        for i, img in enumerate(img_elements):
            try:
                src = img.get_attribute("src")
                if not src:
                    continue

                # Convert to absolute URL
                img_url = urljoin(driver.current_url, src)

                # Skip inline data: URIs, which requests cannot fetch
                if not img_url.startswith(("http://", "https://")):
                    continue

                # Download the image
                response = requests.get(img_url, timeout=10)
                response.raise_for_status()

                # Generate filename
                parsed_url = urlparse(img_url)
                filename = os.path.basename(parsed_url.path)
                if not filename or '.' not in filename:
                    filename = f"image_{i}.jpg"

                # Save the image
                filepath = os.path.join(download_folder, filename)
                with open(filepath, 'wb') as f:
                    f.write(response.content)

                print(f"Downloaded: {filename}")
                time.sleep(1)  # Be respectful to the server

            except Exception as e:
                print(f"Error downloading image {i}: {e}")
    finally:
        driver.quit()

# Usage
download_images_from_website("https://example.com")
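Responsive pages often list higher-resolution variants in the srcset attribute rather than src. Here is a hedged sketch for picking the widest srcset candidate; pick_largest_from_srcset is an illustrative helper, not a Selenium API, and it continues from the driver set up above:
# Sketch: prefer the largest srcset candidate when present.
# pick_largest_from_srcset is an illustrative helper, not part of Selenium.
def pick_largest_from_srcset(srcset):
    best_url, best_width = None, -1
    for candidate in srcset.split(","):
        parts = candidate.strip().split()
        if not parts:
            continue
        url = parts[0]
        # Descriptors look like "480w" or "2x"; treat missing widths as 0
        width = 0
        if len(parts) > 1 and parts[1].endswith("w"):
            try:
                width = int(parts[1][:-1])
            except ValueError:
                width = 0
        if width > best_width:
            best_url, best_width = url, width
    return best_url

for img in driver.find_elements(By.TAG_NAME, "img"):
    srcset = img.get_attribute("srcset")
    src = pick_largest_from_srcset(srcset) if srcset else img.get_attribute("src")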
JavaScript Implementation
Basic Image Extraction
const {Builder, By, until} = require('selenium-webdriver');
const chrome = require('selenium-webdriver/chrome');

async function extractImages(url) {
    // Configure Chrome options
    let options = new chrome.Options();
    options.addArguments('--headless'); // Remove for debugging

    let driver = await new Builder()
        .forBrowser('chrome')
        .setChromeOptions(options)
        .build();

    try {
        await driver.get(url);

        // Wait until at least one image is present
        await driver.wait(until.elementsLocated(By.css('img')), 10000);

        // Find all image elements
        let imgElements = await driver.findElements(By.css('img'));

        let imageUrls = [];
        for (let img of imgElements) {
            let src = await img.getAttribute('src');
            if (src) {
                // Convert relative URLs to absolute
                let absoluteUrl = new URL(src, await driver.getCurrentUrl()).href;
                imageUrls.push(absoluteUrl);
                console.log(`Found image: ${absoluteUrl}`);
            }
        }

        return imageUrls;
    } finally {
        await driver.quit();
    }
}

// Usage
extractImages('https://example.com')
    .then(urls => console.log(`Extracted ${urls.length} images`))
    .catch(console.error);
Advanced Image Extraction with Metadata
const {Builder, By} = require('selenium-webdriver');
const fs = require('fs');

async function extractImagesWithMetadata(url) {
    let driver = await new Builder().forBrowser('chrome').build();

    try {
        await driver.get(url);

        // Wait for dynamic content
        await driver.sleep(3000);

        let imgElements = await driver.findElements(By.css('img'));
        let imageData = [];

        for (let img of imgElements) {
            try {
                let src = await img.getAttribute('src');
                let alt = await img.getAttribute('alt') || 'No alt text';
                let width = await img.getAttribute('width') || 'auto';
                let height = await img.getAttribute('height') || 'auto';

                if (src) {
                    let absoluteUrl = new URL(src, await driver.getCurrentUrl()).href;
                    imageData.push({
                        url: absoluteUrl,
                        alt: alt,
                        width: width,
                        height: height
                    });
                }
            } catch (error) {
                console.log(`Error processing image: ${error.message}`);
            }
        }

        // Save metadata to JSON
        fs.writeFileSync('image_metadata.json', JSON.stringify(imageData, null, 2));
        console.log(`Extracted metadata for ${imageData.length} images`);

        return imageData;
    } finally {
        await driver.quit();
    }
}

// Usage
extractImagesWithMetadata('https://example.com');
Handling Special Cases
1. Lazy-Loaded Images
Many websites use lazy loading. Handle this by scrolling:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome()
driver.get("https://example.com")

# Scroll to load lazy images
last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# Now extract images
img_elements = driver.find_elements(By.TAG_NAME, "img")
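Some lazy-loading libraries also keep the real URL in a data-src attribute until the image scrolls into view. The short sketch below falls back to data-src when present; the attribute name varies by site, so treat data-src as an assumption to verify in the page source:
# Sketch: prefer data-src when the lazy loader has not yet swapped it into src.
# The attribute name "data-src" is site-specific; inspect the page to confirm.
for img in driver.find_elements(By.TAG_NAME, "img"):
    src = img.get_attribute("data-src") or img.get_attribute("src")
    if src and not src.startswith("data:"):
        print(f"Lazy image: {src}")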
2. Background Images in CSS
Extract background images from CSS:
import re

# Find elements with background images
elements_with_bg = driver.find_elements(By.XPATH, "//*[contains(@style, 'background-image')]")

for element in elements_with_bg:
    style = element.get_attribute("style")
    # Extract URL from background-image: url('...')
    urls = re.findall(r'background-image:\s*url\(["\']?([^"\']+)["\']?\)', style)
    for url in urls:
        print(f"Background image: {url}")
3. Images in Iframes
Handle images inside iframes:
# Switch to iframe
iframe = driver.find_element(By.TAG_NAME, "iframe")
driver.switch_to.frame(iframe)
# Extract images from iframe
img_elements = driver.find_elements(By.TAG_NAME, "img")
# Switch back to main content
driver.switch_to.default_content()
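Pages often contain more than one iframe, and the images may live in any of them. Here is a hedged sketch that walks every top-level iframe in turn; nested frames would need recursion:
from selenium.webdriver.common.by import By

# Sketch: collect image URLs from every top-level iframe.
# Switching by index avoids stale element references across frames.
all_iframe_images = []
iframes = driver.find_elements(By.TAG_NAME, "iframe")
for index in range(len(iframes)):
    driver.switch_to.frame(index)
    for img in driver.find_elements(By.TAG_NAME, "img"):
        src = img.get_attribute("src")
        if src:
            all_iframe_images.append(src)
    driver.switch_to.default_content()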
Best Practices and Considerations
1. Performance Optimization
- Use headless mode for production
- Set appropriate timeouts (a sketch combining these settings follows this list)
- Implement proper error handling
- Add delays between requests
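A minimal sketch combining these settings with headless Chrome; the timeout values are illustrative rather than prescriptive:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Sketch: headless Chrome with explicit timeouts. Values are illustrative.
options = Options()
options.add_argument("--headless")

driver = webdriver.Chrome(options=options)
driver.set_page_load_timeout(30)   # fail fast if the page hangs
driver.implicitly_wait(10)         # retry element lookups for up to 10s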
2. Ethical Scraping
import time
import random
import requests
from urllib.parse import urljoin

def respectful_delay():
    """Add a random delay between requests"""
    time.sleep(random.uniform(1, 3))

# Check robots.txt
def check_robots_txt(base_url):
    try:
        robots_url = urljoin(base_url, '/robots.txt')
        response = requests.get(robots_url, timeout=10)
        return response.text
    except requests.RequestException:
        return None
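If you want an actual allow/deny decision rather than the raw file, Python's standard library includes urllib.robotparser; a brief sketch, with example.com standing in for your target site:
from urllib.robotparser import RobotFileParser
from urllib.parse import urljoin

# Sketch: use the standard library to check whether a URL may be fetched.
parser = RobotFileParser(urljoin("https://example.com", "/robots.txt"))
parser.read()

if parser.can_fetch("*", "https://example.com/gallery"):
    print("Allowed by robots.txt")
else:
    print("Disallowed by robots.txt")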
3. Error Handling
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

try:
    driver.get(url)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, "img"))
    )
except TimeoutException:
    print("Page took too long to load")
except NoSuchElementException:
    print("No images found on the page")
4. Memory Management
# Always close the driver
try:
    # Your scraping code here
    pass
finally:
    driver.quit()  # Ensures browser closes even if errors occur
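With Selenium 4's Python bindings, the driver can also be used as a context manager, which calls quit() automatically when the block exits:
from selenium import webdriver

# Sketch: Selenium 4 drivers support the context-manager protocol,
# so quit() runs automatically even if an exception is raised.
with webdriver.Chrome() as driver:
    driver.get("https://example.com")
    # ... scraping code ...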
Installation Requirements
Python Dependencies
pip install selenium beautifulsoup4 requests pillow
JavaScript Dependencies
npm install selenium-webdriver
WebDriver Installation
Download the appropriate WebDriver:
- ChromeDriver for Chrome
- GeckoDriver for Firefox
Or use WebDriver Manager for Python (recent Selenium releases, 4.6 and later, also bundle a built-in Selenium Manager that resolves drivers automatically):
pip install webdriver-manager
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)
Remember to always respect website terms of service, implement rate limiting, and consider the server load when scraping images. For large-scale operations, consider using dedicated web scraping APIs like WebScraping.AI.