How do I use Selenium to extract images from a website?

Selenium is powerful for extracting images from websites, especially those that load content dynamically with JavaScript. This guide shows you how to extract image URLs and download images using Python and JavaScript.

Python Implementation

Method 1: Using Selenium Only

The simplest approach uses Selenium's built-in element finding methods:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from urllib.parse import urljoin

# Configure Chrome options (headless mode for production)
chrome_options = Options()
chrome_options.add_argument("--headless")  # Remove for debugging

# Initialize the driver
driver = webdriver.Chrome(options=chrome_options)

try:
    # Navigate to the target website
    driver.get("https://example.com")

    # Implicit wait applies to every find_element(s) call below
    driver.implicitly_wait(10)

    # Find all image elements on the page
    img_elements = driver.find_elements(By.TAG_NAME, "img")

    # Extract image URLs
    img_urls = []
    for img in img_elements:
        src = img.get_attribute("src")
        if src:
            # Convert relative URLs to absolute URLs
            absolute_url = urljoin(driver.current_url, src)
            img_urls.append(absolute_url)
            print(f"Found image: {absolute_url}")

finally:
    # Always release the browser, even if an error occurred above
    driver.quit()

Method 2: Using Selenium with BeautifulSoup

For more complex HTML parsing, combine Selenium with BeautifulSoup:

from selenium import webdriver
from bs4 import BeautifulSoup
from urllib.parse import urljoin

driver = webdriver.Chrome()

try:
    driver.get("https://example.com")

    # Get page source after JavaScript execution
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Find all img tags
    img_tags = soup.find_all('img')

    # Extract image URLs with additional attributes
    for img in img_tags:
        src = img.get('src')
        alt_text = img.get('alt', 'No alt text')

        if src:
            # Resolve relative URLs against the page the driver is on
            absolute_url = urljoin(driver.current_url, src)
            print(f"Image: {absolute_url} | Alt: {alt_text}")

finally:
    # Always release the browser process
    driver.quit()

Method 3: Complete Image Download Example

Here's a comprehensive example that downloads images to your local machine:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import requests
from urllib.parse import urljoin, urlparse
import os
import time

def download_images_from_website(url, download_folder="images"):
    """Download every <img> found on *url* into *download_folder*.

    Args:
        url: Page to scrape.
        download_folder: Local directory for saved files (created if missing).
    """
    # Create download folder
    os.makedirs(download_folder, exist_ok=True)

    # Configure Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")

    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get(url)
        time.sleep(3)  # Wait for dynamic content

        # Find all image elements
        img_elements = driver.find_elements(By.TAG_NAME, "img")

        print(f"Found {len(img_elements)} images")

        for i, img in enumerate(img_elements):
            try:
                src = img.get_attribute("src")
                if not src:
                    continue

                # Convert to absolute URL
                img_url = urljoin(driver.current_url, src)

                # Download the image
                response = requests.get(img_url, timeout=10)
                response.raise_for_status()

                # Generate filename from URL path; fall back to index-based name
                parsed_url = urlparse(img_url)
                filename = os.path.basename(parsed_url.path)
                if not filename or '.' not in filename:
                    filename = f"image_{i}.jpg"

                # Save the image
                filepath = os.path.join(download_folder, filename)
                with open(filepath, 'wb') as f:
                    f.write(response.content)

                print(f"Downloaded: {filename}")
                time.sleep(1)  # Be respectful to the server

            except Exception as e:
                print(f"Error downloading image {i}: {e}")

    finally:
        driver.quit()

# Usage
download_images_from_website("https://example.com")

JavaScript Implementation

Basic Image Extraction

const {Builder, By, until} = require('selenium-webdriver');
const chrome = require('selenium-webdriver/chrome');

/**
 * Extract absolute image URLs from every <img> element on the page.
 * @param {string} url - Page to scrape.
 * @returns {Promise<string[]>} Absolute image URLs.
 */
async function extractImages(url) {
    // Configure Chrome options
    let options = new chrome.Options();
    options.addArguments('--headless'); // Remove for debugging

    let driver = await new Builder()
        .forBrowser('chrome')
        .setChromeOptions(options)
        .build();

    try {
        await driver.get(url);

        // Wait until at least one image is present (the original
        // `until.titleIs` was passed uncalled, which is not a valid condition)
        await driver.wait(until.elementLocated(By.css('img')), 10000);

        // Find all image elements
        let imgElements = await driver.findElements(By.css('img'));

        let imageUrls = [];
        for (let img of imgElements) {
            let src = await img.getAttribute('src');
            if (src) {
                // Convert relative URLs to absolute
                let absoluteUrl = new URL(src, await driver.getCurrentUrl()).href;
                imageUrls.push(absoluteUrl);
                console.log(`Found image: ${absoluteUrl}`);
            }
        }

        return imageUrls;

    } finally {
        await driver.quit();
    }
}

// Usage
extractImages('https://example.com')
    .then(urls => console.log(`Extracted ${urls.length} images`))
    .catch(console.error);

Advanced Image Extraction with Metadata

const {Builder, By} = require('selenium-webdriver');
const fs = require('fs');
const https = require('https');
const path = require('path');

async function extractImagesWithMetadata(url, downloadFolder = 'downloads') {
    let driver = await new Builder().forBrowser('chrome').build();

    try {
        await driver.get(url);

        // Wait for dynamic content
        await driver.sleep(3000);

        let imgElements = await driver.findElements(By.css('img'));
        let imageData = [];

        for (let img of imgElements) {
            try {
                let src = await img.getAttribute('src');
                let alt = await img.getAttribute('alt') || 'No alt text';
                let width = await img.getAttribute('width') || 'auto';
                let height = await img.getAttribute('height') || 'auto';

                if (src) {
                    let absoluteUrl = new URL(src, await driver.getCurrentUrl()).href;

                    imageData.push({
                        url: absoluteUrl,
                        alt: alt,
                        width: width,
                        height: height
                    });
                }
            } catch (error) {
                console.log(`Error processing image: ${error.message}`);
            }
        }

        // Save metadata to JSON
        fs.writeFileSync('image_metadata.json', JSON.stringify(imageData, null, 2));
        console.log(`Extracted metadata for ${imageData.length} images`);

        return imageData;

    } finally {
        await driver.quit();
    }
}

// Usage
extractImagesWithMetadata('https://example.com');

Handling Special Cases

1. Lazy-Loaded Images

Many websites use lazy loading. Handle this by scrolling:

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome()
driver.get("https://example.com")

# Keep scrolling to the bottom until the document height stops growing,
# which means all lazy-loaded content has been fetched.
previous_height = -1
current_height = driver.execute_script("return document.body.scrollHeight")

while current_height != previous_height:
    previous_height = current_height
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # give lazy loaders time to fire
    current_height = driver.execute_script("return document.body.scrollHeight")

# Now extract images
img_elements = driver.find_elements(By.TAG_NAME, "img")

2. Background Images in CSS

Extract background images from CSS:

import re

# Compile once, up front — the original re-imported `re` and re-parsed the
# pattern on every loop iteration.
BG_IMAGE_URL = re.compile(r'background-image:\s*url\(["\']?([^"\']+)["\']?\)')

# Find elements with background images
elements_with_bg = driver.find_elements(By.XPATH, "//*[contains(@style, 'background-image')]")

for element in elements_with_bg:
    style = element.get_attribute("style")
    # Extract URL(s) from background-image: url('...')
    for url in BG_IMAGE_URL.findall(style):
        print(f"Background image: {url}")

3. Images in Iframes

Handle images inside iframes:

# Enter the iframe's browsing context before querying its DOM
frame = driver.find_element(By.TAG_NAME, "iframe")
driver.switch_to.frame(frame)

# Collect the images that live inside the iframe
img_elements = driver.find_elements(By.TAG_NAME, "img")

# Return to the top-level document when finished
driver.switch_to.default_content()

Best Practices and Considerations

1. Performance Optimization

  • Use headless mode for production
  • Set appropriate timeouts
  • Implement proper error handling
  • Add delays between requests

2. Ethical Scraping

import time
import random

def respectful_delay():
    """Pause for a random 1-3 second interval between requests."""
    pause = random.uniform(1, 3)
    time.sleep(pause)

# Check robots.txt
def check_robots_txt(base_url):
    """Fetch the site's robots.txt and return its text, or None if unreachable."""
    robots_url = urljoin(base_url, '/robots.txt')
    try:
        # Catch only network/HTTP errors — the original bare `except:` also
        # swallowed KeyboardInterrupt and programming bugs.
        response = requests.get(robots_url, timeout=10)
        return response.text
    except requests.RequestException:
        return None

3. Error Handling

from selenium.common.exceptions import TimeoutException, NoSuchElementException
# These two imports were missing — the snippet used WebDriverWait and EC
# without bringing them into scope.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

try:
    driver.get(url)
    # Explicit wait: block up to 10s until at least one <img> is present
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, "img"))
    )
except TimeoutException:
    print("Page took too long to load")
except NoSuchElementException:
    print("No images found on the page")

4. Memory Management

# Always close the driver to free the browser process and its resources
try:
    # Your scraping code here
    pass
finally:
    driver.quit()  # Ensures browser closes even if errors occur

Installation Requirements

Python Dependencies

pip install selenium beautifulsoup4 requests pillow

JavaScript Dependencies

npm install selenium-webdriver

WebDriver Installation

Download the appropriate WebDriver for your browser: ChromeDriver for Chrome, or GeckoDriver for Firefox.

Or use WebDriver Manager for Python:

pip install webdriver-manager
# `from selenium import webdriver` was missing — the snippet used
# webdriver.Chrome without importing it.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# webdriver-manager downloads a ChromeDriver matching the installed browser
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

Remember to always respect website terms of service, implement rate limiting, and consider the server load when scraping images. For large-scale operations, consider using dedicated web scraping APIs like WebScraping.AI.

Related Questions

Get Started Now

WebScraping.AI provides rotating proxies, Chromium rendering, and a built-in HTML parser for web scraping.