Beautiful Soup makes it easy to extract JavaScript and CSS resources from HTML documents by targeting specific tags. This guide covers extracting both inline and external scripts and stylesheets.
Installation
Install Beautiful Soup and required dependencies:
pip install beautifulsoup4 requests lxml
Extracting JavaScript
Basic Script Extraction
Extract all script tags from a webpage:
from bs4 import BeautifulSoup
import requests

# Fetch the page and build a parse tree.
url = 'https://example.com'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'lxml')

# Every <script> element, in document order.
scripts = soup.find_all('script')
for position, tag in enumerate(scripts, start=1):
    if tag.get('src'):
        # Script pulled in from an external file.
        print(f"External script {position}: {tag['src']}")
    elif tag.string:
        # JavaScript embedded directly in the document.
        print(f"Inline script {position}:")
        print(tag.string.strip()[:100] + "...")  # First 100 chars
Advanced Script Filtering
Filter scripts by type or other attributes:
# Only extract JavaScript files (not JSON-LD or other types)
js_scripts = soup.find_all('script', type='text/javascript')

# Or scripts without a type attribute (defaults to JavaScript)
default_scripts = soup.find_all('script', type=None)

# Filter by source patterns
cdn_scripts = soup.find_all('script', src=lambda x: x and 'cdn' in x)

# Dump the interesting attributes of every script tag.
for tag in scripts:
    attributes = {key: tag.get(key)
                  for key in ('src', 'type', 'async', 'defer', 'integrity')}
    print(f"Script attributes: {attributes}")
Extracting CSS Stylesheets
Basic Stylesheet Extraction
Extract external CSS files:
# Every <link rel="stylesheet"> element on the page.
stylesheets = soup.find_all('link', rel='stylesheet')
for number, sheet in enumerate(stylesheets, start=1):
    print(f"Stylesheet {number}: {sheet.get('href')}")
    print(f" Media: {sheet.get('media', 'all')}")
    print(f" Type: {sheet.get('type', 'text/css')}")
Inline Styles
Extract inline CSS from <style> tags:
# Gather every embedded <style> element.
style_tags = soup.find_all('style')
for idx, tag in enumerate(style_tags, start=1):
    css_text = tag.string
    if css_text:
        print(f"Inline CSS {idx}:")
        print(css_text.strip()[:200] + "...")  # First 200 chars
CSS Import Statements
Extract CSS imports from within style tags:
import re

# Matches @import url("x.css"), @import "x.css", @import x.css, etc.
IMPORT_RE = re.compile(r'@import\s+(?:url\()?["\']?([^"\')\s]+)["\']?\)?')

for style in style_tags:
    css = style.string
    if not css:
        continue
    for import_url in IMPORT_RE.findall(css):
        print(f"CSS Import: {import_url}")
Complete Example
Comprehensive script and stylesheet extraction:
from bs4 import BeautifulSoup
import requests
import re  # needed for the @import scan below; the original snippet omitted it
from urllib.parse import urljoin, urlparse


def extract_resources(url):
    """Extract all JavaScript and CSS resources from a webpage.

    Parameters:
        url: Page to fetch and scan.

    Returns:
        dict with two keys:
          'scripts':     {'inline': [source text], 'external': [attr dicts]}
          'stylesheets': {'inline': [css text], 'external': [attr dicts],
                          'imports': [absolute @import URLs]}
        All external URLs are resolved to absolute form via urljoin.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')

    resources = {
        'scripts': {'inline': [], 'external': []},
        'stylesheets': {'inline': [], 'external': [], 'imports': []}
    }

    # --- JavaScript ---
    for script in soup.find_all('script'):
        if script.get('src'):
            # Convert relative URLs to absolute so callers can fetch them.
            src = urljoin(url, script['src'])
            resources['scripts']['external'].append({
                'url': src,
                'type': script.get('type'),
                'async': script.has_attr('async'),
                'defer': script.has_attr('defer')
            })
        elif script.string and script.string.strip():
            resources['scripts']['inline'].append(script.string.strip())

    # --- External stylesheets ---
    for link in soup.find_all('link', rel='stylesheet'):
        href = urljoin(url, link.get('href', ''))
        resources['stylesheets']['external'].append({
            'url': href,
            'media': link.get('media', 'all'),
            'type': link.get('type', 'text/css')
        })

    # --- Inline styles and their @import statements ---
    for style in soup.find_all('style'):
        if style.string and style.string.strip():
            content = style.string.strip()
            resources['stylesheets']['inline'].append(content)
            # @import may appear as @import url("x") or @import "x".
            imports = re.findall(
                r'@import\s+(?:url\()?["\']?([^"\')\s]+)["\']?\)?', content)
            for import_url in imports:
                resources['stylesheets']['imports'].append(urljoin(url, import_url))

    return resources
# Usage
url = 'https://example.com'
resources = extract_resources(url)

# One summary line per resource bucket.
for kind, bucket, label in (
    ('scripts', 'external', 'external scripts'),
    ('scripts', 'inline', 'inline scripts'),
    ('stylesheets', 'external', 'external stylesheets'),
    ('stylesheets', 'inline', 'inline styles'),
):
    print(f"Found {len(resources[kind][bucket])} {label}")
Advanced Techniques
Handling Dynamic Content
For JavaScript-heavy sites, combine with Selenium:
from selenium import webdriver
from bs4 import BeautifulSoup

# Let a real browser execute the page's JavaScript before parsing.
browser = webdriver.Chrome()
browser.get('https://example.com')
rendered_html = browser.page_source
browser.quit()

# Hand the rendered markup to Beautiful Soup as usual.
soup = BeautifulSoup(rendered_html, 'lxml')
scripts = soup.find_all('script')
Resource Validation
Check if external resources are accessible:
def validate_resource(url):
    """Check if a resource URL is accessible.

    Sends a lightweight HEAD request and returns True only for a
    200 response. Network failures and timeouts count as inaccessible.
    """
    try:
        response = requests.head(url, timeout=5)
    except requests.RequestException:
        # Catch only network-level errors; the original bare `except:`
        # would also swallow KeyboardInterrupt and programming errors.
        return False
    return response.status_code == 200
# Validate external scripts
for entry in resources['scripts']['external']:
    target = entry['url']
    reachable = validate_resource(target)
    mark = "✓" if reachable else "✗"
    suffix = "" if reachable else " (not accessible)"
    print(f"{mark} {target}{suffix}")
Key Points
- Use `soup.find_all('script')` for JavaScript and `soup.find_all('link', rel='stylesheet')` for CSS
- Check `script.get('src')` for external files and `script.string` for inline content
- Convert relative URLs to absolute using `urllib.parse.urljoin()`
- Handle both `<style>` tags and CSS `@import` statements for complete stylesheet extraction
- Consider using Selenium for JavaScript-rendered content
- Always respect robots.txt and website terms of service when scraping