To extract all links from an HTML page using Python's Requests library, you need to combine it with an HTML parser like BeautifulSoup. Requests fetches the webpage, while BeautifulSoup parses the HTML to find and extract link elements.
Installation
First, install the required libraries:
pip install requests beautifulsoup4
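If you want to confirm that both packages are importable, a quick sanity check (nothing more than an import test) is:

import requests
import bs4

print(requests.__version__, bs4.__version__)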
Basic Link Extraction
Here's a simple example to extract all links from a webpage:
import requests
from bs4 import BeautifulSoup

# Fetch the webpage
url = "https://example.com"
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all anchor tags
    links = soup.find_all('a', href=True)

    # Extract and print all href attributes
    for link in links:
        print(link['href'])
else:
    print(f"Failed to fetch the page. Status code: {response.status_code}")
Enhanced Link Extraction with Filtering
A more robust approach that filters out empty links and handles different URL types:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def extract_links(url):
    try:
        # Send GET request with headers to avoid blocking
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Parse HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all links with href attributes
        links = soup.find_all('a', href=True)

        extracted_links = []
        for link in links:
            href = link['href'].strip()

            # Skip empty links and anchors
            if not href or href.startswith('#'):
                continue

            # Convert relative URLs to absolute URLs
            absolute_url = urljoin(url, href)

            # Validate URL format
            parsed = urlparse(absolute_url)
            if parsed.scheme in ['http', 'https']:
                extracted_links.append({
                    'url': absolute_url,
                    'text': link.get_text(strip=True),
                    'title': link.get('title', '')
                })

        return extracted_links

    except requests.RequestException as e:
        print(f"Error fetching the page: {e}")
        return []

# Usage example
url = "https://example.com"
links = extract_links(url)

for link in links:
    print(f"URL: {link['url']}")
    print(f"Text: {link['text']}")
    print(f"Title: {link['title']}")
    print("-" * 50)
Filtering Links by Type
Extract only external or only internal links; a short usage example follows the two functions:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def get_external_links(url):
    """Extract only external links (different domain)"""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    base_domain = urlparse(url).netloc

    external_links = []
    for link in soup.find_all('a', href=True):
        href = urljoin(url, link['href'])
        link_domain = urlparse(href).netloc
        if link_domain and link_domain != base_domain:
            external_links.append(href)

    return external_links

def get_internal_links(url):
    """Extract only internal links (same domain)"""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    base_domain = urlparse(url).netloc

    internal_links = []
    for link in soup.find_all('a', href=True):
        href = urljoin(url, link['href'])
        link_domain = urlparse(href).netloc
        if not link_domain or link_domain == base_domain:
            internal_links.append(href)

    return internal_links
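The two helpers above can be exercised like this; the target URL is just a placeholder:

# Usage sketch: compare internal and external link counts for one page
url = "https://example.com"

external = get_external_links(url)
internal = get_internal_links(url)

print(f"External links: {len(external)}")
print(f"Internal links: {len(internal)}")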
Handling Different Link Types
Extract various types of links from HTML:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def extract_all_link_types(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    links = {
        'anchor_links': [urljoin(url, a['href']) for a in soup.find_all('a', href=True)],
        'image_links': [urljoin(url, img['src']) for img in soup.find_all('img', src=True)],
        # Note: <link> elements include stylesheets as well as icons, canonical URLs, etc.
        'css_links': [urljoin(url, link['href']) for link in soup.find_all('link', href=True)],
        'script_links': [urljoin(url, script['src']) for script in soup.find_all('script', src=True)]
    }

    return links

# Usage
url = "https://example.com"
all_links = extract_all_link_types(url)

for link_type, urls in all_links.items():
    print(f"{link_type.title()}: {len(urls)} found")
    for link_url in urls[:5]:  # Show first 5 of each type
        print(f"  - {link_url}")
Error Handling and Best Practices
A production-ready example with comprehensive error handling:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time

def extract_links_robust(url, max_retries=3, delay=1):
    """
    Extract links with retry logic and comprehensive error handling
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    for attempt in range(max_retries):
        try:
            response = requests.get(
                url,
                headers=headers,
                timeout=10,
                allow_redirects=True
            )
            response.raise_for_status()

            # Parse with explicit encoding handling
            response.encoding = response.apparent_encoding
            soup = BeautifulSoup(response.text, 'html.parser')

            links = []
            for a_tag in soup.find_all('a', href=True):
                href = a_tag['href'].strip()
                if href and not href.startswith(('#', 'javascript:', 'mailto:')):
                    absolute_url = urljoin(url, href)
                    links.append({
                        'url': absolute_url,
                        'text': a_tag.get_text(strip=True)[:100],  # Limit text length
                        'rel': a_tag.get('rel', []),
                        'target': a_tag.get('target', '')
                    })

            return links

        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(delay * (attempt + 1))  # Linear backoff before retrying
            else:
                print(f"Failed to extract links after {max_retries} attempts")

    return []

# Usage
links = extract_links_robust("https://example.com")
print(f"Extracted {len(links)} links")
Key Points
- Requests handles HTTP: Fetches the webpage content
- BeautifulSoup parses HTML: Finds and extracts link elements
- URL handling: Use urljoin() to convert relative URLs to absolute URLs (see the short example after this list)
- Error handling: Always check the response status and handle exceptions
- Filtering: Remove empty links, anchors, and invalid URLs
- Headers: Include a User-Agent to avoid being blocked by some websites
- Timeouts: Set timeouts to prevent hanging requests
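As a quick illustration of the urljoin() point above, here is a minimal sketch; the page URL and relative paths are made up for the example:

from urllib.parse import urljoin

base = "https://example.com/blog/post.html"

# Relative paths are resolved against the page the link was found on
print(urljoin(base, "images/photo.png"))   # https://example.com/blog/images/photo.png
print(urljoin(base, "/about"))             # https://example.com/about
print(urljoin(base, "https://other.org"))  # absolute URLs pass through unchanged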
This approach gives you complete control over link extraction and allows for sophisticated filtering and processing of the extracted URLs.
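For instance, here is a small post-processing sketch (not part of the examples above) that deduplicates the output of extract_links_robust() and drops links to common non-page resources; the extension list is just an illustrative choice:

from urllib.parse import urlparse

def filter_page_links(links):
    """Deduplicate extracted links and skip common non-page file types."""
    skip_extensions = ('.pdf', '.zip', '.jpg', '.png', '.gif')  # illustrative list, adjust as needed
    seen = set()
    pages = []
    for link in links:
        link_url = link['url']
        path = urlparse(link_url).path.lower()
        if link_url in seen or path.endswith(skip_extensions):
            continue
        seen.add(link_url)
        pages.append(link)
    return pages

# Usage with the robust extractor defined above
pages = filter_page_links(extract_links_robust("https://example.com"))
print(f"{len(pages)} unique page links")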