What are the best practices for handling timeouts in JavaScript scraping?
Handling timeouts effectively is crucial for building robust and reliable JavaScript web scrapers. Timeouts prevent your scraping scripts from hanging indefinitely when pages load slowly, network requests fail, or elements don't appear as expected. This comprehensive guide covers the best practices for implementing timeout strategies in JavaScript scraping applications.
Understanding Different Types of Timeouts
Page Load Timeouts
Page load timeouts control how long your scraper waits for a page to fully load before proceeding or throwing an error.
// Puppeteer example: bound how long a navigation is allowed to take.
const browser = await puppeteer.launch();
const page = await browser.newPage();

// Default navigation timeout for this page: 30 seconds.
page.setDefaultNavigationTimeout(30000);

// Options for this specific navigation; the timeout matches the page default.
const navigationOptions = { waitUntil: 'networkidle2', timeout: 30000 };

try {
  await page.goto('https://example.com', navigationOptions);
} catch (error) {
  // A failed navigation is reported here instead of aborting the script.
  console.error('Page load timeout:', error.message);
}
Element Wait Timeouts
Element wait timeouts specify how long to wait for specific elements to appear or become available.
// Wait (at most 15 seconds) for an element, then read its text.
const dynamicSelector = '.dynamic-content';
const waitOptions = { timeout: 15000 };

try {
  await page.waitForSelector(dynamicSelector, waitOptions);
  const content = await page.$eval(dynamicSelector, (node) => node.textContent);
  console.log('Content loaded:', content);
} catch (error) {
  // Reached when the wait or the read fails.
  console.error('Element timeout:', error.message);
}
Network Request Timeouts
Network request timeouts control how long individual HTTP requests can take.
// HTTP request through axios with a per-request timeout.
const axios = require('axios');

const requestOptions = {
  headers: { 'User-Agent': 'Mozilla/5.0...' },
  timeout: 10000, // 10 seconds
};
const response = await axios.get('https://api.example.com/data', requestOptions);
Best Practices for Timeout Configuration
1. Set Appropriate Timeout Values
Choose timeout values based on your target websites and use cases:
// Timeout presets in milliseconds, keyed by the kind of site being scraped.
// Each row is [site type, navigation, element, network].
const timeoutConfig = Object.fromEntries(
  [
    ['fastSites', 15000, 5000, 8000], // 15 s / 5 s / 8 s
    ['slowSites', 60000, 20000, 30000], // 60 s / 20 s / 30 s
    ['apiEndpoints', 10000, 3000, 5000], // 10 s / 3 s / 5 s
  ].map(([siteType, navigation, element, network]) => [
    siteType,
    { navigation, element, network },
  ]),
);
/**
 * Apply the navigation and element timeouts of a `timeoutConfig` preset to a page.
 *
 * The preset's `network` value is not used here; only the two page-level
 * defaults are set.
 *
 * @param {object} page - Object exposing `setDefaultNavigationTimeout(ms)` and `setDefaultTimeout(ms)`.
 * @param {string} siteType - Key of `timeoutConfig` selecting the preset.
 * @throws {Error} If `siteType` is not an own key of `timeoutConfig`.
 */
function configureTimeouts(page, siteType) {
  // Own-property check so inherited keys such as "constructor" are rejected
  // with a clear message instead of failing later on `undefined.navigation`.
  if (!Object.prototype.hasOwnProperty.call(timeoutConfig, siteType)) {
    const known = Object.keys(timeoutConfig).join(', ');
    throw new Error(`Unknown site type "${siteType}"; expected one of: ${known}`);
  }

  const { navigation, element } = timeoutConfig[siteType];
  page.setDefaultNavigationTimeout(navigation);
  page.setDefaultTimeout(element);
}
2. Implement Progressive Timeout Strategies
Use progressive timeouts that increase with retry attempts:
/**
 * Scrape a URL, retrying with a longer navigation timeout on every attempt.
 *
 * Attempt `n` uses a timeout of `15000 * n` ms and, if it fails, is followed
 * by a pause of `2000 * n` ms before the next attempt. Relies on `browser`
 * and `extractData`, which are not defined in this snippet.
 *
 * @param {string} url - Address to open.
 * @param {number} [maxRetries=3] - Maximum number of attempts.
 * @returns {Promise<*>} Whatever `extractData(page)` resolves to.
 * @throws {Error} `Failed after N attempts` once the last attempt fails; the
 *   underlying error is attached as `cause`.
 */
async function scrapeWithProgressiveTimeout(url, maxRetries = 3) {
  const baseTimeout = 15000; // 15 seconds

  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    const timeout = baseTimeout * attempt; // Increase timeout each retry
    let page;

    try {
      page = await browser.newPage();
      page.setDefaultNavigationTimeout(timeout);
      await page.goto(url, { waitUntil: 'networkidle2', timeout });
      return await extractData(page);
    } catch (error) {
      console.log(`Attempt ${attempt} failed:`, error.message);
      if (attempt === maxRetries) {
        throw new Error(`Failed after ${maxRetries} attempts`, { cause: error });
      }
    } finally {
      // Close the page on success AND failure; otherwise every failed attempt
      // leaves an open tab behind.
      if (page) {
        try {
          await page.close();
        } catch (closeError) {
          // A failed close must not mask the scrape result or the scrape error.
          console.warn('Failed to close page:', closeError.message);
        }
      }
    }

    // Wait before retry
    await new Promise((resolve) => setTimeout(resolve, 2000 * attempt));
  }
}
3. Use Conditional Timeouts
Adjust timeouts based on page characteristics or content type:
/**
 * Pick page timeouts from simple substring heuristics on the URL.
 *
 * - URL contains "app" or "dashboard": 45 s navigation / 20 s default.
 * - otherwise, URL contains "/api/" or ".json": 10 s navigation / 5 s default.
 * - anything else: 30 s navigation / 15 s default.
 *
 * @param {object} page - Object exposing `setDefaultNavigationTimeout(ms)` and `setDefaultTimeout(ms)`.
 * @param {string} url - URL that is about to be loaded.
 * @returns {Promise<void>}
 */
async function adaptiveTimeout(page, url) {
  const mentionsAny = (markers) => markers.some((marker) => url.includes(marker));

  // Heavy JavaScript application?
  const looksLikeApp = mentionsAny(['app', 'dashboard']);
  // API endpoint?
  const looksLikeApi = mentionsAny(['/api/', '.json']);

  let navigationMs = 30000;
  let defaultMs = 15000;
  if (looksLikeApp) {
    // The application check wins when both heuristics match.
    navigationMs = 45000;
    defaultMs = 20000;
  } else if (looksLikeApi) {
    navigationMs = 10000;
    defaultMs = 5000;
  }

  page.setDefaultNavigationTimeout(navigationMs);
  page.setDefaultTimeout(defaultMs);
}
Advanced Timeout Handling Techniques
1. Custom Wait Functions with Timeouts
Create custom wait functions that combine multiple conditions:
/**
 * Wait until the page looks fully loaded: the network is idle, the document
 * is complete, and no `.loading` / `.spinner` elements remain.
 *
 * The checks are retried in steps of at most 5 seconds (1 second apart) until
 * they pass or the overall budget is used up. Uses the Playwright page API
 * (`waitForLoadState`, `waitForFunction(fn, arg, options)`).
 *
 * @param {object} page - Playwright page.
 * @param {number} [timeout=30000] - Overall budget in milliseconds.
 * @returns {Promise<boolean>} Resolves to `true` once the page is loaded.
 * @throws {Error} `Complete load timeout exceeded` when the budget runs out;
 *   the last step failure (if any) is attached as `cause`.
 */
async function waitForCompleteLoad(page, timeout = 30000) {
  const STEP_TIMEOUT_MS = 5000;
  const RETRY_DELAY_MS = 1000;
  const deadline = Date.now() + timeout;
  let lastError;

  for (;;) {
    const remaining = deadline - Date.now();
    // Checked at the top of every iteration so the function can never fall
    // out of the loop and silently return `undefined`.
    if (!(remaining > 0)) {
      throw new Error('Complete load timeout exceeded', { cause: lastError });
    }

    // Never let a single step outlive the overall budget. Keep it >= 1 because
    // a timeout of 0 means "no timeout" in Playwright.
    const stepTimeout = Math.max(1, Math.min(STEP_TIMEOUT_MS, remaining));

    try {
      // Wait for network to be idle
      await page.waitForLoadState('networkidle', { timeout: stepTimeout });

      // Wait for specific indicators that content is loaded. The second
      // argument is the value handed to the page function; options come third.
      await page.waitForFunction(
        () =>
          document.readyState === 'complete' &&
          document.querySelectorAll('.loading').length === 0 &&
          !document.querySelector('.spinner'),
        undefined,
        { timeout: stepTimeout },
      );

      return true; // Successfully loaded
    } catch (error) {
      lastError = error;
      const left = deadline - Date.now();
      if (left > 0) {
        // Short pause before the next round, clipped to the remaining budget.
        await new Promise((resolve) => setTimeout(resolve, Math.min(RETRY_DELAY_MS, left)));
      }
    }
  }
}
2. Timeout with Fallback Strategies
Implement fallback strategies when timeouts occur:
/**
 * Read the text of the first selector that shows up on the page.
 *
 * The first entry of `selectors` is awaited for the full `timeout`; each
 * remaining entry is then tried in order with half of it. If none appears,
 * the page HTML is handed to `extractContentFromHTML` (not defined in this
 * snippet) and its result is returned.
 *
 * @param {object} page - Page exposing `waitForSelector`, `$eval` and `content`.
 * @param {string[]} selectors - Primary selector followed by fallbacks.
 * @param {number} [timeout=15000] - Wait budget for the primary selector, in ms.
 * @returns {Promise<*>} Element text, or the result of `extractContentFromHTML`.
 */
async function scrapeWithFallback(page, selectors, timeout = 15000) {
  const primarySelector = selectors[0];
  const fallbackSelectors = selectors.slice(1);

  // Wait for `selector`, then resolve to its text content.
  const readText = async (selector, waitMs) => {
    await page.waitForSelector(selector, { timeout: waitMs });
    return page.$eval(selector, (node) => node.textContent);
  };

  // Try primary selector first
  try {
    return await readText(primarySelector, timeout);
  } catch (timeoutError) {
    console.log('Primary selector timed out, trying fallbacks...');
  }

  // Try fallback selectors
  for (const selector of fallbackSelectors) {
    try {
      return await readText(selector, timeout / 2);
    } catch (fallbackError) {
      console.log(`Fallback selector ${selector} also failed`);
    }
  }

  // If all selectors fail, return partial content
  const pageContent = await page.content();
  return extractContentFromHTML(pageContent);
}
3. Race Conditions and Timeout Handling
Use Promise.race to impose a custom deadline on an operation, or to wait for whichever of several outcomes happens first:
/**
 * Navigate to `url` and wait for either the target or the error element,
 * enforcing custom deadlines with `Promise.race`.
 *
 * Navigation is given 25 seconds ('Custom timeout'); the element wait is given
 * 10 seconds ('Element timeout'). Each deadline timer is cancelled as soon as
 * its race settles, so a successful call leaves no pending timers behind.
 *
 * @param {object} page - Page exposing `goto` and `waitForSelector`.
 * @param {string} url - Address to open.
 * @returns {Promise<*>} Whatever the winning `waitForSelector` call resolves to.
 * @throws {Error} The first rejection of either race; it is logged and rethrown.
 */
async function handleTimeoutRace(page, url) {
  // Race `contenders` against a deadline and always cancel the deadline timer.
  const raceWithDeadline = async (contenders, ms, message) => {
    let timer;
    const deadline = new Promise((_, reject) => {
      timer = setTimeout(() => reject(new Error(message)), ms);
    });
    try {
      return await Promise.race([...contenders, deadline]);
    } finally {
      clearTimeout(timer);
    }
  };

  try {
    // Race between navigation and timeout
    await raceWithDeadline(
      [page.goto(url, { waitUntil: 'networkidle2' })],
      25000,
      'Custom timeout',
    );

    // Race between element appearance and timeout
    return await raceWithDeadline(
      [page.waitForSelector('.target-element'), page.waitForSelector('.error-message')],
      10000,
      'Element timeout',
    );
  } catch (error) {
    console.error('Race condition timeout:', error.message);
    throw error;
  }
}
Timeout Error Handling and Recovery
1. Graceful Timeout Error Handling
/**
 * Error describing an operation that ran out of time.
 *
 * Carries the operation name, the timeout in milliseconds and the URL as
 * properties in addition to the formatted message.
 */
class TimeoutError extends Error {
  /**
   * @param {string} operation - Name of the operation that timed out.
   * @param {number} timeout - Time limit in milliseconds.
   * @param {string} url - URL the operation was working on.
   * @param {{cause?: unknown}} [options] - Standard `Error` options, e.g. the original error as `cause`.
   */
  constructor(operation, timeout, url, options) {
    super(`${operation} timed out after ${timeout}ms for ${url}`, options);
    this.name = 'TimeoutError';
    this.operation = operation;
    this.timeout = timeout;
    this.url = url;
  }
}

/**
 * Run `operation` and normalise timeout failures into `TimeoutError`.
 *
 * A failure counts as a timeout when its `name` is 'TimeoutError' or its
 * message mentions "timeout" / "timed out" / "TIMEDOUT" in any letter case.
 * Everything else is rethrown untouched.
 *
 * @param {Function} operation - Zero-argument (async) function to run.
 * @param {number} timeout - Limit in ms, used only for the error report.
 * @param {string} url - URL being processed, used only for the error report.
 * @returns {Promise<*>} The value `operation()` resolves to.
 * @throws {TimeoutError} When the failure looks like a timeout; the original error is its `cause`.
 */
async function handleTimeoutGracefully(operation, timeout, url) {
  try {
    return await operation();
  } catch (error) {
    // Anything can be thrown, not just Error instances, so read defensively.
    const message = typeof error?.message === 'string' ? error.message : String(error);
    const looksLikeTimeout =
      error?.name === 'TimeoutError' || /time(?:d\s?)?out/i.test(message);

    if (looksLikeTimeout) {
      // Anonymous functions have an empty name; fall back to a readable label.
      throw new TimeoutError(operation.name || 'operation', timeout, url, { cause: error });
    }
    throw error;
  }
}
2. Timeout Monitoring and Logging
/**
 * Tracks how long operations take and how many of them hit their time limit.
 *
 * `stats.total` counts started operations, `stats.timeouts` counts finished
 * operations that failed or reached their limit, and `stats.avgDuration` is
 * the arithmetic mean duration (ms) of all finished operations.
 */
class TimeoutMonitor {
  constructor() {
    // operationId -> { operation, timeout, startTime } for operations in flight.
    this.timeouts = new Map();
    // Number of finished operations; the denominator of the running mean.
    this.completed = 0;
    this.stats = {
      total: 0,
      timeouts: 0,
      avgDuration: 0,
    };
  }

  /**
   * Register the start of an operation.
   *
   * @param {*} operationId - Key used to match the later `endTimer` call.
   * @param {string} operation - Label used in the warning message.
   * @param {number} timeout - Time limit in milliseconds.
   */
  startTimer(operationId, operation, timeout) {
    const startTime = Date.now();
    this.timeouts.set(operationId, { operation, timeout, startTime });
    this.stats.total++;
  }

  /**
   * Register the end of an operation; unknown ids are ignored.
   *
   * @param {*} operationId - Key passed to `startTimer`.
   * @param {boolean} [success=true] - `false` counts the operation as a timeout.
   */
  endTimer(operationId, success = true) {
    const data = this.timeouts.get(operationId);
    if (!data) return;

    const duration = Date.now() - data.startTime;
    if (!success || duration >= data.timeout) {
      this.stats.timeouts++;
      console.warn(`Timeout for ${data.operation}: ${duration}ms (limit: ${data.timeout}ms)`);
    }

    this.updateAverages(duration);
    this.timeouts.delete(operationId);
  }

  /**
   * Fold one more duration into the running mean.
   *
   * @param {number} duration - Duration of the finished operation in ms.
   */
  updateAverages(duration) {
    // Incremental mean: every sample carries equal weight. Averaging the
    // previous average with the new value would halve the first sample and
    // over-weight the most recent ones.
    this.completed += 1;
    this.stats.avgDuration += (duration - this.stats.avgDuration) / this.completed;
  }

  /**
   * @returns {{total: number, timeouts: number, avgDuration: number, timeoutRate: number}}
   *   Snapshot of the counters plus the timeout rate in percent (0 when no
   *   operation has been started).
   */
  getStats() {
    const { total, timeouts } = this.stats;
    return {
      ...this.stats,
      // Guard the 0 / 0 case, which would otherwise report NaN.
      timeoutRate: total === 0 ? 0 : (timeouts / total) * 100,
    };
  }
}
Framework-Specific Best Practices
Puppeteer Timeout Configuration
When handling timeouts with Puppeteer, use the following configuration:
const puppeteer = require('puppeteer');

// Browser launch timeout: 60 seconds.
const launchOptions = { timeout: 60000 };
const browser = await puppeteer.launch(launchOptions);
const page = await browser.newPage();

// Page-wide defaults for navigation and for other waits.
page.setDefaultNavigationTimeout(30000);
page.setDefaultTimeout(15000);

// Explicit timeouts on individual operations.
const gotoOptions = { waitUntil: 'networkidle2', timeout: 30000 };
await page.goto(url, gotoOptions);

const contentWaitOptions = { timeout: 20000, visible: true };
await page.waitForSelector('.content', contentWaitOptions);
Playwright Timeout Configuration
const { chromium } = require('playwright');

const browser = await chromium.launch();

// browser.newContext() has no `timeout` option; context-wide defaults are
// configured through setters on the context instead.
const context = await browser.newContext();
context.setDefaultTimeout(30000);

const page = await context.newPage();

// Set page-specific timeouts
page.setDefaultTimeout(15000);
page.setDefaultNavigationTimeout(30000);

// Wait for an element with an explicit per-call timeout. `expect(...)` comes
// from the separate @playwright/test package, which this snippet does not
// load, so the library-level locator wait is used instead.
await page.locator('.result').waitFor({ state: 'visible', timeout: 20000 });
Performance Optimization with Timeouts
1. Concurrent Scraping with Timeout Management
/**
 * Scrape several URLs with bounded concurrency.
 *
 * Each URL gets its own page with conservative timeouts (20 s navigation,
 * 10 s default). A URL that fails is logged and reported as `null`, so one
 * bad page never rejects the whole batch. Relies on `Semaphore`, `browser`
 * and `scrapePageWithTimeout`, which are not defined in this snippet.
 *
 * @param {string[]} urls - Addresses to scrape.
 * @param {number} [concurrency=5] - Value passed to the `Semaphore` constructor.
 * @returns {Promise<Array<*>>} One entry per URL, in input order; `null` for failures.
 */
async function scrapeMultiplePages(urls, concurrency = 5) {
  const semaphore = new Semaphore(concurrency);

  const scrapeOne = async (url) => {
    await semaphore.acquire();
    let page;

    try {
      page = await browser.newPage();

      // Set conservative timeouts for concurrent operations
      page.setDefaultNavigationTimeout(20000);
      page.setDefaultTimeout(10000);

      return await scrapePageWithTimeout(page, url);
    } catch (error) {
      console.error(`Failed to scrape ${url}:`, error.message);
      return null;
    } finally {
      try {
        // Close on success and on failure so failed URLs do not leak pages.
        if (page) {
          await page.close();
        }
      } catch (closeError) {
        console.error(`Failed to close page for ${url}:`, closeError.message);
      } finally {
        // Always hand the slot back, even if closing the page failed.
        semaphore.release();
      }
    }
  };

  const settled = await Promise.allSettled(urls.map((url) => scrapeOne(url)));
  return settled.map((outcome) => (outcome.status === 'fulfilled' ? outcome.value : null));
}
2. Adaptive Timeout Based on Performance
/**
 * Derives a timeout for a URL from how earlier requests to the same host
 * performed. Keeps at most the 100 most recent measurements.
 */
class AdaptiveTimeoutManager {
  constructor() {
    // Entries: { url, duration, success, timestamp }, oldest first.
    this.performanceHistory = [];
    // Timeout (ms) used when nothing is known about a host; also the lower bound.
    this.baseTimeout = 15000;
  }

  /**
   * Hostname of `url`, or `null` when it cannot be parsed.
   *
   * @param {string} url
   * @returns {string|null}
   */
  static hostnameOf(url) {
    try {
      return new URL(url).hostname;
    } catch {
      return null;
    }
  }

  /**
   * Store the outcome of one operation.
   *
   * @param {string} url - URL that was processed.
   * @param {number} duration - How long it took, in milliseconds.
   * @param {boolean} success - Whether it succeeded.
   */
  recordPerformance(url, duration, success) {
    this.performanceHistory.push({ url, duration, success, timestamp: Date.now() });

    // Keep only recent history (last 100 operations)
    if (this.performanceHistory.length > 100) {
      this.performanceHistory = this.performanceHistory.slice(-100);
    }
  }

  /**
   * Suggest a timeout for `url` from the last 10 measurements of its host.
   *
   * The result is twice the average duration, but at least `baseTimeout`,
   * raised by 50% when fewer than 80% of those operations succeeded, and
   * capped at 60 seconds. Without history the base timeout is returned.
   *
   * @param {string} url - Absolute URL about to be requested.
   * @returns {number} Timeout in milliseconds.
   * @throws {TypeError} If `url` is not a valid absolute URL.
   */
  calculateOptimalTimeout(url) {
    const domain = new URL(url).hostname;

    // Compare parsed hostnames. A substring test on the raw URL would also
    // match other hosts ("notexample.com") and query strings mentioning it.
    const domainHistory = this.performanceHistory
      .filter((entry) => AdaptiveTimeoutManager.hostnameOf(entry.url) === domain)
      .slice(-10); // Last 10 operations for this domain

    if (domainHistory.length === 0) {
      return this.baseTimeout;
    }

    const totalDuration = domainHistory.reduce((sum, entry) => sum + entry.duration, 0);
    const avgDuration = totalDuration / domainHistory.length;
    const successRate =
      domainHistory.filter((entry) => entry.success).length / domainHistory.length;

    // Adjust timeout based on historical performance
    let timeout = Math.max(avgDuration * 2, this.baseTimeout);

    // Increase timeout for unreliable domains
    if (successRate < 0.8) {
      timeout *= 1.5;
    }

    return Math.min(timeout, 60000); // Cap at 60 seconds
  }
}
Monitoring and Debugging Timeouts
1. Timeout Analytics
/**
 * Create an in-memory log of timed operations with a summary report.
 *
 * @returns {{operations: object[], logTimeout: Function, generateReport: Function}}
 *   `operations` holds the entries in the order they were logged.
 */
function createTimeoutAnalytics() {
  const analytics = {
    operations: [],

    /**
     * Append one measurement. An entry is flagged `timedOut` when
     * `duration >= limit`.
     *
     * @param {string} operation - Label of the operation.
     * @param {string} url - URL the operation worked on.
     * @param {number} duration - Measured duration in ms.
     * @param {number} limit - Timeout limit in ms.
     * @param {Error} [error] - Failure, if any; only its message is stored.
     */
    logTimeout(operation, url, duration, limit, error) {
      this.operations.push({
        operation,
        url,
        duration,
        limit,
        timedOut: duration >= limit,
        error: error?.message,
        timestamp: new Date().toISOString(),
      });
    },

    /**
     * Summarise everything logged so far.
     *
     * @returns {{totalOperations: number, timeoutCount: number, timeoutRate: number,
     *   averageDuration: number, slowestOperations: object[]}}
     *   Rate is in percent; rate and average are 0 for an empty log;
     *   `slowestOperations` lists up to five entries, slowest first.
     */
    generateReport() {
      const totalOperations = this.operations.length;
      const timeoutCount = this.operations.filter((op) => op.timedOut).length;
      const totalDuration = this.operations.reduce((sum, op) => sum + op.duration, 0);
      // Guard the empty log: 0 / 0 would report NaN.
      const hasData = totalOperations > 0;

      return {
        totalOperations,
        timeoutCount,
        timeoutRate: hasData ? (timeoutCount / totalOperations) * 100 : 0,
        averageDuration: hasData ? totalDuration / totalOperations : 0,
        // Sort a copy: Array.prototype.sort works in place and would reorder
        // the chronological log itself.
        slowestOperations: [...this.operations]
          .sort((a, b) => b.duration - a.duration)
          .slice(0, 5),
      };
    },
  };

  return analytics;
}
2. Real-time Timeout Monitoring
/**
 * Create a monitor that warns when an in-flight operation reaches 80% of its
 * time limit and logs how long each operation took when it ends.
 *
 * @returns {{activeOperations: Map, startOperation: Function, endOperation: Function}}
 */
function setupTimeoutMonitoring() {
  const monitor = {
    // id -> { operation, timeout, startTime, timer } for operations in flight.
    activeOperations: new Map(),

    /**
     * Start tracking an operation and arm its early-warning timer.
     *
     * @param {*} id - Key used to match the later `endOperation` call.
     * @param {string} operation - Label used in log messages.
     * @param {number} timeout - Time limit in milliseconds.
     */
    startOperation(id, operation, timeout) {
      // If the id is reused while still active, cancel the old warning timer;
      // otherwise it would be unreachable and fire for a replaced entry.
      const previous = this.activeOperations.get(id);
      if (previous) {
        clearTimeout(previous.timer);
      }

      this.activeOperations.set(id, {
        operation,
        timeout,
        startTime: Date.now(),
        timer: setTimeout(() => {
          console.warn(`⚠️ Operation ${operation} (${id}) is taking longer than expected (${timeout}ms)`);
        }, timeout * 0.8), // Warn at 80% of timeout
      });
    },

    /**
     * Stop tracking an operation and log its duration; unknown ids are ignored.
     *
     * @param {*} id - Key passed to `startOperation`.
     * @param {boolean} [success=true] - Selects the success or failure log line.
     */
    endOperation(id, success = true) {
      const op = this.activeOperations.get(id);
      if (!op) {
        return;
      }

      clearTimeout(op.timer);
      const duration = Date.now() - op.startTime;

      if (success) {
        console.log(`✅ ${op.operation} completed in ${duration}ms`);
      } else {
        console.error(`❌ ${op.operation} failed after ${duration}ms`);
      }

      this.activeOperations.delete(id);
    },
  };

  return monitor;
}
Conclusion
Effective timeout handling is essential for building reliable JavaScript web scrapers. By implementing progressive timeout strategies, using appropriate timeout values for different scenarios, and monitoring timeout performance, you can create robust scrapers that handle various network conditions and website behaviors gracefully.
Remember to:
- Set reasonable timeout values based on your target websites
- Implement retry logic with progressive timeouts
- Use fallback strategies when primary selectors time out
- Monitor and analyze timeout patterns to optimize performance
- Test your timeout configurations under different network conditions
For more specific guidance on timeout handling with different tools, check out our guides on handling AJAX requests using Puppeteer and using the waitFor function in Puppeteer for more advanced waiting strategies.