${result.markdown}`;
  return fullContent;
}
```
### 3. Content Monitoring

Monitor website changes by comparing Markdown snapshots:

```python
import hashlib
from firecrawl import FirecrawlApp

def monitor_content_changes(url, previous_hash=None):
    app = FirecrawlApp(api_key='your_api_key')
    result = app.scrape_url(url, params={'formats': ['markdown']})
    markdown = result['markdown']

    # Calculate content hash
    current_hash = hashlib.md5(markdown.encode()).hexdigest()

    # Check if content changed
    if previous_hash and current_hash != previous_hash:
        print(f"Content changed on {url}")
        return markdown, current_hash, True

    return markdown, current_hash, False
```
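In practice, the function above can sit inside a simple polling loop. The sketch below is illustrative only: the interval, the in-memory hash, and the snapshot filename are assumptions, not part of the Firecrawl API.

```python
import time

def poll_for_changes(url, interval_seconds=3600):
    """Poll a URL and persist a snapshot whenever the content changes (illustrative sketch)."""
    previous_hash = None
    while True:
        markdown, previous_hash, changed = monitor_content_changes(url, previous_hash)
        if changed:
            # React to the change, e.g. persist the latest snapshot
            with open('latest_snapshot.md', 'w', encoding='utf-8') as f:
                f.write(markdown)
        time.sleep(interval_seconds)
```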
## Comparing Firecrawl to Manual HTML Conversion

### Without Firecrawl (Manual Approach)

```python
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md

# Manual process - multiple steps
response = requests.get('https://example.com')
soup = BeautifulSoup(response.text, 'html.parser')

# Remove unwanted elements
for script in soup(['script', 'style']):
    script.decompose()

# Convert to Markdown
markdown = md(str(soup), heading_style="ATX")
```

### With Firecrawl (Simplified)

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key='your_api_key')
result = app.scrape_url('https://example.com')
markdown = result['markdown']  # Done!
```
## Handling Complex Scenarios

### Tables and Structured Data

Firecrawl converts HTML tables to Markdown tables automatically:

```python
# Scrape a page with tables
result = app.scrape_url('https://example.com/data-table')

# The Markdown will include properly formatted tables
# Example output:
# | Header 1 | Header 2 | Header 3 |
# |----------|----------|----------|
# | Cell 1   | Cell 2   | Cell 3   |
```
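If the downstream step needs the table data rather than the rendered Markdown, a small amount of post-processing is enough. The helper below is a sketch of that downstream handling, not a Firecrawl feature; it simply splits pipe-delimited rows out of the Markdown string.

```python
def parse_markdown_table(markdown_text):
    """Extract cell rows from pipe-delimited tables in a Markdown string."""
    rows = []
    for line in markdown_text.splitlines():
        line = line.strip()
        if not line.startswith('|'):
            continue
        # Skip separator rows, e.g. |----------|----------|
        if set(line.replace('|', '').replace(' ', '')) <= {'-', ':'}:
            continue
        rows.append([cell.strip() for cell in line.strip('|').split('|')])
    return rows

# rows[0] holds the headers, the remaining entries hold the data cells:
# rows = parse_markdown_table(result['markdown'])
```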
### Code Blocks

Code snippets are preserved with syntax highlighting hints:

```javascript
// Firecrawl maintains code formatting
const result = await app.scrapeUrl('https://docs.example.com/code-examples');

// Markdown output preserves code blocks:
// ```language-javascript
// function example() {
//   return 'Hello World';
// }
// ```
```
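When the goal is to collect those code samples for further processing, the fenced blocks can be pulled out of the Markdown with a regular expression. This is a downstream-processing sketch, not part of the Firecrawl API:

```python
import re

def extract_code_blocks(markdown_text):
    """Return (language, code) pairs for every fenced code block in a Markdown string."""
    # Match fenced blocks; the language tag after the opening fence is optional
    pattern = re.compile(r"```([\w-]*)\n(.*?)```", re.DOTALL)
    return [(lang or 'plain', code.rstrip()) for lang, code in pattern.findall(markdown_text)]

# Example usage against a scraped page:
# for lang, code in extract_code_blocks(result['markdown']):
#     print(lang, len(code))
```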
### Nested Lists and Quotes

Complex HTML structures like nested lists and blockquotes are properly converted:

```python
# Original HTML:
# <blockquote>
#   <ul>
#     <li>Item 1
#       <ul>
#         <li>Nested item</li>
#       </ul>
#     </li>
#   </ul>
# </blockquote>

# Converted Markdown:
# > - Item 1
# >   - Nested item
```
## Best Practices

### 1. Extract Main Content Only

Use `onlyMainContent: True` to avoid navigation, footers, and sidebars:

```python
result = app.scrape_url(url, params={'onlyMainContent': True})
```
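To see how much a given page shrinks when boilerplate is stripped, it can help to compare the two outputs side by side. This comparison helper is an illustrative sketch, not part of the SDK:

```python
def compare_main_content(app, url):
    """Compare Markdown length with and without main-content extraction (sketch)."""
    full = app.scrape_url(url, params={'formats': ['markdown']})
    main = app.scrape_url(url, params={'formats': ['markdown'], 'onlyMainContent': True})
    print(f"Full page:    {len(full['markdown'])} characters")
    print(f"Main content: {len(main['markdown'])} characters")
    return main['markdown']
```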
### 2. Handle JavaScript-Heavy Sites

For single-page applications, ensure adequate wait time, similar to crawling SPAs using Puppeteer:

```javascript
const result = await app.scrapeUrl(spaUrl, {
  formats: ['markdown'],
  waitFor: 5000, // Wait for JavaScript rendering
  timeout: 45000
});
```
### 3. Batch Processing

When scraping multiple pages, use the crawling functionality:

```python
# Crawl multiple pages efficiently
result = app.crawl_url(
    'https://example.com',
    params={
        'limit': 10,
        'scrapeOptions': {'formats': ['markdown']}
    }
)

# Process each page's Markdown
for page in result['data']:
    markdown = page['markdown']
    # Process markdown...
```
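A common follow-up is writing each crawled page's Markdown to disk. The sketch below assumes the crawl response exposes a `metadata` dictionary with a `sourceURL` key per page; verify those field names against your SDK version before relying on them.

```python
from pathlib import Path

def save_crawl_to_files(crawl_result, output_dir='crawled_markdown'):
    """Write each crawled page's Markdown to its own file (field names assumed, see above)."""
    out = Path(output_dir)
    out.mkdir(exist_ok=True)
    for i, page in enumerate(crawl_result['data']):
        source = page.get('metadata', {}).get('sourceURL', f'page_{i}')
        # Derive a filesystem-friendly name from the source URL
        filename = source.replace('https://', '').replace('http://', '').replace('/', '_') or f'page_{i}'
        (out / f'{filename}.md').write_text(page['markdown'], encoding='utf-8')
```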
### 4. Error Handling

Always implement proper error handling:

```python
from firecrawl import FirecrawlApp

def safe_markdown_conversion(url):
    app = FirecrawlApp(api_key='your_api_key')
    try:
        result = app.scrape_url(url, params={'formats': ['markdown']})
        return result['markdown']
    except Exception as e:
        print(f"Error converting {url}: {str(e)}")
        return None
```
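For transient failures such as timeouts or rate-limit responses, it is often worth layering a retry with exponential backoff on top of that. The wrapper below is a generic sketch, not a Firecrawl feature:

```python
import time
from firecrawl import FirecrawlApp

def scrape_with_retries(url, max_attempts=3, base_delay=2.0):
    """Retry a scrape with exponential backoff on any exception (illustrative sketch)."""
    app = FirecrawlApp(api_key='your_api_key')
    for attempt in range(1, max_attempts + 1):
        try:
            result = app.scrape_url(url, params={'formats': ['markdown']})
            return result['markdown']
        except Exception as e:
            if attempt == max_attempts:
                raise
            delay = base_delay * (2 ** (attempt - 1))
            print(f"Attempt {attempt} failed for {url}: {e}. Retrying in {delay:.0f}s...")
            time.sleep(delay)
```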
## Performance Considerations

### Response Time

Firecrawl's conversion adds minimal overhead:

- Static pages: 1-3 seconds
- JavaScript-rendered pages: 3-10 seconds (depending on `waitFor` settings)
- Complex pages: 5-15 seconds
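These figures vary by page and plan, so it is worth measuring them against your own targets. The timing helper below is a sketch; it also assumes the Python SDK accepts the same `waitFor` parameter shown in the JavaScript examples.

```python
import time
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key='your_api_key')

def timed_scrape(url, wait_for=None):
    """Measure wall-clock time for a single scrape (illustrative sketch)."""
    params = {'formats': ['markdown']}
    if wait_for is not None:
        params['waitFor'] = wait_for  # assumed to mirror the JavaScript waitFor option
    start = time.perf_counter()
    result = app.scrape_url(url, params=params)
    elapsed = time.perf_counter() - start
    print(f"{url}: {elapsed:.1f}s, {len(result['markdown'])} characters of Markdown")
    return result
```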
### Memory Usage

The Markdown output is typically 40-60% smaller than the original HTML, making it more efficient for:

- Storage
- Processing
- Transmission
- AI model inputs
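To check the reduction for a specific page, you can request both formats in one call and compare their lengths. The sketch below assumes the response exposes an `html` key alongside `markdown` when both formats are requested:

```python
from firecrawl import FirecrawlApp

def compare_output_sizes(url):
    """Compare HTML and Markdown sizes for one page (sketch; 'html' key assumed)."""
    app = FirecrawlApp(api_key='your_api_key')
    result = app.scrape_url(url, params={'formats': ['markdown', 'html']})
    html_size = len(result['html'])
    md_size = len(result['markdown'])
    reduction = (1 - md_size / html_size) * 100
    print(f"HTML: {html_size} chars, Markdown: {md_size} chars ({reduction:.0f}% smaller)")
```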
### Rate Limiting

Be mindful of API rate limits when processing multiple URLs:

```javascript
// Process URLs with rate limiting
async function processUrlsWithDelay(urls, delayMs = 1000) {
  const results = [];

  for (const url of urls) {
    try {
      const result = await app.scrapeUrl(url, {
        formats: ['markdown']
      });
      results.push(result);

      // Wait between requests
      await new Promise(resolve => setTimeout(resolve, delayMs));
    } catch (error) {
      console.error(`Failed to process ${url}:`, error);
    }
  }

  return results;
}
```
## Integration with AI and LLMs

Firecrawl's Markdown output is perfect for AI applications:

```python
from firecrawl import FirecrawlApp
from openai import OpenAI

def analyze_webpage_content(url):
    # Get clean Markdown from Firecrawl
    app = FirecrawlApp(api_key='firecrawl_key')
    result = app.scrape_url(url, params={'formats': ['markdown']})

    # Send to LLM for analysis
    client = OpenAI()
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{
            "role": "user",
            "content": f"Analyze this content:\n\n{result['markdown']}"
        }]
    )

    return response.choices[0].message.content
```
## Conclusion
Firecrawl's HTML to Markdown conversion simplifies web scraping workflows by providing clean, structured content output. The underlying Turndown library ensures high-quality conversions while Firecrawl handles the complexity of modern web pages, including JavaScript rendering and content extraction.
Whether you're building documentation systems, migrating content, or feeding data to AI models, Firecrawl's Markdown conversion feature offers a robust, developer-friendly solution that eliminates the need for manual HTML parsing and conversion logic.
For more advanced web scraping scenarios, consider exploring how to interact with DOM elements in Puppeteer for custom extraction needs beyond standard Markdown conversion.