How Do I Extract Data Using the MCP API?
Extracting data using the Model Context Protocol (MCP) API involves creating MCP servers that expose data extraction tools to AI assistants like Claude. Unlike traditional web scraping where you write explicit extraction logic, MCP enables AI-powered data extraction through natural language interactions with your custom tools and external scraping APIs.
This guide walks you through implementing data extraction using MCP, from basic HTML retrieval to complex structured data extraction with AI-powered field recognition.
Understanding MCP Data Extraction Flow
The MCP data extraction workflow follows this pattern:
┌──────────────┐         ┌─────────────┐         ┌──────────────────┐
│   AI Model   │ ──────► │ MCP Server  │ ──────► │   Scraping API   │
│   (Claude)   │   MCP   │   (Tools)   │  HTTP   │  WebScraping.AI  │
└──────────────┘         └─────────────┘         └──────────────────┘
       │                        │                         │
       │                        ▼                         │
       │                 ┌─────────────┐                  │
       └────────────────►│  Extracted  │◄─────────────────┘
           Natural       │    Data     │   JSON Response
           Language      └─────────────┘
The AI assistant calls your MCP server's tools, which then interact with web scraping APIs to extract and return structured data.
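Under the hood, each tool invocation is a JSON-RPC 2.0 message exchanged over stdio. A tools/call request for the extract_structured_data tool defined later in this guide looks roughly like this (the id and argument values are illustrative):
{
  "jsonrpc": "2.0",
  "id": 2,
  "method": "tools/call",
  "params": {
    "name": "extract_structured_data",
    "arguments": {
      "url": "https://example.com/product/12345",
      "fields": {
        "title": "The product title",
        "price": "The current price"
      }
    }
  }
}
You rarely construct these messages yourself: the AI client generates them from natural language, and the MCP SDK routes them to your tool handlers.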
Setting Up Your MCP Server for Data Extraction
Python Implementation
First, install the required dependencies:
pip install mcp httpx beautifulsoup4
Create a complete MCP server with data extraction capabilities:
import asyncio
import json
import os

import httpx
from bs4 import BeautifulSoup
from mcp.server import Server
from mcp.server.stdio import stdio_server
from mcp.types import Tool, TextContent
# Initialize MCP server
app = Server("data-extraction-server")
# Define extraction tools
@app.list_tools()
async def list_tools() -> list[Tool]:
return [
Tool(
name="extract_html",
description="Extract raw HTML content from a URL with JavaScript rendering",
inputSchema={
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "The target URL to extract data from"
},
"wait_for": {
"type": "string",
"description": "CSS selector to wait for before extraction"
},
"js_timeout": {
"type": "number",
"description": "JavaScript rendering timeout in milliseconds",
"default": 2000
}
},
"required": ["url"]
}
),
Tool(
name="extract_structured_data",
description="Extract structured data fields using AI from any webpage",
inputSchema={
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "The target URL"
},
"fields": {
"type": "object",
"description": "Object mapping field names to extraction instructions",
"additionalProperties": {
"type": "string"
}
}
},
"required": ["url", "fields"]
}
),
Tool(
name="extract_selected_content",
description="Extract specific elements using CSS selectors",
inputSchema={
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "The target URL"
},
"selector": {
"type": "string",
"description": "CSS selector for the elements to extract"
}
},
"required": ["url", "selector"]
}
),
Tool(
name="extract_text",
description="Extract clean, readable text content from a webpage",
inputSchema={
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "The target URL"
}
},
"required": ["url"]
}
)
]
# Implement tool handlers
@app.call_tool()
async def call_tool(name: str, arguments: dict) -> list[TextContent]:
    # Read the API key from the environment (set in the Claude Desktop config below)
    API_KEY = os.environ.get("WEBSCRAPING_AI_API_KEY")
    BASE_URL = "https://api.webscraping.ai"
async with httpx.AsyncClient(timeout=30.0) as client:
if name == "extract_html":
response = await client.get(
f"{BASE_URL}/html",
params={
"url": arguments["url"],
"api_key": API_KEY,
"js": "true",
"wait_for": arguments.get("wait_for"),
"js_timeout": arguments.get("js_timeout", 2000)
}
)
response.raise_for_status()
return [TextContent(
type="text",
text=f"HTML Content from {arguments['url']}:\n\n{response.text}"
)]
elif name == "extract_structured_data":
response = await client.post(
f"{BASE_URL}/fields",
params={
"url": arguments["url"],
"api_key": API_KEY
},
json={"fields": arguments["fields"]}
)
response.raise_for_status()
extracted_data = response.json()
return [TextContent(
type="text",
text=f"Extracted Data:\n{json.dumps(extracted_data, indent=2)}"
)]
elif name == "extract_selected_content":
response = await client.get(
f"{BASE_URL}/selected",
params={
"url": arguments["url"],
"api_key": API_KEY,
"selector": arguments["selector"]
}
)
response.raise_for_status()
return [TextContent(
type="text",
text=f"Selected Content:\n{response.text}"
)]
elif name == "extract_text":
response = await client.get(
f"{BASE_URL}/text",
params={
"url": arguments["url"],
"api_key": API_KEY
}
)
response.raise_for_status()
text_data = response.json()
return [TextContent(
type="text",
text=f"Extracted Text:\n{json.dumps(text_data, indent=2)}"
)]
# Run the server
async def main():
async with stdio_server() as (read_stream, write_stream):
        await app.run(
            read_stream,
            write_stream,
            app.create_initialization_options()
        )
if __name__ == "__main__":
asyncio.run(main())
JavaScript/TypeScript Implementation
For Node.js environments, install dependencies:
npm install @modelcontextprotocol/sdk axios cheerio
Create your MCP server:
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import {
CallToolRequestSchema,
ListToolsRequestSchema,
} from "@modelcontextprotocol/sdk/types.js";
import axios from "axios";
const API_KEY = process.env.WEBSCRAPING_AI_API_KEY;
const BASE_URL = "https://api.webscraping.ai";
// Create server instance
const server = new Server(
{
name: "data-extraction-mcp-server",
version: "1.0.0",
},
{
capabilities: {
tools: {},
},
}
);
// Define extraction tools
server.setRequestHandler(ListToolsRequestSchema, async () => {
return {
tools: [
{
name: "extract_html",
description: "Extract raw HTML with JavaScript rendering support",
inputSchema: {
type: "object",
properties: {
url: {
type: "string",
description: "Target URL to extract from",
},
wait_for: {
type: "string",
description: "CSS selector to wait for",
},
js_timeout: {
type: "number",
description: "JavaScript timeout in ms",
default: 2000,
},
},
required: ["url"],
},
},
{
name: "extract_fields",
description: "Extract structured data fields using AI",
inputSchema: {
type: "object",
properties: {
url: {
type: "string",
description: "Target URL",
},
fields: {
type: "object",
description: "Field definitions for extraction",
},
},
required: ["url", "fields"],
},
},
{
name: "extract_by_selector",
description: "Extract elements matching CSS selector",
inputSchema: {
type: "object",
properties: {
url: {
type: "string",
description: "Target URL",
},
selector: {
type: "string",
description: "CSS selector",
},
},
required: ["url", "selector"],
},
},
{
name: "ask_question",
description: "Ask a question about webpage content and get AI answer",
inputSchema: {
type: "object",
properties: {
url: {
type: "string",
description: "Target URL",
},
question: {
type: "string",
description: "Question about the page content",
},
},
required: ["url", "question"],
},
},
],
};
});
// Handle tool execution
server.setRequestHandler(CallToolRequestSchema, async (request) => {
const { name, arguments: args } = request.params;
try {
if (name === "extract_html") {
const response = await axios.get(`${BASE_URL}/html`, {
params: {
url: args.url,
api_key: API_KEY,
js: true,
wait_for: args.wait_for,
js_timeout: args.js_timeout || 2000,
},
});
return {
content: [
{
type: "text",
text: `HTML from ${args.url}:\n\n${response.data}`,
},
],
};
}
if (name === "extract_fields") {
const response = await axios.post(
`${BASE_URL}/fields`,
{ fields: args.fields },
{
params: {
url: args.url,
api_key: API_KEY,
},
}
);
return {
content: [
{
type: "text",
text: JSON.stringify(response.data, null, 2),
},
],
};
}
if (name === "extract_by_selector") {
const response = await axios.get(`${BASE_URL}/selected`, {
params: {
url: args.url,
api_key: API_KEY,
selector: args.selector,
},
});
return {
content: [
{
type: "text",
text: response.data,
},
],
};
}
if (name === "ask_question") {
const response = await axios.post(
`${BASE_URL}/question`,
{ question: args.question },
{
params: {
url: args.url,
api_key: API_KEY,
},
}
);
return {
content: [
{
type: "text",
text: JSON.stringify(response.data, null, 2),
},
],
};
}
throw new Error(`Unknown tool: ${name}`);
} catch (error) {
return {
content: [
{
type: "text",
text: `Error: ${error.message}`,
},
],
isError: true,
};
}
});
// Start server
async function main() {
const transport = new StdioServerTransport();
await server.connect(transport);
console.error("Data Extraction MCP Server running");
}
main().catch(console.error);
Configuring Your MCP Server
Claude Desktop Configuration
Add your MCP server to Claude Desktop's configuration file:
macOS: ~/Library/Application Support/Claude/claude_desktop_config.json
Windows: %APPDATA%\Claude\claude_desktop_config.json
{
"mcpServers": {
"data-extraction": {
"command": "python",
"args": ["/path/to/data_extraction_mcp.py"],
"env": {
"WEBSCRAPING_AI_API_KEY": "your_api_key_here"
}
}
}
}
For Node.js servers:
{
"mcpServers": {
"data-extraction": {
"command": "node",
"args": ["/path/to/data-extraction-server.js"],
"env": {
"WEBSCRAPING_AI_API_KEY": "your_api_key_here"
}
}
}
}
Practical Data Extraction Examples
Example 1: Extracting Product Information
Using natural language with your MCP server:
User: "Extract the product title, price, and availability from
https://example.com/product/12345"
Claude (using your MCP server):
{
"title": "Premium Wireless Headphones",
"price": "$299.99",
"availability": "In Stock"
}
The MCP server executes:
# Automatically called by Claude through MCP
await call_tool(
name="extract_structured_data",
arguments={
"url": "https://example.com/product/12345",
"fields": {
"title": "The product title",
"price": "The current price",
"availability": "Whether the product is in stock"
}
}
)
Example 2: Extracting Multiple Items from a List
Similar to how you might handle AJAX requests using Puppeteer for dynamic content, MCP servers can extract data from JavaScript-rendered pages:
@app.call_tool()
async def call_tool(name: str, arguments: dict):
    API_KEY = os.environ.get("WEBSCRAPING_AI_API_KEY")

    if name == "extract_product_list":
        # First, get the HTML with JavaScript rendering
        async with httpx.AsyncClient(timeout=30.0) as client:
            html_response = await client.get(
                "https://api.webscraping.ai/html",
                params={
                    "url": arguments["url"],
                    "api_key": API_KEY,
                    "js": "true",
                    "wait_for": ".product-item"
                }
            )

        # Parse and extract structured data
        soup = BeautifulSoup(html_response.text, 'html.parser')
        products = []
        for item in soup.select('.product-item'):
            products.append({
                'name': item.select_one('.product-name').text.strip(),
                'price': item.select_one('.product-price').text.strip(),
                'rating': item.select_one('.product-rating').text.strip()
            })

        return [TextContent(
            type="text",
            text=json.dumps(products, indent=2)
        )]
Example 3: Question-Based Extraction
For complex extraction scenarios, use the question-answering approach:
// User asks: "What is the shipping policy on this page?"
if (name === "extract_shipping_policy") {
const response = await axios.post(
`${BASE_URL}/question`,
{
question: "What is the shipping policy? Include delivery times and costs."
},
{
params: {
url: args.url,
api_key: API_KEY,
},
}
);
return {
content: [
{
type: "text",
text: response.data.answer,
},
],
};
}
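For parity with the Python server, the same question-based extraction can be added to its call_tool handler; a minimal sketch, assuming the response JSON contains an answer field as in the JavaScript example above:
elif name == "extract_shipping_policy":
    response = await client.post(
        f"{BASE_URL}/question",
        params={"url": arguments["url"], "api_key": API_KEY},
        json={"question": "What is the shipping policy? Include delivery times and costs."}
    )
    response.raise_for_status()
    return [TextContent(
        type="text",
        text=response.json().get("answer", "")
    )]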
Advanced Data Extraction Patterns
Handling Pagination
Just as you would handle browser sessions in Puppeteer, you can implement pagination in your MCP server:
@app.call_tool()
async def call_tool(name: str, arguments: dict):
    API_KEY = os.environ.get("WEBSCRAPING_AI_API_KEY")

    if name == "extract_all_pages":
        all_data = []
        current_page = 1
        max_pages = arguments.get("max_pages", 10)

        async with httpx.AsyncClient(timeout=30.0) as client:
            while current_page <= max_pages:
                page_url = f"{arguments['base_url']}?page={current_page}"

                response = await client.post(
                    "https://api.webscraping.ai/fields",
                    params={"url": page_url, "api_key": API_KEY},
                    json={
                        "fields": {
                            "items": "All product items on this page as a list",
                            "has_next": "Whether there is a next page (true/false)"
                        }
                    }
                )

                data = response.json()
                all_data.extend(data.get("items", []))

                # Extracted field values may come back as strings, so compare explicitly
                if str(data.get("has_next", "")).lower() != "true":
                    break
                current_page += 1

        return [TextContent(
            type="text",
            text=json.dumps(all_data, indent=2)
        )]
Error Handling and Retry Logic
Implement robust error handling:
import asyncio
from typing import Optional
async def extract_with_retry(
client: httpx.AsyncClient,
url: str,
max_retries: int = 3
) -> Optional[dict]:
"""Extract data with automatic retry on failure"""
for attempt in range(max_retries):
try:
response = await client.get(
"https://api.webscraping.ai/html",
params={
"url": url,
"api_key": API_KEY,
"js": "true"
}
)
response.raise_for_status()
return {"success": True, "data": response.text}
except httpx.HTTPError as e:
if attempt == max_retries - 1:
return {"success": False, "error": str(e)}
await asyncio.sleep(2 ** attempt) # Exponential backoff
return None
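Wiring the helper into a tool handler then takes only a few lines; a minimal sketch, reusing the extract_html tool defined earlier:
@app.call_tool()
async def call_tool(name: str, arguments: dict) -> list[TextContent]:
    if name == "extract_html":
        async with httpx.AsyncClient(timeout=30.0) as client:
            result = await extract_with_retry(client, arguments["url"])

        if result and result["success"]:
            return [TextContent(type="text", text=result["data"])]

        error = result["error"] if result else "unknown error"
        return [TextContent(type="text", text=f"Extraction failed after retries: {error}")]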
Caching Extracted Data
Improve performance with caching:
import { createHash } from 'crypto';
const cache = new Map<string, { data: any; timestamp: number }>();
const CACHE_TTL = 3600000; // 1 hour
function getCacheKey(url: string, params: any): string {
const hash = createHash('md5');
hash.update(url + JSON.stringify(params));
return hash.digest('hex');
}
async function extractWithCache(url: string, params: any) {
const cacheKey = getCacheKey(url, params);
const cached = cache.get(cacheKey);
if (cached && Date.now() - cached.timestamp < CACHE_TTL) {
console.error('Returning cached data');
return cached.data;
}
const response = await axios.get(`${BASE_URL}/html`, {
params: { url, api_key: API_KEY, ...params },
});
cache.set(cacheKey, {
data: response.data,
timestamp: Date.now(),
});
return response.data;
}
Testing Your MCP Data Extraction Server
Unit Testing (Python)
import pytest
from unittest.mock import AsyncMock, MagicMock, patch

@pytest.mark.asyncio
async def test_extract_structured_data():
    # httpx's json() and raise_for_status() are synchronous methods, so attach
    # regular MagicMocks to the otherwise-async response mock
    mock_response = AsyncMock()
    mock_response.json = MagicMock(return_value={
        "title": "Test Product",
        "price": "$99.99"
    })
    mock_response.raise_for_status = MagicMock()
    with patch('httpx.AsyncClient.post', return_value=mock_response):
result = await call_tool(
"extract_structured_data",
{
"url": "https://example.com",
"fields": {"title": "Product title", "price": "Price"}
}
)
assert "Test Product" in result[0].text
assert "$99.99" in result[0].text
Integration Testing
# Test your MCP server manually with a raw JSON-RPC request
echo '{"jsonrpc": "2.0", "id": 1, "method": "tools/list"}' | python data_extraction_mcp.py
# Note: the server expects an "initialize" handshake before tools/list,
# so for interactive testing the MCP Inspector is more convenient:
npx @modelcontextprotocol/inspector python data_extraction_mcp.py
Performance Optimization
Concurrent Extraction
Extract data from multiple URLs simultaneously:
async def extract_multiple_urls(urls: list[str]) -> list[dict]:
"""Extract data from multiple URLs concurrently"""
async with httpx.AsyncClient() as client:
tasks = [
client.get(
"https://api.webscraping.ai/text",
params={"url": url, "api_key": API_KEY}
)
for url in urls
]
responses = await asyncio.gather(*tasks, return_exceptions=True)
results = []
for url, response in zip(urls, responses):
if isinstance(response, Exception):
results.append({"url": url, "error": str(response)})
else:
results.append({"url": url, "data": response.json()})
return results
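To expose this helper to the AI assistant, you could register it as its own tool; a minimal sketch (the extract_batch tool name is illustrative and would also need a matching entry in list_tools):
@app.call_tool()
async def call_tool(name: str, arguments: dict) -> list[TextContent]:
    if name == "extract_batch":
        # arguments["urls"] is expected to be a list of URLs
        results = await extract_multiple_urls(arguments["urls"])
        return [TextContent(
            type="text",
            text=json.dumps(results, indent=2)
        )]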
Security Best Practices
- Never hardcode API keys - Use environment variables
- Validate URLs - Prevent SSRF attacks
- Rate limiting - Implement request throttling
- Input sanitization - Validate all user inputs
import os
from urllib.parse import urlparse
def validate_url(url: str) -> bool:
"""Validate URL to prevent security issues"""
try:
parsed = urlparse(url)
# Only allow http/https schemes
if parsed.scheme not in ['http', 'https']:
return False
# Ensure hostname exists
if not parsed.netloc:
return False
# Block internal networks (optional)
if parsed.hostname in ['localhost', '127.0.0.1']:
return False
return True
    except Exception:
return False
# Always use environment variables
API_KEY = os.environ.get("WEBSCRAPING_AI_API_KEY")
if not API_KEY:
raise ValueError("WEBSCRAPING_AI_API_KEY environment variable not set")
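Call the validator at the top of every tool handler so unsafe URLs are rejected before any request is made; a minimal sketch:
@app.call_tool()
async def call_tool(name: str, arguments: dict) -> list[TextContent]:
    if not validate_url(arguments.get("url", "")):
        return [TextContent(type="text", text="Error: invalid or disallowed URL")]
    # ...continue with the extraction logic shown earlier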
Troubleshooting Common Issues
Issue 1: MCP Server Not Responding
# Check if server is running
ps aux | grep data_extraction_mcp
# Check Claude Desktop logs (macOS)
tail -f ~/Library/Logs/Claude/mcp*.log
Issue 2: JavaScript Rendering Timeout
Increase timeout for dynamic content, similar to techniques used when you inject JavaScript into a page using Puppeteer:
response = await client.get(
"https://api.webscraping.ai/html",
params={
"url": url,
"api_key": API_KEY,
"js": "true",
"js_timeout": 5000, # Increase to 5 seconds
"wait_for": ".dynamic-content"
}
)
Issue 3: Rate Limiting
Implement throttling:
import asyncio
import time
from collections import deque
class RateLimiter:
def __init__(self, max_requests: int, time_window: int):
self.max_requests = max_requests
self.time_window = time_window
self.requests = deque()
async def acquire(self):
now = time.time()
# Remove old requests outside the time window
while self.requests and self.requests[0] < now - self.time_window:
self.requests.popleft()
if len(self.requests) >= self.max_requests:
sleep_time = self.requests[0] + self.time_window - now
await asyncio.sleep(sleep_time)
self.requests.append(now)
# Use in your extraction code
rate_limiter = RateLimiter(max_requests=10, time_window=60)
@app.call_tool()
async def call_tool(name: str, arguments: dict):
await rate_limiter.acquire()
# Proceed with extraction
Conclusion
Extracting data using the MCP API transforms web scraping from a code-heavy process to a natural language interaction. By building MCP servers that expose extraction tools, you enable AI assistants to intelligently scrape and extract data from websites with minimal manual intervention.
The key advantages include:
- Natural language control over complex extraction workflows
- AI-powered field recognition without writing selectors
- Reusable tools that work across multiple projects
- Standardized interface for data extraction operations
Start by implementing basic extraction tools, then expand to handle pagination, error recovery, and advanced scenarios as your needs grow. With proper error handling, caching, and security practices, MCP-based data extraction can become a powerful addition to your development toolkit.