How Do I Configure an MCP Server for My Project?
Configuring an MCP (Model Context Protocol) server for your web scraping project requires careful setup of configuration files and environment variables, along with proper initialization of server components. This guide provides comprehensive instructions for configuring MCP servers across a range of environments and use cases.
Understanding MCP Server Configuration
MCP servers act as bridges between AI models and external tools, including web scraping capabilities. Proper configuration ensures secure, efficient communication between your application and the MCP server infrastructure.
Core Configuration Components
An MCP server configuration typically consists of the following (a sketch of how these pieces fit together follows the list):
- Server initialization parameters
- Transport layer configuration (stdio, HTTP, SSE)
- Authentication and security settings
- Tool and resource definitions
- Connection timeout and retry policies
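As a rough illustration of how these components can map onto application code, here is a minimal sketch. The ServerConfig class, its field names, and the defaults are illustrative choices for this guide, not part of the MCP specification:

from dataclasses import dataclass, field

@dataclass
class ServerConfig:
    """Illustrative grouping of the configuration components listed above."""
    name: str                                        # server initialization parameter
    transport: str = "stdio"                         # transport layer: "stdio", "http", or "sse"
    auth_token: str | None = None                    # authentication/security setting
    tools: list[str] = field(default_factory=list)   # tool and resource definitions
    timeout_ms: int = 30_000                         # connection timeout policy
    max_retries: int = 3                             # retry policy

config = ServerConfig(name="webscraping-server", tools=["scrape_html", "extract_links"])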
Configuring MCP Server in Python
Python developers can configure MCP servers using the official MCP SDK. Here's a comprehensive setup example:
Installation and Basic Setup
# Install the MCP Python SDK
pip install mcp
# Install additional dependencies for web scraping
pip install httpx beautifulsoup4 playwright
# Download the browser binaries Playwright drives
python -m playwright install chromium
Creating a Configuration File
Create an mcp_config.json file in your project root:
{
"mcpServers": {
"webscraping": {
"command": "python",
"args": ["-m", "mcp_server.scraping"],
"env": {
"SCRAPING_TIMEOUT": "30000",
"MAX_CONCURRENT_REQUESTS": "5",
"USER_AGENT": "MCP-Scraper/1.0"
}
},
"playwright": {
"command": "npx",
"args": ["-y", "@modelcontextprotocol/server-playwright"],
"env": {
"BROWSER_TYPE": "chromium",
"HEADLESS": "true"
}
}
}
}
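Each entry under mcpServers tells an MCP client how to launch that server: command and args together form the child-process invocation, and env supplies per-server environment variables. As a quick sanity check that the file parses and points where you expect, you can load it directly (this snippet is a convenience for this guide, not part of the MCP SDK):

import json
from pathlib import Path

config = json.loads(Path("mcp_config.json").read_text())
for name, spec in config["mcpServers"].items():
    # Print the exact command line each server entry launches
    print(f"{name}: {spec['command']} {' '.join(spec.get('args', []))}")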
Implementing the MCP Server
Create a mcp_server/scraping.py file:
import asyncio
from mcp.server import Server, NotificationOptions
from mcp.server.models import InitializationOptions
from mcp.server.stdio import stdio_server
from mcp.types import Tool, TextContent
import httpx
from bs4 import BeautifulSoup
# Initialize the MCP server
app = Server("webscraping-server")
# Configure server capabilities
@app.list_tools()
async def list_tools() -> list[Tool]:
"""Define available scraping tools."""
return [
Tool(
name="scrape_html",
description="Scrape HTML content from a URL",
inputSchema={
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "The URL to scrape"
},
"selector": {
"type": "string",
"description": "CSS selector to extract specific elements"
},
"wait_time": {
"type": "number",
"description": "Wait time in seconds before scraping"
}
},
"required": ["url"]
}
),
Tool(
name="extract_links",
description="Extract all links from a webpage",
inputSchema={
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "The URL to extract links from"
},
"filter_pattern": {
"type": "string",
"description": "Regex pattern to filter links"
}
},
"required": ["url"]
}
)
]
# Implement tool handlers
@app.call_tool()
async def call_tool(name: str, arguments: dict) -> list[TextContent]:
"""Handle tool execution."""
if name == "scrape_html":
return await scrape_html_tool(arguments)
elif name == "extract_links":
return await extract_links_tool(arguments)
else:
raise ValueError(f"Unknown tool: {name}")
async def scrape_html_tool(args: dict) -> list[TextContent]:
"""Scrape HTML content from a URL."""
url = args["url"]
selector = args.get("selector")
wait_time = args.get("wait_time", 0)
# Add delay if specified
if wait_time > 0:
await asyncio.sleep(wait_time)
async with httpx.AsyncClient() as client:
response = await client.get(url, timeout=30.0)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
if selector:
elements = soup.select(selector)
content = "\n".join([str(el) for el in elements])
else:
content = str(soup)
return [TextContent(
type="text",
text=f"Successfully scraped {url}\n\n{content}"
)]
async def extract_links_tool(args: dict) -> list[TextContent]:
"""Extract links from a webpage."""
url = args["url"]
filter_pattern = args.get("filter_pattern")
async with httpx.AsyncClient() as client:
response = await client.get(url, timeout=30.0)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
links = [a.get('href') for a in soup.find_all('a', href=True)]
if filter_pattern:
import re
pattern = re.compile(filter_pattern)
links = [link for link in links if pattern.search(link)]
return [TextContent(
type="text",
text=f"Found {len(links)} links:\n" + "\n".join(links)
)]
# Run the server
async def main():
"""Start the MCP server."""
async with stdio_server() as (read_stream, write_stream):
await app.run(
read_stream,
write_stream,
InitializationOptions(
server_name="webscraping-server",
server_version="1.0.0",
capabilities=app.get_capabilities(
notification_options=NotificationOptions(),
experimental_capabilities={}
)
)
)
if __name__ == "__main__":
asyncio.run(main())
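One easy-to-miss detail: for the configured command python -m mcp_server.scraping to resolve, the mcp_server directory must be importable. On Python 3.3+ a bare directory works as an implicit namespace package, but adding an empty __init__.py makes the package explicit and keeps tooling happy:

# mcp_server/__init__.py
# Intentionally empty: marks mcp_server as a regular package so that
# "python -m mcp_server.scraping" resolves reliably.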
Configuring MCP Server in JavaScript/Node.js
JavaScript developers can configure MCP servers using the official TypeScript SDK:
Installation
# Install the MCP SDK
npm install @modelcontextprotocol/sdk
# Install web scraping dependencies
npm install cheerio axios playwright
# Download the browser binaries Playwright drives
npx playwright install chromium
Creating Configuration File
Create an mcp-config.json file:
{
"mcpServers": {
"scraper": {
"command": "node",
"args": ["./src/mcp-server.js"],
"env": {
"NODE_ENV": "production",
"TIMEOUT": "30000",
"MAX_RETRIES": "3"
}
}
}
}
Implementing the Server
Create a src/mcp-server.js file:
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import {
CallToolRequestSchema,
ListToolsRequestSchema,
} from "@modelcontextprotocol/sdk/types.js";
import axios from "axios";
import * as cheerio from "cheerio";
// Initialize MCP server
const server = new Server(
{
name: "webscraping-mcp-server",
version: "1.0.0",
},
{
capabilities: {
tools: {},
},
}
);
// Define available tools
server.setRequestHandler(ListToolsRequestSchema, async () => {
return {
tools: [
{
name: "scrape_webpage",
description: "Scrape content from a webpage using CSS selectors",
inputSchema: {
type: "object",
properties: {
url: {
type: "string",
description: "The URL to scrape",
},
selector: {
type: "string",
description: "CSS selector for elements to extract",
},
attribute: {
type: "string",
description: "Optional attribute to extract (e.g., 'href', 'src')",
},
},
required: ["url"],
},
},
{
name: "get_page_metadata",
description: "Extract metadata from a webpage",
inputSchema: {
type: "object",
properties: {
url: {
type: "string",
description: "The URL to analyze",
},
},
required: ["url"],
},
},
],
};
});
// Handle tool calls
server.setRequestHandler(CallToolRequestSchema, async (request) => {
const { name, arguments: args } = request.params;
if (name === "scrape_webpage") {
return await scrapeWebpage(args);
} else if (name === "get_page_metadata") {
return await getPageMetadata(args);
} else {
throw new Error(`Unknown tool: ${name}`);
}
});
// Tool implementation
async function scrapeWebpage(args) {
const { url, selector, attribute } = args;
try {
const response = await axios.get(url, {
timeout: parseInt(process.env.TIMEOUT) || 30000,
headers: {
"User-Agent": "MCP-Scraper/1.0",
},
});
const $ = cheerio.load(response.data);
let results = [];
if (selector) {
$(selector).each((i, elem) => {
if (attribute) {
results.push($(elem).attr(attribute));
} else {
results.push($(elem).text().trim());
}
});
} else {
results = [$.text()];
}
return {
content: [
{
type: "text",
text: JSON.stringify(results, null, 2),
},
],
};
} catch (error) {
return {
content: [
{
type: "text",
text: `Error scraping ${url}: ${error.message}`,
},
],
isError: true,
};
}
}
async function getPageMetadata(args) {
const { url } = args;
try {
const response = await axios.get(url, {
timeout: parseInt(process.env.TIMEOUT) || 30000,
});
const $ = cheerio.load(response.data);
const metadata = {
title: $("title").text(),
description: $('meta[name="description"]').attr("content"),
keywords: $('meta[name="keywords"]').attr("content"),
ogTitle: $('meta[property="og:title"]').attr("content"),
ogDescription: $('meta[property="og:description"]').attr("content"),
canonical: $('link[rel="canonical"]').attr("href"),
};
return {
content: [
{
type: "text",
text: JSON.stringify(metadata, null, 2),
},
],
};
} catch (error) {
return {
content: [
{
type: "text",
text: `Error extracting metadata from ${url}: ${error.message}`,
},
],
isError: true,
};
}
}
// Start the server
async function main() {
const transport = new StdioServerTransport();
await server.connect(transport);
console.error("MCP Webscraping Server running on stdio");
}
main().catch((error) => {
console.error("Fatal error:", error);
process.exit(1);
});
Advanced Configuration Options
Environment Variables
Create a .env file for sensitive configuration:
# API Configuration
MCP_SERVER_HOST=localhost
MCP_SERVER_PORT=3000
# Scraping Settings
DEFAULT_TIMEOUT=30000
MAX_RETRIES=3
RETRY_DELAY=1000
# Browser Configuration
BROWSER_HEADLESS=true
BROWSER_TYPE=chromium
# Proxy Settings (optional)
HTTP_PROXY=http://proxy.example.com:8080
HTTPS_PROXY=https://proxy.example.com:8080
# Rate Limiting
RATE_LIMIT_REQUESTS=100
RATE_LIMIT_WINDOW=60000
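These values are not loaded into the process automatically. On the Python side, a common approach is the python-dotenv package (an extra dependency, not part of the MCP SDK); note also that httpx honors HTTP_PROXY and HTTPS_PROXY from the environment by default:

import os
from dotenv import load_dotenv  # pip install python-dotenv

load_dotenv()  # read .env from the working directory into os.environ

DEFAULT_TIMEOUT = int(os.getenv("DEFAULT_TIMEOUT", "30000")) / 1000  # httpx expects seconds
MAX_RETRIES = int(os.getenv("MAX_RETRIES", "3"))
HEADLESS = os.getenv("BROWSER_HEADLESS", "true").lower() == "true"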
Docker Configuration
For deploying MCP servers in containers, create a Dockerfile:
FROM node:18-alpine
# Install Chromium and supporting libraries for headless browsing
RUN apk add --no-cache \
chromium \
nss \
freetype \
harfbuzz \
ca-certificates \
ttf-freefont
# Use the system Chromium instead of a downloaded browser build
# (these variables are read by Puppeteer; with Playwright, pass
# executablePath to chromium.launch() instead)
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \
PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser
WORKDIR /app
# Copy package files
COPY package*.json ./
# Install production dependencies only
RUN npm ci --omit=dev
# Copy application files
COPY . .
# Run the MCP server
CMD ["node", "src/mcp-server.js"]
And a docker-compose.yml file:
version: '3.8'
services:
mcp-server:
build: .
environment:
- NODE_ENV=production
- TIMEOUT=30000
- MAX_RETRIES=3
volumes:
- ./config:/app/config:ro
restart: unless-stopped
Integration with Browser Automation
For projects requiring advanced browser automation, you can integrate Puppeteer for browser session handling or drive Playwright from within your MCP server. Note that @app.call_tool() registers a single handler, so decorating a second function would replace the one defined earlier; instead, implement the new tool alongside the others and route to it from the existing handler:

from playwright.async_api import async_playwright

async def scrape_dynamic_page_tool(args: dict) -> list[TextContent]:
    """Render a JavaScript-heavy page in headless Chromium and return its HTML."""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(args["url"])
        await page.wait_for_load_state("networkidle")
        content = await page.content()
        await browser.close()
    return [TextContent(type="text", text=content)]

# In call_tool(), add a branch for the new tool name:
#     elif name == "scrape_dynamic_page":
#         return await scrape_dynamic_page_tool(arguments)

Launching a fresh browser per request is simple but slow; for high-volume scraping, consider keeping one browser instance alive and opening a new page per call.
Best Practices for MCP Server Configuration
- Use Environment Variables: Store sensitive data and configuration in environment variables, never in code
- Implement Proper Error Handling: Catch and log errors appropriately to prevent server crashes
- Set Appropriate Timeouts: Configure timeouts based on your scraping targets to avoid hanging requests (a timeout/retry/rate-limit sketch follows this list)
- Enable Logging: Implement comprehensive logging for debugging and monitoring
- Secure Your Server: Use authentication and rate limiting to protect your MCP server
- Version Control: Keep configuration files in version control (excluding sensitive data)
- Test Configuration: Always test your MCP server configuration in a staging environment first
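To make the timeout, retry, and rate-limiting advice concrete, here is a minimal sketch using httpx. The retry count, backoff schedule, and semaphore-based limiter are illustrative defaults for this guide, not values mandated by MCP:

import asyncio
import httpx

MAX_CONCURRENT = 5                   # crude rate limit: at most 5 requests in flight
_semaphore = asyncio.Semaphore(MAX_CONCURRENT)

async def fetch_with_retries(url: str, retries: int = 3, timeout: float = 30.0) -> httpx.Response:
    """GET a URL with a timeout, bounded concurrency, and exponential backoff."""
    async with _semaphore:
        for attempt in range(retries):
            try:
                async with httpx.AsyncClient(timeout=timeout) as client:
                    response = await client.get(url)
                    response.raise_for_status()
                    return response
            except (httpx.TransportError, httpx.HTTPStatusError):
                if attempt == retries - 1:
                    raise
                await asyncio.sleep(2 ** attempt)  # back off: 1s, 2s, 4s, ...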
Testing Your Configuration
Create a test script to verify your MCP server setup. Note that the stdio client launches the server as a subprocess itself, so you do not need to start the server separately:
import asyncio
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client
async def test_mcp_server():
server_params = StdioServerParameters(
command="python",
args=["-m", "mcp_server.scraping"],
env={"SCRAPING_TIMEOUT": "30000"}
)
async with stdio_client(server_params) as (read, write):
async with ClientSession(read, write) as session:
await session.initialize()
# List available tools
tools = await session.list_tools()
print(f"Available tools: {[tool.name for tool in tools.tools]}")
# Test a tool
result = await session.call_tool(
"scrape_html",
{"url": "https://example.com"}
)
print(f"Result: {result.content[0].text}")
if __name__ == "__main__":
asyncio.run(test_mcp_server())
Troubleshooting Common Configuration Issues
Connection Failures
If your MCP server fails to connect:
- Verify the command path is correct
- Check environment variables are properly set
- Ensure all dependencies are installed
- Review server logs for error messages (see the logging sketch below)
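One stdio-specific pitfall when reviewing logs: the server's stdout carries the protocol stream, so any stray print() will corrupt it. Route diagnostics to stderr instead; a minimal sketch for the Python server:

import logging
import sys

# stdout is reserved for MCP protocol messages over the stdio transport,
# so send all log output to stderr.
logging.basicConfig(
    stream=sys.stderr,
    level=logging.DEBUG,
    format="%(asctime)s %(name)s %(levelname)s: %(message)s",
)
logger = logging.getLogger("webscraping-server")
logger.debug("server starting")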
Performance Issues
To optimize MCP server performance:
- Implement connection pooling for HTTP requests, as sketched after this list
- Use caching for frequently accessed data
- Configure appropriate timeout values
- Implement request queuing for rate limiting
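For the pooling and caching items, a minimal sketch: one shared httpx.AsyncClient reuses connections across requests, and a small in-memory TTL cache avoids re-fetching hot URLs. The cache here is an illustrative in-process dict; swap in Redis or similar for anything multi-process:

import time
import httpx

# One shared client = one connection pool for the whole server process.
_client = httpx.AsyncClient(timeout=30.0)

_cache: dict[str, tuple[float, str]] = {}   # url -> (expiry timestamp, body)
CACHE_TTL = 300.0                           # seconds

async def fetch_cached(url: str) -> str:
    """Return the page body, reusing pooled connections and a TTL cache."""
    entry = _cache.get(url)
    if entry and entry[0] > time.monotonic():
        return entry[1]
    response = await _client.get(url)
    response.raise_for_status()
    _cache[url] = (time.monotonic() + CACHE_TTL, response.text)
    return response.text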
Authentication Errors
When a scraping target or the MCP server itself requires authentication, keep credentials out of your codebase (use environment variables or a secrets manager) and refresh expired tokens automatically instead of letting requests fail.
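As an illustration, here is one way to wrap token refresh around scraping requests. The token endpoint, field names, and refresh flow below are hypothetical placeholders; substitute your provider's actual OAuth or API-key scheme:

import os
import time
import httpx

TOKEN_URL = "https://auth.example.com/token"   # hypothetical token endpoint
_token: str | None = None
_token_expiry = 0.0

async def get_token(client: httpx.AsyncClient) -> str:
    """Return a cached bearer token, refreshing it shortly before expiry."""
    global _token, _token_expiry
    if _token is None or time.monotonic() > _token_expiry - 60:
        response = await client.post(TOKEN_URL, data={
            "client_id": os.environ["CLIENT_ID"],        # supplied via your .env
            "client_secret": os.environ["CLIENT_SECRET"],
            "grant_type": "client_credentials",
        })
        response.raise_for_status()
        payload = response.json()
        _token = payload["access_token"]
        _token_expiry = time.monotonic() + payload.get("expires_in", 3600)
    return _token

async def authenticated_get(client: httpx.AsyncClient, url: str) -> httpx.Response:
    token = await get_token(client)
    return await client.get(url, headers={"Authorization": f"Bearer {token}"})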
Conclusion
Configuring an MCP server for web scraping requires careful consideration of your project's specific needs, including transport protocols, tool definitions, and environment settings. By following the configuration patterns outlined in this guide, you can build robust, scalable MCP servers that integrate seamlessly with your web scraping workflows.
Remember to regularly update your MCP server dependencies, monitor performance metrics, and adjust configuration parameters based on real-world usage patterns. A well-configured MCP server provides a powerful foundation for building AI-enhanced web scraping applications.