How do you use the ChatGPT API for data extraction?
The ChatGPT API provides a powerful way to extract structured data from unstructured text, HTML, or web content using natural language processing. This tutorial will guide you through setting up the OpenAI API, crafting effective prompts, and implementing data extraction workflows in Python and JavaScript.
Understanding ChatGPT API for Data Extraction
The ChatGPT API (officially called the OpenAI Chat Completions API) allows developers to send text content to GPT models and receive structured responses. Unlike traditional web scraping that relies on CSS selectors or XPath, ChatGPT can understand context, handle variations in formatting, and extract relevant information using natural language instructions.
Key Advantages
- Flexibility: Works with varying HTML structures without updating selectors
- Context awareness: Understands semantic meaning, not just patterns
- Multi-format output: Can return JSON, CSV, or any structured format
- Handles edge cases: Adapts to unexpected layouts or missing data
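To make the flexibility point concrete, here is a minimal comparison (the HTML snippets and class names below are invented for illustration): a CSS selector is tied to one specific markup structure, while a natural-language instruction describes the data itself and survives layout changes.

from bs4 import BeautifulSoup

# Two pages exposing the same price with different markup (hypothetical snippets)
page_a = '<div class="price-tag">$19.99</div>'
page_b = '<span data-testid="product-cost">$19.99</span>'

# Selector-based extraction: breaks as soon as the markup changes
print(BeautifulSoup(page_a, "html.parser").select_one(".price-tag"))  # finds the price
print(BeautifulSoup(page_b, "html.parser").select_one(".price-tag"))  # None - selector broke

# Prompt-based extraction: the same instruction covers both layouts
extraction_prompt = 'Return JSON with a single key "price" holding the numeric product price.'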
Prerequisites
Before starting, you'll need:
- An OpenAI API key (get one at platform.openai.com; the snippet after this list shows how to load it from an environment variable)
- Python 3.7+ or Node.js 14+
- Basic understanding of HTTP requests and JSON
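The examples below pass the API key directly for brevity, but in practice it is safer to load it from an environment variable. Here is a minimal sketch (OPENAI_API_KEY is the variable the official SDKs read by default):

import os
from openai import OpenAI

# Read the key from the environment instead of hard-coding it in source files
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable first")

client = OpenAI(api_key=api_key)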
Setting Up ChatGPT API in Python
First, install the OpenAI Python library:
pip install openai requests beautifulsoup4
Here's a basic setup for data extraction:
from openai import OpenAI
import requests
from bs4 import BeautifulSoup
import json

# Configure the client with your API key
client = OpenAI(api_key="your-api-key-here")

def extract_data_with_gpt(html_content, extraction_prompt):
    """Extract structured data from HTML using the ChatGPT API."""
    # Create the system message
    system_message = """You are a data extraction expert.
Extract information from the provided HTML and return it as valid JSON.
Only return the JSON object, no additional text."""

    # Create the user message with the HTML content
    user_message = f"{extraction_prompt}\n\nHTML Content:\n{html_content}"

    # Call the Chat Completions API
    response = client.chat.completions.create(
        model="gpt-4-turbo-preview",  # or gpt-3.5-turbo for lower cost
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message}
        ],
        temperature=0.1,  # Low temperature for consistent extraction
        response_format={"type": "json_object"}  # Ensures JSON output
    )

    # Parse and return the extracted data
    extracted_data = json.loads(response.choices[0].message.content)
    return extracted_data
# Example usage: Extract product information
url = "https://example-ecommerce.com/product/12345"
response = requests.get(url)
html = response.text
# Clean HTML with BeautifulSoup (optional but recommended)
soup = BeautifulSoup(html, 'html.parser')
# Remove scripts and styles
for script in soup(["script", "style"]):
    script.decompose()
clean_html = soup.get_text(separator="\n", strip=True)
# Define extraction prompt
prompt = """
Extract the following product information:
- Product name
- Price (as a number)
- Description
- Available sizes
- In stock status (boolean)
- Rating (if available)
"""
# Extract data
product_data = extract_data_with_gpt(clean_html[:4000], prompt)
print(json.dumps(product_data, indent=2))
Setting Up ChatGPT API in JavaScript
Install the required packages:
npm install openai axios cheerio
Here's the JavaScript implementation:
const OpenAI = require('openai');
const axios = require('axios');
const cheerio = require('cheerio');

// Initialize OpenAI client
const openai = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY
});

async function extractDataWithGPT(htmlContent, extractionPrompt) {
  const systemMessage = `You are a data extraction expert.
Extract information from the provided HTML and return it as valid JSON.
Only return the JSON object, no additional text.`;

  const userMessage = `${extractionPrompt}\n\nHTML Content:\n${htmlContent}`;

  try {
    const response = await openai.chat.completions.create({
      model: "gpt-4-turbo-preview",
      messages: [
        { role: "system", content: systemMessage },
        { role: "user", content: userMessage }
      ],
      temperature: 0.1,
      response_format: { type: "json_object" }
    });

    const extractedData = JSON.parse(response.choices[0].message.content);
    return extractedData;
  } catch (error) {
    console.error('Error extracting data:', error);
    throw error;
  }
}
// Example: Extract article metadata
async function scrapeArticle(url) {
  // Fetch the webpage
  const response = await axios.get(url);
  const html = response.data;

  // Clean HTML with cheerio
  const $ = cheerio.load(html);
  $('script, style, nav, footer').remove();
  const cleanText = $('body').text().replace(/\s+/g, ' ').trim();

  // Define extraction prompt
  const prompt = `
    Extract the following information from this article:
    - Title
    - Author name
    - Publication date
    - Main topic/category
    - Reading time estimate
    - Summary (2-3 sentences)
  `;

  // Extract data
  const articleData = await extractDataWithGPT(
    cleanText.substring(0, 4000),
    prompt
  );

  return articleData;
}
// Usage
scrapeArticle('https://example.com/article')
  .then(data => console.log(JSON.stringify(data, null, 2)))
  .catch(err => console.error(err));
Advanced Techniques
1. Function Calling for Structured Output
OpenAI's function calling feature (exposed through the tools parameter in current SDK versions) constrains the model to return arguments that match a JSON schema you define:
def extract_with_function_calling(content):
    response = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        messages=[
            {"role": "user", "content": f"Extract product data: {content}"}
        ],
        tools=[
            {
                "type": "function",
                "function": {
                    "name": "save_product_data",
                    "description": "Save extracted product information",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "name": {"type": "string"},
                            "price": {"type": "number"},
                            "currency": {"type": "string"},
                            "in_stock": {"type": "boolean"},
                            "rating": {"type": "number"},
                            "reviews_count": {"type": "integer"}
                        },
                        "required": ["name", "price"]
                    }
                }
            }
        ],
        # Force the model to call this function instead of replying in prose
        tool_choice={"type": "function", "function": {"name": "save_product_data"}}
    )

    # The arguments arrive as a JSON string on the tool call
    function_args = json.loads(
        response.choices[0].message.tool_calls[0].function.arguments
    )
    return function_args
2. Batch Processing for Multiple Pages
When extracting data from many pages, process URLs in parallel with a small worker pool and rate limiting, so you keep throughput reasonable without exceeding API limits:
import time
from concurrent.futures import ThreadPoolExecutor

def batch_extract(urls, max_workers=3):
    """Extract data from multiple URLs with rate limiting"""

    def process_url(url):
        try:
            html = requests.get(url).text
            soup = BeautifulSoup(html, 'html.parser')
            clean_text = soup.get_text(separator="\n", strip=True)
            data = extract_data_with_gpt(
                clean_text[:4000],
                "Extract product name, price, and availability"
            )
            # Rate limiting
            time.sleep(1)
            return {"url": url, "data": data, "success": True}
        except Exception as e:
            return {"url": url, "error": str(e), "success": False}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(process_url, urls))
    return results
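A quick usage sketch (the URLs are placeholders): the success flag attached by process_url makes it easy to separate usable results from failed pages.

urls = [
    "https://example.com/product1",
    "https://example.com/product2",
    "https://example.com/product3",
]

results = batch_extract(urls, max_workers=3)
succeeded = [r for r in results if r["success"]]
failed = [r for r in results if not r["success"]]
print(f"Extracted {len(succeeded)} pages, {len(failed)} failures")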
3. Cost Optimization Strategies
The ChatGPT API charges per token, so the biggest lever on cost is sending less text. The helper below strips pages down to their main content; a token-counting sketch follows it.
def optimize_html_for_extraction(html, max_chars=4000):
    """Reduce HTML content while preserving important information"""
    soup = BeautifulSoup(html, 'html.parser')

    # Remove unnecessary elements
    for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
        tag.decompose()

    # Focus on main content
    main_content = soup.find('main') or soup.find('article') or soup.body
    if main_content:
        text = main_content.get_text(separator="\n", strip=True)
    else:
        text = soup.get_text(separator="\n", strip=True)

    # Truncate to max length
    return text[:max_chars]
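To estimate what a cleaned page will actually cost before you send it, you can count tokens locally with OpenAI's tiktoken library (pip install tiktoken). A rough sketch; the price per 1K input tokens is a placeholder you should replace with the current rate for your model:

import tiktoken

def estimate_input_cost(text, price_per_1k_tokens=0.01):
    """Rough input-side cost estimate using the cl100k_base encoding."""
    encoding = tiktoken.get_encoding("cl100k_base")
    token_count = len(encoding.encode(text))
    return token_count, token_count / 1000 * price_per_1k_tokens

# Reusing the html fetched earlier in the tutorial
clean_text = optimize_html_for_extraction(html)
tokens, cost = estimate_input_cost(clean_text)
print(f"~{tokens} input tokens, roughly ${cost:.4f} before output tokens")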
Combining with Traditional Web Scraping
For optimal results, combine ChatGPT API with traditional scraping methods. When working with dynamic content or JavaScript-heavy pages, first use a headless browser to render the page, then apply ChatGPT for extraction:
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def scrape_dynamic_page_with_gpt(url):
    # Set up headless browser
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get(url)

        # Wait for content to load
        time.sleep(3)

        # Get rendered HTML
        html = driver.page_source

        # Clean the page down to its main content (helper defined above)
        clean_text = optimize_html_for_extraction(html)

        # Use ChatGPT for extraction
        data = extract_data_with_gpt(
            clean_text,
            "Extract all product listings with name, price, and image URL"
        )
        return data
    finally:
        driver.quit()
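The fixed time.sleep(3) is the simplest approach, but if you know a selector that only appears once the content has rendered, an explicit wait is faster and more reliable. A sketch, where the .product-card selector is a stand-in for whatever your target page uses:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Replace the fixed sleep with a wait of up to 10 seconds
# for the first product card to appear in the DOM
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, ".product-card"))
)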
Error Handling and Validation
Implement robust error handling for production use:
async function safeExtractData(html, prompt, retries = 3) {
  for (let attempt = 1; attempt <= retries; attempt++) {
    try {
      const data = await extractDataWithGPT(html, prompt);

      // Validate extracted data
      if (!data || Object.keys(data).length === 0) {
        throw new Error('Empty data returned');
      }

      return data;
    } catch (error) {
      console.error(`Attempt ${attempt} failed:`, error.message);

      if (attempt === retries) {
        throw new Error(`Failed after ${retries} attempts: ${error.message}`);
      }

      // Exponential backoff: 1s, 2s, 4s, ...
      await new Promise(resolve => setTimeout(resolve, 1000 * 2 ** (attempt - 1)));
    }
  }
}
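The same retry-and-validate pattern in Python, wrapped around the extract_data_with_gpt function from earlier (a sketch; tune the retry count and backoff to your rate limits):

import time

def safe_extract_data(html, prompt, retries=3):
    """Retry extraction with exponential backoff and a basic emptiness check."""
    for attempt in range(1, retries + 1):
        try:
            data = extract_data_with_gpt(html, prompt)
            if not data:
                raise ValueError("Empty data returned")
            return data
        except Exception as exc:
            print(f"Attempt {attempt} failed: {exc}")
            if attempt == retries:
                raise
            time.sleep(2 ** (attempt - 1))  # exponential backoff: 1s, 2s, 4s...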
Best Practices
- Use specific prompts: Clearly define the exact fields you want to extract
- Set low temperature: Use 0.1-0.3 for consistent, near-deterministic extraction
- Validate output: Always verify the extracted data structure matches expectations
- Handle rate limits: Implement proper delays between API calls
- Monitor costs: Track token usage and set budget alerts
- Clean HTML first: Remove scripts, styles, and irrelevant content before sending to API
- Chunk large content: Split content exceeding token limits into smaller sections (see the sketch after this list)
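For the last point, a minimal chunking sketch (the chunk size and overlap are assumptions; pick values that leave room for your prompt and the model's response within the context window):

def chunk_text(text, max_chars=4000, overlap=200):
    """Split long text into overlapping chunks so records aren't cut mid-field."""
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start + max_chars])
        start += max_chars - overlap
    return chunks

# Extract from each chunk separately, then merge the partial results
# partials = [extract_data_with_gpt(chunk, prompt) for chunk in chunk_text(clean_html)]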
Real-World Example: E-commerce Product Scraper
Here's a complete example that scrapes product listings:
from openai import OpenAI
import requests
from bs4 import BeautifulSoup
import json
import time

class GPTProductScraper:
    def __init__(self, api_key):
        self.client = OpenAI(api_key=api_key)
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def scrape_product(self, url):
        # Fetch page
        response = self.session.get(url)
        response.raise_for_status()

        # Clean HTML
        soup = BeautifulSoup(response.text, 'html.parser')
        for tag in soup(['script', 'style', 'nav', 'footer']):
            tag.decompose()
        clean_html = soup.get_text(separator="\n", strip=True)[:4000]

        # Extract with GPT
        prompt = """
        Extract the following product information and return as JSON:
        {
            "name": "product name",
            "brand": "brand name",
            "price": numeric value,
            "currency": "USD/EUR/etc",
            "description": "brief description",
            "features": ["feature1", "feature2"],
            "in_stock": true/false,
            "rating": numeric value or null,
            "review_count": number or null
        }
        """

        response = self.client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "Extract product data as JSON."},
                {"role": "user", "content": f"{prompt}\n\n{clean_html}"}
            ],
            temperature=0.1,
            response_format={"type": "json_object"}
        )

        return json.loads(response.choices[0].message.content)

    def scrape_multiple(self, urls, delay=2):
        results = []
        for url in urls:
            try:
                data = self.scrape_product(url)
                results.append({"url": url, "data": data})
                time.sleep(delay)
            except Exception as e:
                results.append({"url": url, "error": str(e)})
        return results
# Usage
scraper = GPTProductScraper("your-api-key")
products = scraper.scrape_multiple([
    "https://example.com/product1",
    "https://example.com/product2"
])
print(json.dumps(products, indent=2))
Conclusion
The ChatGPT API offers a flexible, powerful approach to data extraction that complements traditional web scraping methods. While it comes with API costs, the ability to handle varying page structures and extract semantic information makes it invaluable for complex scraping tasks. By following the best practices and examples in this tutorial, you can build robust data extraction pipelines that combine the strengths of both AI-powered and traditional scraping techniques.
For more advanced scraping scenarios, consider exploring how to handle browser sessions when dealing with authenticated content or monitoring network requests to capture dynamic data before applying ChatGPT for extraction.