Build high-quality datasets for machine learning. Extract structured text, labeled data, and content from the web to train your AI models.
Machine learning models are only as good as their training data. Building high-quality datasets requires collecting large volumes of structured data from diverse sources.
Public datasets are often insufficient for specialized domains. You need custom data collection tailored to your model's specific requirements.
Build datasets for any ML application
Clean text extraction for NLP, sentiment analysis, and language models.
Extract content with categories, tags, and classification labels.
Extract tables, lists, and other structured information in JSON format.
Extract question-answer pairs from FAQs and documentation.
Extract training data from web sources
# GET the /text endpoint, passing the API key and target URL as URL-encoded query parameters.
API_KEY="YOUR_API_KEY"
TARGET_URL="https://example.com/article/tech-trends"
curl --get "https://api.webscraping.ai/text" \
  --data-urlencode "api_key=${API_KEY}" \
  --data-urlencode "url=${TARGET_URL}"
# pip install webscraping_ai
# https://pypi.org/project/webscraping-ai/
from webscraping_ai import Client
# Create the SDK client, request the article URL through client.text, and print the result.
article_url = "https://example.com/article/tech-trends"
api = Client(api_key="YOUR_API_KEY")
print(api.text(article_url))
// npm install webscraping-ai
// https://www.npmjs.com/package/webscraping-ai
import { WebScrapingAI } from 'webscraping-ai';
// Create the SDK client, request the article URL through client.text, and log the result.
// The top-level `await` below requires this snippet to run as an ES module.
const articleUrl = 'https://example.com/article/tech-trends';
const api = new WebScrapingAI({ apiKey: 'YOUR_API_KEY' });
const body = await api.text({ url: articleUrl });
console.log(body);
<?php
// composer require webscraping-ai/webscraping-ai-php
// https://packagist.org/packages/webscraping-ai/webscraping-ai-php
require 'vendor/autoload.php';
use WebScrapingAI\Client;
// Create the client, request the article URL via text(), and echo the result.
$articleUrl = 'https://example.com/article/tech-trends';
$api = new Client('YOUR_API_KEY');
$body = $api->text($articleUrl);
if (is_string($body)) {
    echo $body;
} else {
    // Non-string results are rendered with print_r so they can still be echoed.
    echo print_r($body, true);
}
# gem install webscraping_ai
# https://rubygems.org/gems/webscraping_ai
require 'webscraping_ai'
# Build the client, request the article URL via #text, and print the inspected result.
article_url = 'https://example.com/article/tech-trends'
api = WebScrapingAI::Client.new(api_key: 'YOUR_API_KEY')
puts api.text(article_url).inspect
// go get github.com/webscraping-ai/webscraping-ai-go/v4
// https://pkg.go.dev/github.com/webscraping-ai/webscraping-ai-go/v4
package main
import (
"context"
"fmt"
webscrapingai "github.com/webscraping-ai/webscraping-ai-go/v4"
)
// main builds a client, calls client.Text for the example article URL, and
// prints the returned value.
//
// Both the client constructor and the Text call return an error; each is
// checked so a failure stops the program with the error instead of silently
// printing a zero value.
func main() {
	client, err := webscrapingai.NewClient(&webscrapingai.Config{APIKey: "YOUR_API_KEY"})
	if err != nil {
		panic(err)
	}

	text, err := client.Text(context.Background(), &webscrapingai.TextOptions{
		URL: "https://example.com/article/tech-trends",
	})
	if err != nil {
		panic(err)
	}

	fmt.Println(text)
}
// Maven: ai.webscraping:webscraping-ai:4.0.0
// https://central.sonatype.com/artifact/ai.webscraping/webscraping-ai
import ai.webscraping.Client;
import ai.webscraping.Config;
import ai.webscraping.option.TextOptions;
// Configure the client, build the text request for the article URL, and print the response.
Config config = Config.builder().apiKey("YOUR_API_KEY").build();
Client client = new Client(config);
TextOptions options = TextOptions.builder()
        .url("https://example.com/article/tech-trends")
        .build();
String text = client.text(options);
System.out.println(text);
// dotnet add package WebScrapingAI
// https://www.nuget.org/packages/WebScrapingAI
using WebScrapingAI;
// Configure the client, send a text request for the article URL, and write the response.
var options = new WebScrapingAIClientOptions { ApiKey = "YOUR_API_KEY" };
var client = new WebScrapingAIClient(options);
var request = new TextRequest { Url = "https://example.com/article/tech-trends" };
var text = await client.TextAsync(request);
Console.WriteLine(text);
# GET the /ai/fields endpoint; each fields[NAME] parameter carries a description of the value wanted.
API_KEY="YOUR_API_KEY"
PRODUCT_URL="https://ecommerce-site.com/product/wireless-mouse"
curl --get "https://api.webscraping.ai/ai/fields" \
  --data-urlencode "api_key=${API_KEY}" \
  --data-urlencode "url=${PRODUCT_URL}" \
  --data-urlencode "fields[title]=Product title" \
  --data-urlencode "fields[description]=Product description text" \
  --data-urlencode "fields[category]=Product category" \
  --data-urlencode "fields[subcategory]=Product subcategory" \
  --data-urlencode "fields[features]=Array of product features" \
  --data-urlencode "fields[sentiment_indicators]=Positive and negative words used in description"
# Response:
# {
# "title": "Wireless Gaming Mouse Pro",
# "description": "High-precision wireless gaming mouse with...",
# "category": "Electronics",
# "subcategory": "Computer Accessories",
# "features": ["16000 DPI", "RGB lighting", "Ergonomic design"],
# "sentiment_indicators": {
# "positive": ["professional", "precision", "comfortable"],
# "negative": []
# }
# }
# pip install webscraping_ai
# https://pypi.org/project/webscraping-ai/
from webscraping_ai import Client
# Create the client, then request the listed fields for the product page and print the result.
api = Client(api_key="YOUR_API_KEY")
product_url = "https://ecommerce-site.com/product/wireless-mouse"
# Each key is an output field name; each value is the description string sent for it.
field_prompts = {
    "title": "Product title",
    "description": "Product description text",
    "category": "Product category",
    "subcategory": "Product subcategory",
    "features": "Array of product features",
    "sentiment_indicators": "Positive and negative words used in description",
}
result = api.fields(product_url, fields=field_prompts)
print(result)
# Response:
# {
# "title": "Wireless Gaming Mouse Pro",
# "description": "High-precision wireless gaming mouse with...",
# "category": "Electronics",
# "subcategory": "Computer Accessories",
# "features": ["16000 DPI", "RGB lighting", "Ergonomic design"],
# "sentiment_indicators": {
# "positive": ["professional", "precision", "comfortable"],
# "negative": []
# }
# }
// npm install webscraping-ai
// https://www.npmjs.com/package/webscraping-ai
import { WebScrapingAI } from 'webscraping-ai';
// Create the client, then request the listed fields for the product page and log the result.
// The top-level `await` below requires this snippet to run as an ES module.
const api = new WebScrapingAI({ apiKey: 'YOUR_API_KEY' });
const productUrl = 'https://ecommerce-site.com/product/wireless-mouse';
// Each key is an output field name; each value is the description string sent for it.
const fieldPrompts = {
  title: 'Product title',
  description: 'Product description text',
  category: 'Product category',
  subcategory: 'Product subcategory',
  features: 'Array of product features',
  sentiment_indicators: 'Positive and negative words used in description',
};
const result = await api.fields({ url: productUrl, fields: fieldPrompts });
console.log(result);
// Response:
// {
// "title": "Wireless Gaming Mouse Pro",
// "description": "High-precision wireless gaming mouse with...",
// "category": "Electronics",
// "subcategory": "Computer Accessories",
// "features": ["16000 DPI", "RGB lighting", "Ergonomic design"],
// "sentiment_indicators": {
// "positive": ["professional", "precision", "comfortable"],
// "negative": []
// }
// }
<?php
// composer require webscraping-ai/webscraping-ai-php
// https://packagist.org/packages/webscraping-ai/webscraping-ai-php
require 'vendor/autoload.php';
use WebScrapingAI\Client;
// Create the client, then request the listed fields for the product page and dump the result.
$api = new Client('YOUR_API_KEY');
$productUrl = 'https://ecommerce-site.com/product/wireless-mouse';
// Each key is an output field name; each value is the description string sent for it.
$fieldPrompts = [
    'title' => 'Product title',
    'description' => 'Product description text',
    'category' => 'Product category',
    'subcategory' => 'Product subcategory',
    'features' => 'Array of product features',
    'sentiment_indicators' => 'Positive and negative words used in description',
];
$result = $api->fields($productUrl, $fieldPrompts);
print_r($result);
// Response:
// {
// "title": "Wireless Gaming Mouse Pro",
// "description": "High-precision wireless gaming mouse with...",
// "category": "Electronics",
// "subcategory": "Computer Accessories",
// "features": ["16000 DPI", "RGB lighting", "Ergonomic design"],
// "sentiment_indicators": {
// "positive": ["professional", "precision", "comfortable"],
// "negative": []
// }
// }
# gem install webscraping_ai
# https://rubygems.org/gems/webscraping_ai
require 'webscraping_ai'
# Create the client, then request the listed fields for the product page and print the inspected result.
api = WebScrapingAI::Client.new(api_key: 'YOUR_API_KEY')
product_url = 'https://ecommerce-site.com/product/wireless-mouse'
# Each key is an output field name; each value is the description string sent for it.
field_prompts = {
  title: 'Product title',
  description: 'Product description text',
  category: 'Product category',
  subcategory: 'Product subcategory',
  features: 'Array of product features',
  sentiment_indicators: 'Positive and negative words used in description'
}
result = api.fields(product_url, fields: field_prompts)
puts result.inspect
# Response:
# {
# "title": "Wireless Gaming Mouse Pro",
# "description": "High-precision wireless gaming mouse with...",
# "category": "Electronics",
# "subcategory": "Computer Accessories",
# "features": ["16000 DPI", "RGB lighting", "Ergonomic design"],
# "sentiment_indicators": {
# "positive": ["professional", "precision", "comfortable"],
# "negative": []
# }
# }
// go get github.com/webscraping-ai/webscraping-ai-go/v4
// https://pkg.go.dev/github.com/webscraping-ai/webscraping-ai-go/v4
package main
import (
"context"
"fmt"
webscrapingai "github.com/webscraping-ai/webscraping-ai-go/v4"
)
// main builds a client, calls client.Fields for the example product URL with a
// map of field names to description strings, and prints result.Result.
//
// Both the client constructor and the Fields call return an error; each is
// checked before continuing. This matters for the final line in particular:
// result.Result is read only after the call is known to have succeeded
// (NOTE(review): result is presumably a pointer that may be nil on error —
// confirm against the SDK).
func main() {
	client, err := webscrapingai.NewClient(&webscrapingai.Config{APIKey: "YOUR_API_KEY"})
	if err != nil {
		panic(err)
	}

	result, err := client.Fields(context.Background(), &webscrapingai.FieldsOptions{
		URL: "https://ecommerce-site.com/product/wireless-mouse",
		Fields: map[string]string{
			"title":                "Product title",
			"description":          "Product description text",
			"category":             "Product category",
			"subcategory":          "Product subcategory",
			"features":             "Array of product features",
			"sentiment_indicators": "Positive and negative words used in description",
		},
	})
	if err != nil {
		panic(err)
	}

	fmt.Println(result.Result)
}
// Response:
// {
// "title": "Wireless Gaming Mouse Pro",
// "description": "High-precision wireless gaming mouse with...",
// "category": "Electronics",
// "subcategory": "Computer Accessories",
// "features": ["16000 DPI", "RGB lighting", "Ergonomic design"],
// "sentiment_indicators": {
// "positive": ["professional", "precision", "comfortable"],
// "negative": []
// }
// }
// Maven: ai.webscraping:webscraping-ai:4.0.0
// https://central.sonatype.com/artifact/ai.webscraping/webscraping-ai
import ai.webscraping.Client;
import ai.webscraping.Config;
import ai.webscraping.option.FieldsOptions;
import ai.webscraping.result.FieldsResult;
// Configure the client, build the fields request for the product page, and print the result payload.
Config config = Config.builder().apiKey("YOUR_API_KEY").build();
Client client = new Client(config);
// Each addField call pairs an output field name with the description string sent for it.
FieldsOptions options = FieldsOptions.builder()
        .url("https://ecommerce-site.com/product/wireless-mouse")
        .addField("title", "Product title")
        .addField("description", "Product description text")
        .addField("category", "Product category")
        .addField("subcategory", "Product subcategory")
        .addField("features", "Array of product features")
        .addField("sentiment_indicators", "Positive and negative words used in description")
        .build();
FieldsResult result = client.fields(options);
System.out.println(result.getResult());
// Response:
// {
// "title": "Wireless Gaming Mouse Pro",
// "description": "High-precision wireless gaming mouse with...",
// "category": "Electronics",
// "subcategory": "Computer Accessories",
// "features": ["16000 DPI", "RGB lighting", "Ergonomic design"],
// "sentiment_indicators": {
// "positive": ["professional", "precision", "comfortable"],
// "negative": []
// }
// }
// dotnet add package WebScrapingAI
// https://www.nuget.org/packages/WebScrapingAI
using WebScrapingAI;
// Configure the client, send a fields request for the product page, and write the result payload.
var options = new WebScrapingAIClientOptions { ApiKey = "YOUR_API_KEY" };
var client = new WebScrapingAIClient(options);
// Each key is an output field name; each value is the description string sent for it.
var fieldPrompts = new Dictionary<string, string> {
    ["title"] = "Product title",
    ["description"] = "Product description text",
    ["category"] = "Product category",
    ["subcategory"] = "Product subcategory",
    ["features"] = "Array of product features",
    ["sentiment_indicators"] = "Positive and negative words used in description",
};
var request = new FieldsRequest {
    Url = "https://ecommerce-site.com/product/wireless-mouse",
    Fields = fieldPrompts,
};
var result = await client.FieldsAsync(request);
Console.WriteLine(result.Result);
// Response:
// {
// "title": "Wireless Gaming Mouse Pro",
// "description": "High-precision wireless gaming mouse with...",
// "category": "Electronics",
// "subcategory": "Computer Accessories",
// "features": ["16000 DPI", "RGB lighting", "Ergonomic design"],
// "sentiment_indicators": {
// "positive": ["professional", "precision", "comfortable"],
// "negative": []
// }
// }
Extract labeled text for sentiment, topic, or intent classification
Collect in-context examples of named entities for entity-recognition training
Build Q&A datasets from FAQ pages and forums
Collect content for retrieval-augmented generation
More data collection solutions
Get started with 1,000 free API credits. No credit card required.