AI & MACHINE LEARNING

LLM Fine-tuning Data

Collect domain-specific data for fine-tuning language models. Build instruction datasets, Q&A pairs, and specialized training corpora.

Fine-tuning Needs Domain Data

Off-the-shelf LLMs lack expertise in your specific domain. Fine-tuning with high-quality, domain-specific data creates models that truly understand your use case.

Building instruction datasets manually is expensive and slow. You need automated collection of examples that match your target format and domain.

WebScraping.AI Solution

  • Q&A Extraction: Extract question-answer pairs from FAQs and forums
  • Instruction Format: Generate instruction-response training examples
  • Domain Content: Collect specialized content from expert sources
  • Format Control: Structure data exactly as your model requires

Fine-tuning Data Types

Collect the data your model needs

Q&A Pairs

Extract questions and answers from FAQs, forums, and documentation.

Instructions

Generate instruction-response pairs for RLHF and SFT training.

Domain Corpus

Build specialized text corpora for continued pre-training.

Conversations

Extract multi-turn conversations for chat model training.

Code Examples

Collect LLM fine-tuning data

Extract Q&A pairs from a FAQ page
curl -G "https://api.webscraping.ai/ai/fields" \
  --data-urlencode "api_key=YOUR_API_KEY" \
  --data-urlencode "url=https://support.example.com/faq" \
  --data-urlencode "fields[qa_pairs]=Array of objects with question and answer fields for each FAQ item"
# Response:
# {
#   "qa_pairs": [
#     {
#       "question": "How do I reset my password?",
#       "answer": "Click the 'Forgot Password' link on the login page..."
#     },
#     {
#       "question": "What payment methods do you accept?",
#       "answer": "We accept Visa, Mastercard, PayPal..."
#     }
#   ]
# }
# pip install webscraping_ai
# https://pypi.org/project/webscraping-ai/
from webscraping_ai import Client

client = Client(api_key="YOUR_API_KEY")
result = client.fields(
    "https://support.example.com/faq",
    fields={
        "qa_pairs": "Array of objects with question and answer fields for each FAQ item",
    },
)
print(result)
# Response:
# {
#   "qa_pairs": [
#     {
#       "question": "How do I reset my password?",
#       "answer": "Click the 'Forgot Password' link on the login page..."
#     },
#     {
#       "question": "What payment methods do you accept?",
#       "answer": "We accept Visa, Mastercard, PayPal..."
#     }
#   ]
# }
// npm install webscraping-ai
// https://www.npmjs.com/package/webscraping-ai
import { WebScrapingAI } from 'webscraping-ai';

const client = new WebScrapingAI({ apiKey: 'YOUR_API_KEY' });
const result = await client.fields({
  url: 'https://support.example.com/faq',
  fields: {
    qa_pairs: 'Array of objects with question and answer fields for each FAQ item',
  },
});
console.log(result);
// Response:
// {
//   "qa_pairs": [
//     {
//       "question": "How do I reset my password?",
//       "answer": "Click the 'Forgot Password' link on the login page..."
//     },
//     {
//       "question": "What payment methods do you accept?",
//       "answer": "We accept Visa, Mastercard, PayPal..."
//     }
//   ]
// }
<?php
// composer require webscraping-ai/webscraping-ai-php
// https://packagist.org/packages/webscraping-ai/webscraping-ai-php
require 'vendor/autoload.php';

use WebScrapingAI\Client;

$client = new Client('YOUR_API_KEY');
$result = $client->fields('https://support.example.com/faq', [
    'qa_pairs' => 'Array of objects with question and answer fields for each FAQ item',
]);
print_r($result);
// Response:
// {
//   "qa_pairs": [
//     {
//       "question": "How do I reset my password?",
//       "answer": "Click the 'Forgot Password' link on the login page..."
//     },
//     {
//       "question": "What payment methods do you accept?",
//       "answer": "We accept Visa, Mastercard, PayPal..."
//     }
//   ]
// }
# gem install webscraping_ai
# https://rubygems.org/gems/webscraping_ai
require 'webscraping_ai'

client = WebScrapingAI::Client.new(api_key: 'YOUR_API_KEY')
result = client.fields(
  'https://support.example.com/faq',
  fields: {
    qa_pairs:  'Array of objects with question and answer fields for each FAQ item',
  }
)
puts result.inspect
# Response:
# {
#   "qa_pairs": [
#     {
#       "question": "How do I reset my password?",
#       "answer": "Click the 'Forgot Password' link on the login page..."
#     },
#     {
#       "question": "What payment methods do you accept?",
#       "answer": "We accept Visa, Mastercard, PayPal..."
#     }
#   ]
# }
// go get github.com/webscraping-ai/webscraping-ai-go/v4
// https://pkg.go.dev/github.com/webscraping-ai/webscraping-ai-go/v4
package main

import (
    "context"
    "fmt"

    webscrapingai "github.com/webscraping-ai/webscraping-ai-go/v4"
)

func main() {
    client, _ := webscrapingai.NewClient(&webscrapingai.Config{APIKey: "YOUR_API_KEY"})
    result, _ := client.Fields(context.Background(), &webscrapingai.FieldsOptions{
        URL: "https://support.example.com/faq",
        Fields: map[string]string{
            "qa_pairs": "Array of objects with question and answer fields for each FAQ item",
        },
    })
    fmt.Println(result.Result)
}
// Response:
// {
//   "qa_pairs": [
//     {
//       "question": "How do I reset my password?",
//       "answer": "Click the 'Forgot Password' link on the login page..."
//     },
//     {
//       "question": "What payment methods do you accept?",
//       "answer": "We accept Visa, Mastercard, PayPal..."
//     }
//   ]
// }
// Maven: ai.webscraping:webscraping-ai:4.0.0
// https://central.sonatype.com/artifact/ai.webscraping/webscraping-ai
import ai.webscraping.Client;
import ai.webscraping.Config;
import ai.webscraping.option.FieldsOptions;
import ai.webscraping.result.FieldsResult;

Client client = new Client(Config.builder().apiKey("YOUR_API_KEY").build());
FieldsResult result = client.fields(FieldsOptions.builder()
    .url("https://support.example.com/faq")
    .addField("qa_pairs", "Array of objects with question and answer fields for each FAQ item")
    .build());
System.out.println(result.getResult());
// Response:
// {
//   "qa_pairs": [
//     {
//       "question": "How do I reset my password?",
//       "answer": "Click the 'Forgot Password' link on the login page..."
//     },
//     {
//       "question": "What payment methods do you accept?",
//       "answer": "We accept Visa, Mastercard, PayPal..."
//     }
//   ]
// }
// dotnet add package WebScrapingAI
// https://www.nuget.org/packages/WebScrapingAI
using WebScrapingAI;

var client = new WebScrapingAIClient(new WebScrapingAIClientOptions { ApiKey = "YOUR_API_KEY" });
var result = await client.FieldsAsync(new FieldsRequest {
    Url = "https://support.example.com/faq",
    Fields = new Dictionary<string, string> {
        ["qa_pairs"] = "Array of objects with question and answer fields for each FAQ item",
    },
});
Console.WriteLine(result.Result);
// Response:
// {
//   "qa_pairs": [
//     {
//       "question": "How do I reset my password?",
//       "answer": "Click the 'Forgot Password' link on the login page..."
//     },
//     {
//       "question": "What payment methods do you accept?",
//       "answer": "We accept Visa, Mastercard, PayPal..."
//     }
//   ]
// }
Generate instruction-format training examples
# The question text embeds a JSON example containing double quotes, so that
# argument is single-quoted to pass it to curl as one literal string.
curl -G "https://api.webscraping.ai/ai/question" \
  --data-urlencode "api_key=YOUR_API_KEY" \
  --data-urlencode "url=https://docs.example.com/tutorial/getting-started" \
  --data-urlencode 'question=Convert this tutorial into 5 instruction-response pairs. Format: {"instruction": "user request", "response": "assistant answer"}. Focus on practical tasks covered.'
# pip install webscraping_ai
# https://pypi.org/project/webscraping-ai/
from webscraping_ai import Client

client = Client(api_key="YOUR_API_KEY")
# The prompt embeds a JSON example containing double quotes, so the literal is
# single-quoted to keep those quotes part of the string.
answer = client.question(
    "https://docs.example.com/tutorial/getting-started",
    question='Convert this tutorial into 5 instruction-response pairs. Format: {"instruction": "user request", "response": "assistant answer"}. Focus on practical tasks covered.',
)
print(answer)
// npm install webscraping-ai
// https://www.npmjs.com/package/webscraping-ai
import { WebScrapingAI } from 'webscraping-ai';

const client = new WebScrapingAI({ apiKey: 'YOUR_API_KEY' });
const answer = await client.question({
  url: 'https://docs.example.com/tutorial/getting-started',
  question: 'Convert this tutorial into 5 instruction-response pairs. Format: {"instruction": "user request", "response": "assistant answer"}. Focus on practical tasks covered.',
});
console.log(answer);
<?php
// composer require webscraping-ai/webscraping-ai-php
// https://packagist.org/packages/webscraping-ai/webscraping-ai-php
require 'vendor/autoload.php';

use WebScrapingAI\Client;

$client = new Client('YOUR_API_KEY');
$answer = $client->question(
    'https://docs.example.com/tutorial/getting-started',
    'Convert this tutorial into 5 instruction-response pairs. Format: {"instruction": "user request", "response": "assistant answer"}. Focus on practical tasks covered.',
);
echo $answer;
# gem install webscraping_ai
# https://rubygems.org/gems/webscraping_ai
require 'webscraping_ai'

client = WebScrapingAI::Client.new(api_key: 'YOUR_API_KEY')
answer = client.question(
  'https://docs.example.com/tutorial/getting-started',
  question: 'Convert this tutorial into 5 instruction-response pairs. Format: {"instruction": "user request", "response": "assistant answer"}. Focus on practical tasks covered.'
)
puts answer
// go get github.com/webscraping-ai/webscraping-ai-go/v4
// https://pkg.go.dev/github.com/webscraping-ai/webscraping-ai-go/v4
package main

import (
    "context"
    "fmt"

    webscrapingai "github.com/webscraping-ai/webscraping-ai-go/v4"
)

// main asks the API to turn a tutorial page into instruction-response pairs
// and prints the answer.
func main() {
    client, _ := webscrapingai.NewClient(&webscrapingai.Config{APIKey: "YOUR_API_KEY"})
    answer, _ := client.Question(context.Background(), &webscrapingai.QuestionOptions{
        URL: "https://docs.example.com/tutorial/getting-started",
        // Raw string literal: the prompt embeds a JSON example containing
        // double quotes, which would terminate an interpreted "..." literal.
        Question: `Convert this tutorial into 5 instruction-response pairs. Format: {"instruction": "user request", "response": "assistant answer"}. Focus on practical tasks covered.`,
    })
    fmt.Println(answer)
}
// Maven: ai.webscraping:webscraping-ai:4.0.0
// https://central.sonatype.com/artifact/ai.webscraping/webscraping-ai
import ai.webscraping.Client;
import ai.webscraping.Config;
import ai.webscraping.option.QuestionOptions;

Client client = new Client(Config.builder().apiKey("YOUR_API_KEY").build());
// The prompt embeds a JSON example; its double quotes are escaped so they stay
// inside the string literal.
String answer = client.question(QuestionOptions.builder()
    .url("https://docs.example.com/tutorial/getting-started")
    .question("Convert this tutorial into 5 instruction-response pairs. Format: {\"instruction\": \"user request\", \"response\": \"assistant answer\"}. Focus on practical tasks covered.")
    .build());
System.out.println(answer);
// dotnet add package WebScrapingAI
// https://www.nuget.org/packages/WebScrapingAI
using WebScrapingAI;

var client = new WebScrapingAIClient(new WebScrapingAIClientOptions { ApiKey = "YOUR_API_KEY" });
// The prompt embeds a JSON example; its double quotes are escaped so they stay
// inside the string literal.
var answer = await client.QuestionAsync(new QuestionRequest {
    Url = "https://docs.example.com/tutorial/getting-started",
    Question = "Convert this tutorial into 5 instruction-response pairs. Format: {\"instruction\": \"user request\", \"response\": \"assistant answer\"}. Focus on practical tasks covered.",
});
Console.WriteLine(answer);
Extract domain-specific terminology
curl -G "https://api.webscraping.ai/ai/fields" \
  --data-urlencode "api_key=YOUR_API_KEY" \
  --data-urlencode "url=https://medical-reference.com/glossary" \
  --data-urlencode "fields[terms]=Array of {term, definition, usage_example} objects for each medical term"
# pip install webscraping_ai
# https://pypi.org/project/webscraping-ai/
from webscraping_ai import Client

client = Client(api_key="YOUR_API_KEY")
result = client.fields(
    "https://medical-reference.com/glossary",
    fields={
        "terms": "Array of {term, definition, usage_example} objects for each medical term",
    },
)
print(result)
// npm install webscraping-ai
// https://www.npmjs.com/package/webscraping-ai
import { WebScrapingAI } from 'webscraping-ai';

const client = new WebScrapingAI({ apiKey: 'YOUR_API_KEY' });
const result = await client.fields({
  url: 'https://medical-reference.com/glossary',
  fields: {
    terms: 'Array of {term, definition, usage_example} objects for each medical term',
  },
});
console.log(result);
<?php
// composer require webscraping-ai/webscraping-ai-php
// https://packagist.org/packages/webscraping-ai/webscraping-ai-php
require 'vendor/autoload.php';

use WebScrapingAI\Client;

$client = new Client('YOUR_API_KEY');
$result = $client->fields('https://medical-reference.com/glossary', [
    'terms' => 'Array of {term, definition, usage_example} objects for each medical term',
]);
print_r($result);
# gem install webscraping_ai
# https://rubygems.org/gems/webscraping_ai
require 'webscraping_ai'

client = WebScrapingAI::Client.new(api_key: 'YOUR_API_KEY')
result = client.fields(
  'https://medical-reference.com/glossary',
  fields: {
    terms:  'Array of {term, definition, usage_example} objects for each medical term',
  }
)
puts result.inspect
// go get github.com/webscraping-ai/webscraping-ai-go/v4
// https://pkg.go.dev/github.com/webscraping-ai/webscraping-ai-go/v4
package main

import (
    "context"
    "fmt"

    webscrapingai "github.com/webscraping-ai/webscraping-ai-go/v4"
)

func main() {
    client, _ := webscrapingai.NewClient(&webscrapingai.Config{APIKey: "YOUR_API_KEY"})
    result, _ := client.Fields(context.Background(), &webscrapingai.FieldsOptions{
        URL: "https://medical-reference.com/glossary",
        Fields: map[string]string{
            "terms": "Array of {term, definition, usage_example} objects for each medical term",
        },
    })
    fmt.Println(result.Result)
}
// Maven: ai.webscraping:webscraping-ai:4.0.0
// https://central.sonatype.com/artifact/ai.webscraping/webscraping-ai
import ai.webscraping.Client;
import ai.webscraping.Config;
import ai.webscraping.option.FieldsOptions;
import ai.webscraping.result.FieldsResult;

Client client = new Client(Config.builder().apiKey("YOUR_API_KEY").build());
FieldsResult result = client.fields(FieldsOptions.builder()
    .url("https://medical-reference.com/glossary")
    .addField("terms", "Array of {term, definition, usage_example} objects for each medical term")
    .build());
System.out.println(result.getResult());
// dotnet add package WebScrapingAI
// https://www.nuget.org/packages/WebScrapingAI
using WebScrapingAI;

var client = new WebScrapingAIClient(new WebScrapingAIClientOptions { ApiKey = "YOUR_API_KEY" });
var result = await client.FieldsAsync(new FieldsRequest {
    Url = "https://medical-reference.com/glossary",
    Fields = new Dictionary<string, string> {
        ["terms"] = "Array of {term, definition, usage_example} objects for each medical term",
    },
});
Console.WriteLine(result.Result);

Why Use WebScraping.AI

Format Flexibility: Output data in any format your training pipeline needs.
AI Transformation: Convert content into instruction format automatically.
Quality Control: Extract only relevant, high-quality examples.
Domain Expertise: Collect from authoritative sources in your field.
Scale: Build datasets with thousands of training examples.

Fine-tuning Applications

Domain-Specific Chatbots

Train models that understand your industry terminology

Code Assistants

Fine-tune on documentation and code examples

Customer Support

Train on support conversations and FAQ data

Content Generation

Fine-tune for your brand voice and style

Related Use Cases

More AI & ML solutions

Start Collecting Fine-tuning Data

Get started with 1,000 free API credits. No credit card required.

Icon