Build comprehensive knowledge bases for retrieval-augmented generation. Power your AI chatbots and assistants with structured web content.
Retrieval-augmented generation systems are only as good as their knowledge base. Building a comprehensive, up-to-date corpus requires efficient web content extraction.
Static documents become outdated quickly. You need automated collection of fresh content from documentation sites, knowledge bases, and authoritative sources.
Everything you need for RAG systems
Extract main content without navigation, ads, or boilerplate.
Preserve document structure with headings and sections.
Extract specific information using natural language queries.
Maintain source URLs for citation and verification.
Build your RAG knowledge base
# Ask /ai/fields for six named fields from one documentation page.
# --get (long form of -G) sends every url-encoded pair as a query-string
# parameter on a GET request instead of as a POST body.
curl --get "https://api.webscraping.ai/ai/fields" \
  --data-urlencode "api_key=YOUR_API_KEY" \
  --data-urlencode "url=https://docs.example.com/api/authentication" \
  --data-urlencode "fields[title]=Page title" \
  --data-urlencode "fields[main_content]=Main content text without navigation" \
  --data-urlencode "fields[sections]=Array of section headings and their content" \
  --data-urlencode "fields[code_examples]=Any code snippets on the page" \
  --data-urlencode "fields[key_concepts]=Key concepts or terms defined" \
  --data-urlencode "fields[related_topics]=Links to related documentation pages"
# Response:
# {
# "title": "API Authentication Guide",
# "main_content": "This guide covers authentication methods...",
# "sections": [
# {"heading": "API Keys", "content": "API keys are..."},
# {"heading": "OAuth 2.0", "content": "For OAuth flow..."}
# ],
# "code_examples": ["curl -H 'Authorization: Bearer...'"],
# "key_concepts": ["API key", "Bearer token", "OAuth scope"],
# "related_topics": ["/docs/rate-limits", "/docs/errors"]
# }
# pip install webscraping_ai
# https://pypi.org/project/webscraping-ai/
from webscraping_ai import Client
# Client configured with the placeholder key; substitute a real key before running.
api = Client(api_key="YOUR_API_KEY")

# Output field name -> plain-language description of what to pull from the page.
field_prompts = {
    "title": "Page title",
    "main_content": "Main content text without navigation",
    "sections": "Array of section headings and their content",
    "code_examples": "Any code snippets on the page",
    "key_concepts": "Key concepts or terms defined",
    "related_topics": "Links to related documentation pages",
}

extracted = api.fields(
    "https://docs.example.com/api/authentication",
    fields=field_prompts,
)
print(extracted)
# Response:
# {
# "title": "API Authentication Guide",
# "main_content": "This guide covers authentication methods...",
# "sections": [
# {"heading": "API Keys", "content": "API keys are..."},
# {"heading": "OAuth 2.0", "content": "For OAuth flow..."}
# ],
# "code_examples": ["curl -H 'Authorization: Bearer...'"],
# "key_concepts": ["API key", "Bearer token", "OAuth scope"],
# "related_topics": ["/docs/rate-limits", "/docs/errors"]
# }
// npm install webscraping-ai
// https://www.npmjs.com/package/webscraping-ai
import { WebScrapingAI } from 'webscraping-ai';
// Client configured with the placeholder key; substitute a real key before running.
const scraper = new WebScrapingAI({ apiKey: 'YOUR_API_KEY' });

// Output field name -> plain-language description of what to extract.
const fields = {
  title: 'Page title',
  main_content: 'Main content text without navigation',
  sections: 'Array of section headings and their content',
  code_examples: 'Any code snippets on the page',
  key_concepts: 'Key concepts or terms defined',
  related_topics: 'Links to related documentation pages',
};
const url = 'https://docs.example.com/api/authentication';

const extracted = await scraper.fields({ url, fields });
console.log(extracted);
// Response:
// {
// "title": "API Authentication Guide",
// "main_content": "This guide covers authentication methods...",
// "sections": [
// {"heading": "API Keys", "content": "API keys are..."},
// {"heading": "OAuth 2.0", "content": "For OAuth flow..."}
// ],
// "code_examples": ["curl -H 'Authorization: Bearer...'"],
// "key_concepts": ["API key", "Bearer token", "OAuth scope"],
// "related_topics": ["/docs/rate-limits", "/docs/errors"]
// }
<?php
// composer require webscraping-ai/webscraping-ai-php
// https://packagist.org/packages/webscraping-ai/webscraping-ai-php
require 'vendor/autoload.php';
use WebScrapingAI\Client;
// Client configured with the placeholder key; substitute a real key before running.
$api = new Client('YOUR_API_KEY');

// Output field name => plain-language description of what to extract.
$fieldPrompts = [
    'title' => 'Page title',
    'main_content' => 'Main content text without navigation',
    'sections' => 'Array of section headings and their content',
    'code_examples' => 'Any code snippets on the page',
    'key_concepts' => 'Key concepts or terms defined',
    'related_topics' => 'Links to related documentation pages',
];

$extracted = $api->fields('https://docs.example.com/api/authentication', $fieldPrompts);
print_r($extracted);
// Response:
// {
// "title": "API Authentication Guide",
// "main_content": "This guide covers authentication methods...",
// "sections": [
// {"heading": "API Keys", "content": "API keys are..."},
// {"heading": "OAuth 2.0", "content": "For OAuth flow..."}
// ],
// "code_examples": ["curl -H 'Authorization: Bearer...'"],
// "key_concepts": ["API key", "Bearer token", "OAuth scope"],
// "related_topics": ["/docs/rate-limits", "/docs/errors"]
// }
# gem install webscraping_ai
# https://rubygems.org/gems/webscraping_ai
require 'webscraping_ai'
# Client configured with the placeholder key; substitute a real key before running.
api = WebScrapingAI::Client.new(api_key: 'YOUR_API_KEY')

# Output field name => plain-language description of what to extract.
field_prompts = {
  title: 'Page title',
  main_content: 'Main content text without navigation',
  sections: 'Array of section headings and their content',
  code_examples: 'Any code snippets on the page',
  key_concepts: 'Key concepts or terms defined',
  related_topics: 'Links to related documentation pages',
}

extracted = api.fields('https://docs.example.com/api/authentication', fields: field_prompts)
# Kernel#p writes the value's #inspect form followed by a newline.
p extracted
# Response:
# {
# "title": "API Authentication Guide",
# "main_content": "This guide covers authentication methods...",
# "sections": [
# {"heading": "API Keys", "content": "API keys are..."},
# {"heading": "OAuth 2.0", "content": "For OAuth flow..."}
# ],
# "code_examples": ["curl -H 'Authorization: Bearer...'"],
# "key_concepts": ["API key", "Bearer token", "OAuth scope"],
# "related_topics": ["/docs/rate-limits", "/docs/errors"]
# }
// go get github.com/webscraping-ai/webscraping-ai-go/v4
// https://pkg.go.dev/github.com/webscraping-ai/webscraping-ai-go/v4
package main
import (
"context"
"fmt"
webscrapingai "github.com/webscraping-ai/webscraping-ai-go/v4"
)
func main() {
client, _ := webscrapingai.NewClient(&webscrapingai.Config{APIKey: "YOUR_API_KEY"})
result, _ := client.Fields(context.Background(), &webscrapingai.FieldsOptions{
URL: "https://docs.example.com/api/authentication",
Fields: map[string]string{
"title": "Page title",
"main_content": "Main content text without navigation",
"sections": "Array of section headings and their content",
"code_examples": "Any code snippets on the page",
"key_concepts": "Key concepts or terms defined",
"related_topics": "Links to related documentation pages",
},
})
fmt.Println(result.Result)
}
// Response:
// {
// "title": "API Authentication Guide",
// "main_content": "This guide covers authentication methods...",
// "sections": [
// {"heading": "API Keys", "content": "API keys are..."},
// {"heading": "OAuth 2.0", "content": "For OAuth flow..."}
// ],
// "code_examples": ["curl -H 'Authorization: Bearer...'"],
// "key_concepts": ["API key", "Bearer token", "OAuth scope"],
// "related_topics": ["/docs/rate-limits", "/docs/errors"]
// }
// Maven: ai.webscraping:webscraping-ai:4.0.0
// https://central.sonatype.com/artifact/ai.webscraping/webscraping-ai
import ai.webscraping.Client;
import ai.webscraping.Config;
import ai.webscraping.option.FieldsOptions;
import ai.webscraping.result.FieldsResult;
// Build the client from a Config carrying the placeholder key; substitute a real key before running.
Client client = new Client(Config.builder().apiKey("YOUR_API_KEY").build());
// One addField call per output field: the field name first, then a plain-language
// description of what to extract from the page given to url().
FieldsResult result = client.fields(FieldsOptions.builder()
.url("https://docs.example.com/api/authentication")
.addField("title", "Page title")
.addField("main_content", "Main content text without navigation")
.addField("sections", "Array of section headings and their content")
.addField("code_examples", "Any code snippets on the page")
.addField("key_concepts", "Key concepts or terms defined")
.addField("related_topics", "Links to related documentation pages")
.build());
// Print whatever getResult() returns for this extraction.
System.out.println(result.getResult());
// Response:
// {
// "title": "API Authentication Guide",
// "main_content": "This guide covers authentication methods...",
// "sections": [
// {"heading": "API Keys", "content": "API keys are..."},
// {"heading": "OAuth 2.0", "content": "For OAuth flow..."}
// ],
// "code_examples": ["curl -H 'Authorization: Bearer...'"],
// "key_concepts": ["API key", "Bearer token", "OAuth scope"],
// "related_topics": ["/docs/rate-limits", "/docs/errors"]
// }
// dotnet add package WebScrapingAI
// https://www.nuget.org/packages/WebScrapingAI
using WebScrapingAI;
// Client configured with the placeholder key; substitute a real key before running.
var clientOptions = new WebScrapingAIClientOptions { ApiKey = "YOUR_API_KEY" };
var api = new WebScrapingAIClient(clientOptions);

// Output field name -> plain-language description of what to extract.
var fieldPrompts = new Dictionary<string, string>
{
    ["title"] = "Page title",
    ["main_content"] = "Main content text without navigation",
    ["sections"] = "Array of section headings and their content",
    ["code_examples"] = "Any code snippets on the page",
    ["key_concepts"] = "Key concepts or terms defined",
    ["related_topics"] = "Links to related documentation pages",
};

var request = new FieldsRequest
{
    Url = "https://docs.example.com/api/authentication",
    Fields = fieldPrompts,
};

var extracted = await api.FieldsAsync(request);
Console.WriteLine(extracted.Result);
// Response:
// {
// "title": "API Authentication Guide",
// "main_content": "This guide covers authentication methods...",
// "sections": [
// {"heading": "API Keys", "content": "API keys are..."},
// {"heading": "OAuth 2.0", "content": "For OAuth flow..."}
// ],
// "code_examples": ["curl -H 'Authorization: Bearer...'"],
// "key_concepts": ["API key", "Bearer token", "OAuth scope"],
// "related_topics": ["/docs/rate-limits", "/docs/errors"]
// }
# Ask /ai/question for a short summary of one documentation page.
# --get (long form of -G) sends every url-encoded pair as a query-string
# parameter on a GET request instead of as a POST body.
curl --get "https://api.webscraping.ai/ai/question" \
  --data-urlencode "api_key=YOUR_API_KEY" \
  --data-urlencode "url=https://docs.example.com/api/authentication" \
  --data-urlencode "question=Provide a 2-3 sentence summary of this page suitable for a knowledge base index."
# pip install webscraping_ai
# https://pypi.org/project/webscraping-ai/
from webscraping_ai import Client
# Client configured with the placeholder key; substitute a real key before running.
api = Client(api_key="YOUR_API_KEY")

# Prompt asking for an index-ready summary of the page.
summary_prompt = "Provide a 2-3 sentence summary of this page suitable for a knowledge base index."

summary = api.question(
    "https://docs.example.com/api/authentication",
    question=summary_prompt,
)
print(summary)
// npm install webscraping-ai
// https://www.npmjs.com/package/webscraping-ai
import { WebScrapingAI } from 'webscraping-ai';
// Client configured with the placeholder key; substitute a real key before running.
const scraper = new WebScrapingAI({ apiKey: 'YOUR_API_KEY' });

// Prompt asking for an index-ready summary of the page.
const url = 'https://docs.example.com/api/authentication';
const question =
  'Provide a 2-3 sentence summary of this page suitable for a knowledge base index.';

const summary = await scraper.question({ url, question });
console.log(summary);
<?php
// composer require webscraping-ai/webscraping-ai-php
// https://packagist.org/packages/webscraping-ai/webscraping-ai-php
require 'vendor/autoload.php';
use WebScrapingAI\Client;
// Client configured with the placeholder key; substitute a real key before running.
$api = new Client('YOUR_API_KEY');

// Prompt asking for an index-ready summary of the page.
$pageUrl = 'https://docs.example.com/api/authentication';
$summaryPrompt = 'Provide a 2-3 sentence summary of this page suitable for a knowledge base index.';

$summary = $api->question($pageUrl, $summaryPrompt);
echo $summary;
# gem install webscraping_ai
# https://rubygems.org/gems/webscraping_ai
require 'webscraping_ai'
# Client configured with the placeholder key; substitute a real key before running.
api = WebScrapingAI::Client.new(api_key: 'YOUR_API_KEY')

# Prompt asking for an index-ready summary of the page.
summary_prompt = 'Provide a 2-3 sentence summary of this page suitable for a knowledge base index.'

summary = api.question('https://docs.example.com/api/authentication', question: summary_prompt)
puts summary
// go get github.com/webscraping-ai/webscraping-ai-go/v4
// https://pkg.go.dev/github.com/webscraping-ai/webscraping-ai-go/v4
package main
import (
"context"
"fmt"
webscrapingai "github.com/webscraping-ai/webscraping-ai-go/v4"
)
func main() {
client, _ := webscrapingai.NewClient(&webscrapingai.Config{APIKey: "YOUR_API_KEY"})
answer, _ := client.Question(context.Background(), &webscrapingai.QuestionOptions{
URL: "https://docs.example.com/api/authentication",
Question: "Provide a 2-3 sentence summary of this page suitable for a knowledge base index.",
})
fmt.Println(answer)
}
// Maven: ai.webscraping:webscraping-ai:4.0.0
// https://central.sonatype.com/artifact/ai.webscraping/webscraping-ai
import ai.webscraping.Client;
import ai.webscraping.Config;
import ai.webscraping.option.QuestionOptions;
// Build the client from a Config carrying the placeholder key; substitute a real key before running.
Client client = new Client(Config.builder().apiKey("YOUR_API_KEY").build());
// Send the page URL together with a prompt asking for an index-ready summary;
// the call's return value is stored as a String.
String answer = client.question(QuestionOptions.builder()
.url("https://docs.example.com/api/authentication")
.question("Provide a 2-3 sentence summary of this page suitable for a knowledge base index.")
.build());
// Print the returned answer text.
System.out.println(answer);
// dotnet add package WebScrapingAI
// https://www.nuget.org/packages/WebScrapingAI
using WebScrapingAI;
// Client configured with the placeholder key; substitute a real key before running.
var clientOptions = new WebScrapingAIClientOptions { ApiKey = "YOUR_API_KEY" };
var api = new WebScrapingAIClient(clientOptions);

// Page URL plus a prompt asking for an index-ready summary.
var request = new QuestionRequest
{
    Url = "https://docs.example.com/api/authentication",
    Question = "Provide a 2-3 sentence summary of this page suitable for a knowledge base index.",
};

var summary = await api.QuestionAsync(request);
Console.WriteLine(summary);
Build knowledge bases from help docs and FAQs
Index company documentation and wikis
Collect and index research papers and articles
Create searchable product knowledge bases
Get started with 1,000 free API credits. No credit card required.