Extracting data from PDF files during web scraping is a common challenge. This comprehensive guide covers downloading PDFs, extracting text and structured data, and handling complex PDF formats using C#.
Overview
PDF extraction in web scraping involves three main steps:
1. Download PDF files from web sources
2. Parse PDF content using specialized libraries
3. Process extracted data according to your requirements
Step 1: Download PDF Files
Basic PDF Download with HttpClient
using System.Net.Http;
using System.IO;
using System.Threading.Tasks;
public class PdfDownloader : IDisposable
{
    private readonly HttpClient _httpClient;

    public PdfDownloader()
    {
        _httpClient = new HttpClient();
        // Some servers reject requests without a browser-like User-Agent.
        _httpClient.DefaultRequestHeaders.Add("User-Agent",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36");
    }

    /// <summary>
    /// Downloads the PDF at <paramref name="pdfUrl"/> into
    /// <paramref name="downloadDirectory"/> and returns the saved file's full path.
    /// </summary>
    /// <exception cref="InvalidOperationException">The HTTP request failed.</exception>
    public async Task<string> DownloadPdfAsync(string pdfUrl, string downloadDirectory)
    {
        try
        {
            var response = await _httpClient.GetAsync(pdfUrl);
            response.EnsureSuccessStatusCode();

            // Path.GetFileName never returns null for a non-null input — it returns
            // "" for URLs ending in "/" — so test for empty rather than null
            // (the original "?? fallback" could never trigger).
            var fileName = Path.GetFileName(new Uri(pdfUrl).LocalPath);
            if (string.IsNullOrEmpty(fileName))
            {
                fileName = $"document_{DateTime.Now:yyyyMMdd_HHmmss}.pdf";
            }

            Directory.CreateDirectory(downloadDirectory);
            var localPath = Path.Combine(downloadDirectory, fileName);
            var pdfBytes = await response.Content.ReadAsByteArrayAsync();
            await File.WriteAllBytesAsync(localPath, pdfBytes);
            return localPath;
        }
        catch (HttpRequestException ex)
        {
            // Wrap in a more specific exception type than bare Exception;
            // existing catch (Exception) callers still work.
            throw new InvalidOperationException($"Failed to download PDF: {ex.Message}", ex);
        }
    }

    public void Dispose() => _httpClient?.Dispose();
}
Download with Progress Tracking
/// <summary>
/// Streams a PDF to <paramref name="localPath"/>, reporting the cumulative number
/// of bytes written through <paramref name="progress"/> after each chunk.
/// </summary>
/// <returns>The same <paramref name="localPath"/> that was written.</returns>
public async Task<string> DownloadPdfWithProgressAsync(string pdfUrl, string localPath,
    IProgress<long> progress = null)
{
    // ResponseHeadersRead starts streaming as soon as headers arrive instead of
    // buffering the whole body in memory first.
    using var response = await _httpClient.GetAsync(pdfUrl, HttpCompletionOption.ResponseHeadersRead);
    response.EnsureSuccessStatusCode();
    // (The original also read Content-Length into an unused local; dropped.)
    var downloadedBytes = 0L;
    using var contentStream = await response.Content.ReadAsStreamAsync();
    using var fileStream = new FileStream(localPath, FileMode.Create, FileAccess.Write, FileShare.None);
    var buffer = new byte[8192];
    int bytesRead;
    while ((bytesRead = await contentStream.ReadAsync(buffer, 0, buffer.Length)) > 0)
    {
        await fileStream.WriteAsync(buffer, 0, bytesRead);
        downloadedBytes += bytesRead;
        progress?.Report(downloadedBytes);
    }
    return localPath;
}
Step 2: Extract Text from PDFs
Using iText 7 (Recommended)
Install the package:
Install-Package itext7
Basic text extraction:
using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Canvas.Parser;
using iText.Kernel.Pdf.Canvas.Parser.Listener;
using System.Text;
public class PdfTextExtractor
{
    /// <summary>
    /// Extracts the text of every page, each preceded by a "--- Page N ---" marker.
    /// </summary>
    public string ExtractAllText(string pdfPath)
    {
        var text = new StringBuilder();
        using var pdfReader = new PdfReader(pdfPath);
        using var pdfDocument = new PdfDocument(pdfReader);
        for (int page = 1; page <= pdfDocument.GetNumberOfPages(); page++)
        {
            var strategy = new SimpleTextExtractionStrategy();
            // Fully qualify iText's static helper: this class shares its name, so an
            // unqualified "PdfTextExtractor.GetTextFromPage" resolves to this class
            // (which has no such method) and fails to compile.
            var pageText = iText.Kernel.Pdf.Canvas.Parser.PdfTextExtractor.GetTextFromPage(
                pdfDocument.GetPage(page), strategy);
            text.AppendLine($"--- Page {page} ---");
            text.AppendLine(pageText);
        }
        return text.ToString();
    }

    /// <summary>
    /// Returns each page's text keyed by its 1-based page number.
    /// </summary>
    public Dictionary<int, string> ExtractTextByPage(string pdfPath)
    {
        var pageTexts = new Dictionary<int, string>();
        using var pdfReader = new PdfReader(pdfPath);
        using var pdfDocument = new PdfDocument(pdfReader);
        for (int page = 1; page <= pdfDocument.GetNumberOfPages(); page++)
        {
            // Same qualification as above; the default strategy is used here.
            var pageText = iText.Kernel.Pdf.Canvas.Parser.PdfTextExtractor.GetTextFromPage(
                pdfDocument.GetPage(page));
            pageTexts[page] = pageText;
        }
        return pageTexts;
    }
}
Advanced Text Extraction with Positioning
using iText.Kernel.Pdf.Canvas.Parser.Listener;
// NOTE(review): this name collides with iText's own built-in
// LocationTextExtractionStrategy — consider renaming to avoid ambiguity.
// NOTE(review): the BeginTextBlock/EndTextBlock/RenderText/RenderImage shape matches
// the iTextSharp 5 render-listener interface; iText 7's ITextExtractionStrategy is
// IEventListener-based — confirm which library version this snippet targets.
public class LocationTextExtractionStrategy : ITextExtractionStrategy
{
    // Text fragments collected in render order, with their page coordinates.
    private readonly List<TextChunk> _chunks = new List<TextChunk>();

    public void BeginTextBlock() { }
    public void EndTextBlock() { }

    // Records each rendered text fragment together with its start/end points.
    public void RenderText(TextRenderInfo renderInfo)
    {
        var chunk = new TextChunk
        {
            Text = renderInfo.GetText(),
            StartLocation = renderInfo.GetStartPoint(),
            EndLocation = renderInfo.GetEndPoint()
        };
        _chunks.Add(chunk);
    }

    // Joins all collected fragments in visual reading order, separated by spaces.
    public string GetResultantText()
    {
        // Sort chunks by vertical position, then horizontal
        var sortedChunks = _chunks
            .OrderByDescending(c => c.StartLocation.Get(1)) // Y coordinate (top to bottom)
            .ThenBy(c => c.StartLocation.Get(0)) // X coordinate (left to right)
            .ToList();
        return string.Join(" ", sortedChunks.Select(c => c.Text));
    }

    // Images are ignored; only text contributes to the result.
    public void RenderImage(ImageRenderInfo renderInfo) { }

    // A single rendered fragment plus its start/end coordinates on the page.
    private class TextChunk
    {
        public string Text { get; set; }
        public iText.Kernel.Geom.Vector StartLocation { get; set; }
        public iText.Kernel.Geom.Vector EndLocation { get; set; }
    }
}
Step 3: Handle Tables and Structured Data
Extract Tables with PdfPig
Install PdfPig:
Install-Package PdfPig
using UglyToad.PdfPig;
using UglyToad.PdfPig.Content;
public class PdfTableExtractor
{
    /// <summary>
    /// Heuristically reconstructs table rows on one page by grouping words that
    /// share (roughly) the same baseline. Returns one list of cell texts per row.
    /// </summary>
    public List<List<string>> ExtractTablesFromPage(string pdfPath, int pageNumber)
    {
        var tables = new List<List<string>>();
        using var document = PdfDocument.Open(pdfPath);
        var page = document.GetPage(pageNumber);
        // PDF coordinates originate at the bottom-left of the page, so sort by
        // DESCENDING Bottom to visit rows in top-to-bottom reading order
        // (ascending order would emit the table upside down).
        var words = page.GetWords().OrderByDescending(w => w.BoundingBox.Bottom).ThenBy(w => w.BoundingBox.Left);
        var rows = GroupWordsIntoRows(words);
        foreach (var row in rows)
        {
            var cellTexts = row.Select(word => word.Text).ToList();
            tables.Add(cellTexts);
        }
        return tables;
    }

    // Splits the (baseline-sorted) word stream into rows: a jump of more than
    // 5 points between consecutive baselines starts a new row; each finished
    // row is ordered left-to-right.
    private List<List<Word>> GroupWordsIntoRows(IEnumerable<Word> words)
    {
        var rows = new List<List<Word>>();
        var currentRow = new List<Word>();
        // Sentinel guarantees the first word starts a fresh row.
        var lastBottom = double.MaxValue;
        foreach (var word in words)
        {
            // If word is on a significantly different baseline, start a new row.
            if (Math.Abs(word.BoundingBox.Bottom - lastBottom) > 5)
            {
                if (currentRow.Any())
                {
                    rows.Add(currentRow.OrderBy(w => w.BoundingBox.Left).ToList());
                    currentRow = new List<Word>();
                }
            }
            currentRow.Add(word);
            lastBottom = word.BoundingBox.Bottom;
        }
        if (currentRow.Any())
        {
            rows.Add(currentRow.OrderBy(w => w.BoundingBox.Left).ToList());
        }
        return rows;
    }
}
Step 4: Process and Structure Extracted Data
Using Regular Expressions for Pattern Matching
using System.Globalization;
using System.Text.RegularExpressions;
public class PdfDataProcessor
{
    /// <summary>
    /// Extracts invoices from raw PDF text by matching invoice-number, date and
    /// total patterns, pairing the i-th match of each pattern into one invoice.
    /// Missing or unparseable dates/amounts fall back to
    /// <see cref="DateTime.MinValue"/> / 0 instead of throwing.
    /// </summary>
    public List<Invoice> ExtractInvoices(string pdfText)
    {
        var invoices = new List<Invoice>();
        // Pattern for invoice number, e.g. "Invoice Number: INV-001" or "Invoice No: A1".
        var invoicePattern = @"Invoice\s+(?:Number|No\.?):\s*([A-Z0-9-]+)";
        var datePattern = @"Date:\s*(\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4})";
        var amountPattern = @"Total:\s*\$?([\d,]+\.?\d*)";
        var invoiceMatches = Regex.Matches(pdfText, invoicePattern, RegexOptions.IgnoreCase);
        var dateMatches = Regex.Matches(pdfText, datePattern);
        var amountMatches = Regex.Matches(pdfText, amountPattern);
        // NOTE: index pairing assumes number/date/total each appear once per invoice,
        // in document order.
        for (int i = 0; i < invoiceMatches.Count; i++)
        {
            // Use the invariant culture so results don't flip between d/M and M/d
            // (or "." vs "," decimal separators) depending on the host locale, and
            // TryParse so a malformed field skips gracefully rather than throwing.
            var date = DateTime.MinValue;
            if (i < dateMatches.Count)
            {
                DateTime.TryParse(dateMatches[i].Groups[1].Value,
                    CultureInfo.InvariantCulture, DateTimeStyles.None, out date);
            }
            decimal amount = 0;
            if (i < amountMatches.Count)
            {
                decimal.TryParse(amountMatches[i].Groups[1].Value.Replace(",", ""),
                    NumberStyles.Number, CultureInfo.InvariantCulture, out amount);
            }
            invoices.Add(new Invoice
            {
                Number = invoiceMatches[i].Groups[1].Value,
                Date = date,
                Amount = amount
            });
        }
        return invoices;
    }

    /// <summary>
    /// Extracts "Label: Value" pairs, one per line; a repeated label keeps the
    /// last value seen.
    /// </summary>
    public Dictionary<string, string> ExtractKeyValuePairs(string text)
    {
        var pairs = new Dictionary<string, string>();
        // Pattern for "Label: Value" format (label is letters/whitespace only).
        var pattern = @"([A-Za-z\s]+):\s*([^\r\n]+)";
        var matches = Regex.Matches(text, pattern);
        foreach (Match match in matches)
        {
            var key = match.Groups[1].Value.Trim();
            var value = match.Groups[2].Value.Trim();
            pairs[key] = value;
        }
        return pairs;
    }
}

/// <summary>One invoice record extracted from PDF text.</summary>
public class Invoice
{
    public string Number { get; set; }
    public DateTime Date { get; set; }
    public decimal Amount { get; set; }
}
LINQ for Data Filtering and Processing
public class PdfAnalyzer
{
    /// <summary>
    /// Returns the trimmed, non-empty lines of <paramref name="text"/> that
    /// contain any of <paramref name="keywords"/> (case-insensitive).
    /// </summary>
    public List<string> FindLinesContaining(string text, params string[] keywords)
    {
        return text.Split('\n')
            .Where(line => keywords.Any(keyword =>
                line.Contains(keyword, StringComparison.OrdinalIgnoreCase)))
            .Select(line => line.Trim())
            .Where(line => !string.IsNullOrEmpty(line))
            .ToList();
    }

    /// <summary>
    /// Sums every numeric token (digits with an optional decimal point) found in
    /// <paramref name="text"/>; tokens that fail to parse contribute 0.
    /// </summary>
    public decimal ExtractNumbers(string text)
    {
        var numberPattern = @"\d+\.?\d*";
        var matches = Regex.Matches(text, numberPattern);
        // Parse with the invariant culture so "." is always the decimal separator;
        // under some locales the default parse would read "1.5" as 15.
        return matches.Cast<Match>()
            .Select(m => decimal.TryParse(m.Value, NumberStyles.Number,
                CultureInfo.InvariantCulture, out var num) ? num : 0)
            .Sum();
    }
}
Complete Example: PDF Web Scraper
public class PdfWebScraper
{
    private readonly PdfDownloader _downloader;
    private readonly PdfTextExtractor _textExtractor;
    private readonly PdfDataProcessor _processor;

    public PdfWebScraper()
    {
        _downloader = new PdfDownloader();
        _textExtractor = new PdfTextExtractor();
        _processor = new PdfDataProcessor();
    }

    /// <summary>
    /// Downloads each PDF, extracts its text, and returns every invoice found.
    /// A URL that fails is logged and skipped so the rest of the batch continues.
    /// </summary>
    public async Task<List<Invoice>> ScrapePdfInvoices(string[] pdfUrls)
    {
        var allInvoices = new List<Invoice>();
        var downloadDirectory = Path.Combine(Path.GetTempPath(), "pdf_scraping");
        Directory.CreateDirectory(downloadDirectory);
        foreach (var url in pdfUrls)
        {
            string localPath = null;
            try
            {
                // Download PDF
                localPath = await _downloader.DownloadPdfAsync(url, downloadDirectory);
                // Extract text
                var text = _textExtractor.ExtractAllText(localPath);
                // Process and extract structured data
                allInvoices.AddRange(_processor.ExtractInvoices(text));
            }
            catch (Exception ex)
            {
                Console.WriteLine($"Error processing {url}: {ex.Message}");
            }
            finally
            {
                // Delete the temp file even when extraction throws, so failed
                // URLs don't leak downloads into the temp directory.
                if (localPath != null && File.Exists(localPath))
                {
                    File.Delete(localPath);
                }
            }
        }
        return allInvoices;
    }
}
Best Practices and Considerations
Error Handling
- Always wrap PDF operations in try-catch blocks
- Handle corrupted or password-protected PDFs gracefully
- Implement retry logic for network operations
Performance Optimization
- Process PDFs in parallel for large batches
- Use memory streams for temporary PDF processing
- Dispose of resources properly to avoid memory leaks
Text Quality Issues
- PDFs with scanned images require OCR (consider Tesseract.NET)
- Some PDFs may have text in unusual encodings
- Complex layouts might require custom extraction strategies
Memory Management
/// <summary>
/// Processes a batch of PDF URLs with at most five in flight at once, so large
/// batches don't exhaust memory, sockets, or file handles.
/// </summary>
public async Task ProcessLargePdfBatch(string[] urls)
{
    // "using" disposes the semaphore once the whole batch has completed
    // (the original leaked it).
    using var semaphore = new SemaphoreSlim(5); // Limit concurrent processing
    var tasks = urls.Select(async url =>
    {
        await semaphore.WaitAsync();
        try
        {
            // Process PDF
            await ProcessSinglePdf(url);
        }
        finally
        {
            // Always release the slot, even when processing throws.
            semaphore.Release();
        }
    });
    await Task.WhenAll(tasks);
}
This comprehensive approach to PDF data extraction provides robust solutions for various PDF formats and use cases in web scraping scenarios.