using System.Text;
using Api.Services.Contracts;
using UglyToad.PdfPig;
namespace Api.Services;
/// <summary>
/// Extracts and normalises plain text from PDF files using PdfPig.
/// </summary>
public sealed class TextExtractor : ITextExtractor
{
    /// <summary>
    /// Reads every page of the PDF held in <paramref name="stream"/> and returns the
    /// concatenated page text after it has been passed through <see cref="Normalize"/>.
    /// </summary>
    /// <remarks>
    /// The extraction itself runs synchronously on the calling thread, so the returned
    /// task is always already completed when this method returns.
    /// </remarks>
    /// <param name="stream">Stream containing the PDF document to read.</param>
    /// <param name="ct">
    /// Token that is checked before the document is opened and again before each page is read.
    /// </param>
    /// <returns>
    /// A completed task carrying the normalised text. The task is canceled when
    /// <paramref name="ct"/> is signalled, and faulted when opening or reading the
    /// document throws.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> is <c>null</c>.</exception>
    public Task ExtractPdfAsync(Stream stream, CancellationToken ct)
    {
        // Usage errors are the one kind of failure a Task-returning method should throw directly.
        ArgumentNullException.ThrowIfNull(stream);

        // Honour a token that is already cancelled before paying for the PDF parse.
        if (ct.IsCancellationRequested)
        {
            return Task.FromCanceled<string>(ct);
        }

        try
        {
            using var document = PdfDocument.Open(stream);
            var builder = new StringBuilder();

            foreach (var page in document.GetPages())
            {
                ct.ThrowIfCancellationRequested();

                // One line break per page is enough of a separator: Normalize collapses
                // every run of whitespace into a single space anyway.
                builder.AppendLine(page.Text);
            }

            return Task.FromResult(Normalize(builder.ToString()));
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // Surface cancellation through the task rather than as a synchronous throw.
            return Task.FromCanceled<string>(ct);
        }
        catch (Exception ex)
        {
            // This method is not 'async', so runtime failures have to be placed on the
            // returned task by hand; awaiting callers observe the same exception.
            return Task.FromException<string>(ex);
        }
    }

    /// <summary>
    /// Collapses every run of white-space characters in <paramref name="value"/> into a
    /// single space and removes leading and trailing white space.
    /// </summary>
    /// <param name="value">Text to normalise.</param>
    /// <returns>
    /// The normalised text, or <see cref="string.Empty"/> when <paramref name="value"/>
    /// is <c>null</c>, empty, or consists only of white space.
    /// </returns>
    public string Normalize(string value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return string.Empty;
        }

        // A null separator array makes Split treat every white-space character as a
        // delimiter. RemoveEmptyEntries drops the empty tokens produced by consecutive,
        // leading, or trailing white space, so the joined result needs no further trimming.
        var tokens = value.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries);
        return string.Join(' ', tokens);
    }
}