28 lines
826 B
C#
28 lines
826 B
C#
using System.Text;
|
|
using Api.Services.Contracts;
|
|
using UglyToad.PdfPig;
|
|
|
|
namespace Api.Services;
|
|
|
|
/// <summary>
/// Extracts the text of PDF documents and returns it as a single
/// whitespace-normalized string.
/// </summary>
public sealed class TextExtractor : ITextExtractor
{
    /// <summary>
    /// Reads every page of the PDF in <paramref name="stream"/> and returns the
    /// concatenated page text after passing it through <see cref="Normalize"/>.
    /// </summary>
    /// <remarks>
    /// The extraction runs synchronously on the calling thread; the returned task
    /// is already completed when this method returns.
    /// </remarks>
    /// <param name="stream">Stream containing the PDF document to read.</param>
    /// <param name="ct">
    /// Token checked before the document is opened and again before each page is read.
    /// </param>
    /// <returns>A completed task holding the normalized text of all pages.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> is <c>null</c>.</exception>
    /// <exception cref="OperationCanceledException"><paramref name="ct"/> was cancelled.</exception>
    public Task<string> ExtractPdfAsync(Stream stream, CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(stream);

        // Bail out before paying for the parse when the caller has already cancelled;
        // this also covers documents that yield no pages.
        ct.ThrowIfCancellationRequested();

        using var document = PdfDocument.Open(stream);
        var builder = new StringBuilder();

        foreach (var page in document.GetPages())
        {
            ct.ThrowIfCancellationRequested();

            // A single line break is enough to separate pages: Normalize collapses
            // every run of whitespace into one space anyway.
            builder.AppendLine(page.Text);
        }

        return Task.FromResult(Normalize(builder.ToString()));
    }

    /// <summary>
    /// Collapses every run of whitespace in <paramref name="value"/> into a single
    /// space and removes leading and trailing whitespace.
    /// </summary>
    /// <param name="value">Text to normalize.</param>
    /// <returns>
    /// The normalized text, or <see cref="string.Empty"/> when <paramref name="value"/>
    /// is <c>null</c>, empty, or consists only of whitespace.
    /// </returns>
    public string Normalize(string value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return string.Empty;
        }

        // A null separator array splits on all whitespace characters. With empty
        // entries removed, the joined result cannot start or end with whitespace,
        // so no additional Trim is needed.
        var words = value.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries);
        return string.Join(' ', words);
    }
}
|