This commit is contained in:
@@ -0,0 +1,27 @@
|
||||
using System.Text;
|
||||
using Api.Services.Contracts;
|
||||
using UglyToad.PdfPig;
|
||||
|
||||
namespace Api.Services;
|
||||
|
||||
/// <summary>
/// Extracts the text of PDF documents with PdfPig and flattens it into
/// single-space-separated text.
/// </summary>
public sealed class TextExtractor : ITextExtractor
{
    /// <summary>
    /// Reads every page of the PDF in <paramref name="stream"/> and returns the
    /// concatenated page text after it has been passed through <see cref="Normalize"/>.
    /// </summary>
    /// <param name="stream">Stream containing the PDF data. Must not be null.</param>
    /// <param name="ct">Token observed before the document is opened and once per page.</param>
    /// <returns>
    /// An already-completed task: it holds the normalized text on success, is canceled
    /// when <paramref name="ct"/> was signalled, or is faulted with the exception raised
    /// while opening or reading the document.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
    /// <remarks>
    /// All work runs synchronously on the calling thread; nothing is awaited here.
    /// </remarks>
    public Task<string> ExtractPdfAsync(Stream stream, CancellationToken ct)
    {
        // Usage errors are thrown directly; every other failure is reported through
        // the returned task so callers observe it when they await.
        ArgumentNullException.ThrowIfNull(stream);

        // Skip opening the document at all when the caller has already given up.
        if (ct.IsCancellationRequested)
        {
            return Task.FromCanceled<string>(ct);
        }

        try
        {
            using var document = PdfDocument.Open(stream);
            var builder = new StringBuilder();

            foreach (var page in document.GetPages())
            {
                ct.ThrowIfCancellationRequested();

                // One line break per page is enough to keep pages apart: Normalize
                // collapses every whitespace run to a single space anyway.
                builder.AppendLine(page.Text);
            }

            return Task.FromResult(Normalize(builder.ToString()));
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // The filter guarantees the token is canceled, which FromCanceled requires.
            return Task.FromCanceled<string>(ct);
        }
        catch (Exception ex)
        {
            // Deliberately broad: the exception is not swallowed, it is handed to the
            // task and rethrown to whoever awaits it.
            return Task.FromException<string>(ex);
        }
    }

    /// <summary>
    /// Collapses every run of whitespace in <paramref name="value"/> to a single space
    /// and removes leading and trailing whitespace.
    /// </summary>
    /// <param name="value">Text to normalize; null is tolerated.</param>
    /// <returns>
    /// The whitespace-separated tokens of <paramref name="value"/> joined by single
    /// spaces, or <see cref="string.Empty"/> when the input is null, empty, or
    /// whitespace only.
    /// </returns>
    public string Normalize(string value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return string.Empty;
        }

        // A null separator array makes Split treat white-space characters as the
        // delimiters. The resulting tokens contain no whitespace, so the joined
        // string needs no further trimming.
        var tokens = value.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries);
        return string.Join(' ', tokens);
    }
}
|
||||
Reference in New Issue
Block a user