34 lines
855 B
C#
34 lines
855 B
C#
using System.Text;
|
|
using UglyToad.PdfPig;
|
|
|
|
namespace Api.Services.Rag;
|
|
|
|
/// <summary>
/// Contract for components that turn the content of a PDF stream into a string.
/// </summary>
public interface IPdfTextExtractor
{
    /// <summary>
    /// Extracts text from the PDF provided in <paramref name="pdfStream"/>.
    /// </summary>
    /// <param name="pdfStream">Stream containing the PDF document to read.</param>
    /// <returns>The extracted text as a single string.</returns>
    string ExtractText(Stream pdfStream);
}
|
|
|
|
/// <summary>
/// Extracts text from PDF documents using PdfPig and flattens it into a
/// single line of space-separated tokens.
/// </summary>
public sealed class PdfTextExtractor : IPdfTextExtractor
{
    /// <summary>
    /// Opens the PDF in <paramref name="pdfStream"/>, concatenates the text of
    /// every page in document order, and collapses all whitespace runs
    /// (including line breaks between pages) into single spaces.
    /// </summary>
    /// <param name="pdfStream">Stream containing the PDF document to read.</param>
    /// <returns>
    /// The whitespace-normalized text of the document, or an empty string when
    /// the document yields no non-whitespace text.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="pdfStream"/> is <see langword="null"/>.
    /// </exception>
    public string ExtractText(Stream pdfStream)
    {
        // Fail fast with a clear parameter name instead of relying on the
        // library to reject a null stream.
        ArgumentNullException.ThrowIfNull(pdfStream);

        using var document = PdfDocument.Open(pdfStream);
        var builder = new StringBuilder();

        foreach (var page in document.GetPages())
        {
            // The line break only needs to keep the last word of one page
            // from fusing with the first word of the next; any extra blank
            // lines would be collapsed by NormalizeWhitespace anyway.
            builder.AppendLine(page.Text);
        }

        return NormalizeWhitespace(builder.ToString());
    }

    /// <summary>
    /// Replaces every run of whitespace in <paramref name="value"/> with a
    /// single space and drops leading and trailing whitespace.
    /// </summary>
    /// <param name="value">Text to normalize.</param>
    /// <returns>
    /// The normalized text, or <see cref="string.Empty"/> when
    /// <paramref name="value"/> is null, empty, or whitespace only.
    /// </returns>
    private static string NormalizeWhitespace(string value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return string.Empty;
        }

        // A null separator array makes Split break on all white-space
        // characters; with empty entries removed, the tokens carry no
        // surrounding whitespace, so the joined result needs no trimming.
        var parts = value.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries);

        return string.Join(' ', parts);
    }
}
|