@@ -0,0 +1,33 @@
|
||||
using System.Text;
|
||||
using UglyToad.PdfPig;
|
||||
|
||||
namespace Api.Services.Rag;
|
||||
|
||||
/// <summary>
/// Contract for components that turn the contents of a PDF into a plain string.
/// </summary>
public interface IPdfTextExtractor
{
    /// <summary>
    /// Extracts text from the PDF supplied through <paramref name="pdfStream"/>.
    /// </summary>
    /// <param name="pdfStream">Stream containing the PDF data to read.</param>
    /// <returns>The text extracted from the PDF.</returns>
    string ExtractText(Stream pdfStream);
}
|
||||
|
||||
/// <summary>
/// <see cref="IPdfTextExtractor"/> implementation that reads PDFs through
/// PdfPig's <c>PdfDocument</c> API.
/// </summary>
public sealed class PdfTextExtractor : IPdfTextExtractor
{
    /// <summary>
    /// Opens the PDF in <paramref name="pdfStream"/>, concatenates the text of
    /// every page and returns it with all whitespace runs collapsed.
    /// </summary>
    /// <param name="pdfStream">Stream handed to <c>PdfDocument.Open</c>.</param>
    /// <returns>
    /// The text of all pages as whitespace-separated tokens joined by single
    /// spaces, or <see cref="string.Empty"/> when the pages yield no
    /// non-whitespace text.
    /// </returns>
    public string ExtractText(Stream pdfStream)
    {
        using var pdf = PdfDocument.Open(pdfStream);
        var rawText = new StringBuilder();

        foreach (var pdfPage in pdf.GetPages())
        {
            // Page text followed by two line terminators. The terminators only
            // keep adjacent pages apart; NormalizeWhitespace collapses them to
            // a single space, so page and line breaks do not survive.
            rawText.Append(pdfPage.Text).AppendLine().AppendLine();
        }

        var combined = rawText.ToString();
        return NormalizeWhitespace(combined);
    }

    /// <summary>
    /// Collapses every run of whitespace in <paramref name="value"/> into a
    /// single space and removes leading and trailing whitespace.
    /// </summary>
    /// <param name="value">Text to normalize.</param>
    /// <returns>
    /// The normalized text, or <see cref="string.Empty"/> when
    /// <paramref name="value"/> is null, empty or whitespace-only.
    /// </returns>
    private static string NormalizeWhitespace(string value) =>
        string.IsNullOrWhiteSpace(value)
            ? string.Empty
            // A null separator array makes Split break on whitespace characters.
            : string.Join(' ', value.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries)).Trim();
}
|
||||
Reference in New Issue
Block a user