Files
myAi/api/Services/Rag/PdfTextExtractor.cs
T
claude 2dce2ab0ff
Build and Push Docker Images / build (push) Successful in 42s
Changes
2026-05-04 15:56:15 +03:00

34 lines
855 B
C#

using System.Text;
using UglyToad.PdfPig;
namespace Api.Services.Rag;
/// <summary>
/// Abstraction for turning the contents of a PDF into a single string,
/// so callers can depend on the interface rather than a concrete extractor.
/// </summary>
public interface IPdfTextExtractor
{
/// <summary>
/// Extracts text from the supplied stream.
/// </summary>
/// <param name="pdfStream">Stream to read from; presumably positioned at the start of a PDF document (verify against callers).</param>
/// <returns>The extracted text as one string.</returns>
string ExtractText(Stream pdfStream);
}
/// <summary>
/// <see cref="IPdfTextExtractor"/> implementation that opens the PDF with PdfPig,
/// concatenates the text of every page and flattens the result onto a single line.
/// </summary>
public sealed class PdfTextExtractor : IPdfTextExtractor
{
    /// <summary>
    /// Reads every page of the PDF in <paramref name="pdfStream"/> and returns the
    /// combined page text with each run of whitespace collapsed to one space.
    /// </summary>
    /// <param name="pdfStream">Stream containing the PDF document.</param>
    /// <returns>
    /// The whitespace-normalized text of all pages, or <see cref="string.Empty"/>
    /// when the pages yield no non-whitespace text.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="pdfStream"/> is <c>null</c>.</exception>
    public string ExtractText(Stream pdfStream)
    {
        // Fail fast with a well-defined exception instead of relying on
        // whatever the PDF library does when handed a null stream.
        ArgumentNullException.ThrowIfNull(pdfStream);

        using var document = PdfDocument.Open(pdfStream);
        var builder = new StringBuilder();
        foreach (var page in document.GetPages())
        {
            // The line breaks only keep adjacent pages from fusing into one word;
            // NormalizeWhitespace later reduces them to a single space.
            builder.AppendLine(page.Text);
            builder.AppendLine();
        }

        return NormalizeWhitespace(builder.ToString());
    }

    /// <summary>
    /// Collapses every run of whitespace in <paramref name="value"/> to a single
    /// space and drops leading and trailing whitespace.
    /// </summary>
    /// <param name="value">Text to normalize.</param>
    /// <returns>The normalized text, or <see cref="string.Empty"/> for null or blank input.</returns>
    private static string NormalizeWhitespace(string value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return string.Empty;
        }

        // A null separator array makes Split break on all white-space characters.
        var parts = value.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries);

        // The parts contain no whitespace, so the joined string needs no further trimming.
        return string.Join(' ', parts);
    }
}