16bb195cb5
- Update CLAUDE.md: replace incorrect 'no XML doc on internal code' rule with the correct convention (XML doc on all public methods and non-trivial private/protected helpers) - Restore /// <summary> on FileDownloadController private helpers (HandleRangeRequest, StreamRangeAsync) - Add full XML doc to all service contracts: ICaptchaVerifier, IEmailSender, ICvMatcherService, IJobTextExtractor, IJobTokenService, IDocumentClassifier, IRagService, ITextChunker, ITextExtractor, IEmailTemplateService, ITemplateService - Add /// <summary> and /// <inheritdoc /> to all concrete service classes and their methods: RecaptchaVerifier, EmailApiEmailSender, SmtpEmailDispatcher, CvMatcherService, JobTextExtractor, JobTokenService, RagService, DocumentClassifier, TextChunker, TextExtractor, HtmlJobSearcher, CvSearchEmailSender, CvSearchJobTask, EmailTemplateService, DbTemplateService Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
33 lines
969 B
C#
33 lines
969 B
C#
using System.Text;
|
|
using Api.Services.Contracts;
|
|
using UglyToad.PdfPig;
|
|
|
|
namespace Api.Services;
|
|
|
|
/// <summary>
/// Extracts and normalises plain text from PDF files using PdfPig.
/// </summary>
public sealed class TextExtractor : ITextExtractor
{
    /// <inheritdoc />
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> is <see langword="null"/>.</exception>
    /// <exception cref="OperationCanceledException">
    /// <paramref name="ct"/> was cancelled before the document was opened or before a page was read.
    /// </exception>
    /// <remarks>
    /// The extraction runs synchronously on the calling thread and the result is wrapped in an
    /// already-completed task, so exceptions are thrown directly from this call rather than
    /// being surfaced through the returned task.
    /// </remarks>
    public Task<string> ExtractPdfAsync(Stream stream, CancellationToken ct)
    {
        // Fail fast with a clear argument error instead of letting the PDF library
        // stumble over a null stream.
        ArgumentNullException.ThrowIfNull(stream);

        // Honour a token that is already cancelled before paying for opening the document.
        ct.ThrowIfCancellationRequested();

        using var document = PdfDocument.Open(stream);
        var builder = new StringBuilder();

        foreach (var page in document.GetPages())
        {
            ct.ThrowIfCancellationRequested();

            // The blank line between pages only keeps the text of adjacent pages apart;
            // Normalize collapses all whitespace runs into single spaces afterwards.
            builder.AppendLine(page.Text);
            builder.AppendLine();
        }

        return Task.FromResult(Normalize(builder.ToString()));
    }

    /// <inheritdoc />
    /// <remarks>
    /// Null, empty, or whitespace-only input yields <see cref="string.Empty"/>. Otherwise every
    /// run of whitespace is collapsed into a single space and leading/trailing whitespace is dropped.
    /// </remarks>
    public string Normalize(string value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return string.Empty;
        }

        // A null separator array makes Split treat white-space characters as delimiters.
        // With empty entries removed, the tokens contain no whitespace, so the joined
        // result never needs trimming.
        var tokens = value.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries);
        return string.Join(' ', tokens);
    }
}
|