16bb195cb5
- Update CLAUDE.md: replace incorrect 'no XML doc on internal code' rule with the correct convention (XML doc on all public methods and non-trivial private/protected helpers) - Restore /// <summary> on FileDownloadController private helpers (HandleRangeRequest, StreamRangeAsync) - Add full XML doc to all service contracts: ICaptchaVerifier, IEmailSender, ICvMatcherService, IJobTextExtractor, IJobTokenService, IDocumentClassifier, IRagService, ITextChunker, ITextExtractor, IEmailTemplateService, ITemplateService - Add /// <summary> and /// <inheritdoc /> to all concrete service classes and their methods: RecaptchaVerifier, EmailApiEmailSender, SmtpEmailDispatcher, CvMatcherService, JobTextExtractor, JobTokenService, RagService, DocumentClassifier, TextChunker, TextExtractor, HtmlJobSearcher, CvSearchEmailSender, CvSearchJobTask, EmailTemplateService, DbTemplateService Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
24 lines
957 B
C#
24 lines
957 B
C#
namespace Api.Services.Contracts;

/// <summary>
/// Extracts and normalises plain text from documents.
/// </summary>
public interface ITextExtractor
{
    /// <summary>
    /// Reads all pages of a PDF stream and returns the concatenated, normalised plain text.
    /// </summary>
    /// <param name="stream">Readable stream positioned at the start of the PDF file.</param>
    /// <param name="ct">Cancellation token (checked between pages).</param>
    /// <returns>
    /// A task whose result is the normalised plain text extracted from the PDF.
    /// </returns>
    Task<string> ExtractPdfAsync(Stream stream, CancellationToken ct);

    /// <summary>
    /// Collapses all whitespace sequences in <paramref name="value"/> to single spaces and trims the result.
    /// Returns an empty string for null/whitespace input.
    /// </summary>
    /// <param name="value">Raw text to normalise.</param>
    /// <returns>Whitespace-normalised text.</returns>
    // NOTE(review): the summary promises that null input is accepted, yet the parameter is
    // declared as non-nullable `string`. If nullable reference types are enabled for this
    // project, consider `string?` here and in every implementation — confirm before changing.
    string Normalize(string value);
}