Files
myAi/Apis/rag-api/Services/TextExtractor.cs
claude 16bb195cb5 Add XML doc to all service interfaces and implementations (#26)
- Update CLAUDE.md: replace incorrect 'no XML doc on internal code' rule
  with the correct convention (XML doc on all public methods and
  non-trivial private/protected helpers)
- Restore /// <summary> on FileDownloadController private helpers
  (HandleRangeRequest, StreamRangeAsync)
- Add full XML doc to all service contracts:
  ICaptchaVerifier, IEmailSender, ICvMatcherService, IJobTextExtractor,
  IJobTokenService, IDocumentClassifier, IRagService, ITextChunker,
  ITextExtractor, IEmailTemplateService, ITemplateService
- Add /// <summary> and /// <inheritdoc /> to all concrete service classes
  and their methods: RecaptchaVerifier, EmailApiEmailSender,
  SmtpEmailDispatcher, CvMatcherService, JobTextExtractor, JobTokenService,
  RagService, DocumentClassifier, TextChunker, TextExtractor,
  HtmlJobSearcher, CvSearchEmailSender, CvSearchJobTask,
  EmailTemplateService, DbTemplateService

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-28 09:17:42 +03:00

33 lines
969 B
C#

using System.Text;
using Api.Services.Contracts;
using UglyToad.PdfPig;
namespace Api.Services;
/// <summary>
/// Extracts and normalises plain text from PDF files using PdfPig.
/// </summary>
/// <remarks>
/// Extraction runs synchronously on the calling thread; the task returned by
/// <see cref="ExtractPdfAsync"/> is already completed when the method returns.
/// </remarks>
public sealed class TextExtractor : ITextExtractor
{
    /// <inheritdoc />
    /// <remarks>
    /// The text of every page is concatenated in page order and then passed through
    /// <see cref="Normalize"/>, so the result is a single line of space-separated words.
    /// Because the method is not <c>async</c>, exceptions (argument validation,
    /// cancellation, and anything raised while reading the document) propagate directly
    /// from this call rather than through the returned task.
    /// </remarks>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> is <see langword="null"/>.</exception>
    /// <exception cref="OperationCanceledException">
    /// <paramref name="ct"/> was cancelled before the document was opened or between pages.
    /// </exception>
    public Task<string> ExtractPdfAsync(Stream stream, CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(stream);

        // Observe cancellation before paying for the document parse; the per-page
        // check below only runs once the document has already been opened.
        ct.ThrowIfCancellationRequested();

        using var document = PdfDocument.Open(stream);
        var builder = new StringBuilder();
        foreach (var page in document.GetPages())
        {
            ct.ThrowIfCancellationRequested();

            // The line break only keeps the last word of one page from fusing with the
            // first word of the next; Normalize collapses it to a single space.
            builder.AppendLine(page.Text);
        }

        return Task.FromResult(Normalize(builder.ToString()));
    }

    /// <inheritdoc />
    /// <remarks>
    /// Returns <see cref="string.Empty"/> when <paramref name="value"/> is
    /// <see langword="null"/>, empty, or consists only of white space. Otherwise every run
    /// of white-space characters (including line breaks) is collapsed to a single space and
    /// leading/trailing white space is removed.
    /// </remarks>
    public string Normalize(string value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return string.Empty;
        }

        // A null separator array splits on all white-space characters, and
        // RemoveEmptyEntries drops the empty tokens produced by consecutive separators,
        // so the joined result can never start or end with white space (no Trim needed).
        var words = value.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries);
        return string.Join(' ', words);
    }
}