16bb195cb5
- Update CLAUDE.md: replace incorrect 'no XML doc on internal code' rule with the correct convention (XML doc on all public methods and non-trivial private/protected helpers) - Restore /// <summary> on FileDownloadController private helpers (HandleRangeRequest, StreamRangeAsync) - Add full XML doc to all service contracts: ICaptchaVerifier, IEmailSender, ICvMatcherService, IJobTextExtractor, IJobTokenService, IDocumentClassifier, IRagService, ITextChunker, ITextExtractor, IEmailTemplateService, ITemplateService - Add /// <summary> and /// <inheritdoc /> to all concrete service classes and their methods: RecaptchaVerifier, EmailApiEmailSender, SmtpEmailDispatcher, CvMatcherService, JobTextExtractor, JobTokenService, RagService, DocumentClassifier, TextChunker, TextExtractor, HtmlJobSearcher, CvSearchEmailSender, CvSearchJobTask, EmailTemplateService, DbTemplateService Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
31 lines
1.1 KiB
C#
31 lines
1.1 KiB
C#
using Api.Services.Contracts;
|
|
|
|
namespace Api.Services;
|
|
|
|
/// <summary>
/// Splits text into overlapping fixed-size chunks using a sliding window for use in vector embedding pipelines.
/// </summary>
public sealed class TextChunker : ITextChunker
{
    // Bounds applied to the caller-supplied chunk size before splitting.
    private const int MinChunkSize = 300;
    private const int MaxChunkSize = 3000;

    /// <inheritdoc />
    /// <remarks>
    /// <paramref name="chunkSize"/> is clamped to the range 300–3000 characters and
    /// <paramref name="overlap"/> is clamped to the range 0–(<paramref name="chunkSize"/> / 2)
    /// before splitting. Each chunk is trimmed, and chunks that are empty after trimming are dropped.
    /// A null, empty, or whitespace-only <paramref name="text"/> yields an empty list.
    /// </remarks>
    public IReadOnlyList<string> Chunk(string text, int chunkSize, int overlap)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return [];
        }

        chunkSize = Math.Clamp(chunkSize, MinChunkSize, MaxChunkSize);
        overlap = Math.Clamp(overlap, 0, chunkSize / 2);

        // Sliding window: step forward by (chunkSize - overlap) each iteration so
        // adjacent chunks share `overlap` characters, preserving cross-boundary context.
        // Because overlap <= chunkSize / 2, the step is always positive and the loop advances.
        var step = chunkSize - overlap;

        var chunks = new List<string>();
        var start = 0;
        while (start < text.Length)
        {
            var length = Math.Min(chunkSize, text.Length - start);
            var chunk = text.Substring(start, length).Trim();

            // After Trim(), a whitespace-only window is the empty string.
            if (chunk.Length > 0)
            {
                chunks.Add(chunk);
            }

            // This window already reached the end of the text. Stepping again would only
            // produce a tail chunk that is entirely contained in the one just emitted.
            if (start + length >= text.Length)
            {
                break;
            }

            start += step;
        }

        return chunks;
    }
}
|