Files
myAi/Apis/rag-api/Services/TextChunker.cs
T
claude 16bb195cb5 Add XML doc to all service interfaces and implementations (#26)
- Update CLAUDE.md: replace incorrect 'no XML doc on internal code' rule
  with the correct convention (XML doc on all public methods and
  non-trivial private/protected helpers)
- Restore /// <summary> on FileDownloadController private helpers
  (HandleRangeRequest, StreamRangeAsync)
- Add full XML doc to all service contracts:
  ICaptchaVerifier, IEmailSender, ICvMatcherService, IJobTextExtractor,
  IJobTokenService, IDocumentClassifier, IRagService, ITextChunker,
  ITextExtractor, IEmailTemplateService, ITemplateService
- Add /// <summary> and /// <inheritdoc /> to all concrete service classes
  and their methods: RecaptchaVerifier, EmailApiEmailSender,
  SmtpEmailDispatcher, CvMatcherService, JobTextExtractor, JobTokenService,
  RagService, DocumentClassifier, TextChunker, TextExtractor,
  HtmlJobSearcher, CvSearchEmailSender, CvSearchJobTask,
  EmailTemplateService, DbTemplateService

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-28 09:17:42 +03:00

31 lines
1.1 KiB
C#

using Api.Services.Contracts;
namespace Api.Services;
/// <summary>
/// Splits text into overlapping fixed-size chunks using a sliding window for use in vector embedding pipelines.
/// </summary>
/// <remarks>
/// Sizes and offsets are measured in <see cref="char"/> units (UTF-16 code units), so a chunk
/// boundary may fall between the two halves of a surrogate pair.
/// </remarks>
public sealed class TextChunker : ITextChunker
{
    /// <summary>Smallest chunk size accepted; smaller requests are raised to this value.</summary>
    private const int MinChunkSize = 300;

    /// <summary>Largest chunk size accepted; larger requests are lowered to this value.</summary>
    private const int MaxChunkSize = 3000;

    /// <inheritdoc />
    /// <remarks>
    /// <paramref name="chunkSize"/> is clamped to [300, 3000] and <paramref name="overlap"/> to
    /// [0, chunkSize / 2]. Each chunk is trimmed, and chunks that are empty after trimming are dropped.
    /// A null, empty, or whitespace-only <paramref name="text"/> yields an empty list.
    /// </remarks>
    public IReadOnlyList<string> Chunk(string text, int chunkSize, int overlap)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return [];
        }

        chunkSize = Math.Clamp(chunkSize, MinChunkSize, MaxChunkSize);
        overlap = Math.Clamp(overlap, 0, chunkSize / 2);

        // Sliding window: step forward by (chunkSize - overlap) each iteration so
        // adjacent chunks share `overlap` characters, preserving cross-boundary context.
        // Because overlap <= chunkSize / 2, the step is always positive and the loop terminates.
        var step = chunkSize - overlap;
        var chunks = new List<string>();

        for (var start = 0; start < text.Length; start += step)
        {
            var length = Math.Min(chunkSize, text.Length - start);
            var chunk = text.Substring(start, length).Trim();

            // After Trim(), a whitespace-only window is the empty string.
            if (chunk.Length > 0)
            {
                chunks.Add(chunk);
            }

            // This window already reaches the end of the text. Stepping again would only
            // produce a shorter chunk that is entirely contained in the one just emitted.
            if (start + length >= text.Length)
            {
                break;
            }
        }

        return chunks;
    }
}