Add XML doc to all service interfaces and implementations (#26)
- Update CLAUDE.md: replace incorrect 'no XML doc on internal code' rule with the correct convention (XML doc on all public methods and non-trivial private/protected helpers) - Restore /// <summary> on FileDownloadController private helpers (HandleRangeRequest, StreamRangeAsync) - Add full XML doc to all service contracts: ICaptchaVerifier, IEmailSender, ICvMatcherService, IJobTextExtractor, IJobTokenService, IDocumentClassifier, IRagService, ITextChunker, ITextExtractor, IEmailTemplateService, ITemplateService - Add /// <summary> and /// <inheritdoc /> to all concrete service classes and their methods: RecaptchaVerifier, EmailApiEmailSender, SmtpEmailDispatcher, CvMatcherService, JobTextExtractor, JobTokenService, RagService, DocumentClassifier, TextChunker, TextExtractor, HtmlJobSearcher, CvSearchEmailSender, CvSearchJobTask, EmailTemplateService, DbTemplateService Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2,7 +2,20 @@ using Rag.Models;
|
||||
|
||||
namespace Api.Services.Contracts;
|
||||
|
||||
/// <summary>
|
||||
/// Classifies a document into a known type (cv, job, contract, etc.) and extracts a title.
|
||||
/// </summary>
|
||||
public interface IDocumentClassifier
|
||||
{
|
||||
/// <summary>
|
||||
/// Determines the document type and title from the provided text.
|
||||
/// Uses <paramref name="providedType"/> and <paramref name="providedTitle"/> directly when supplied;
|
||||
/// otherwise falls back to a keyword-frequency heuristic over the text.
|
||||
/// </summary>
|
||||
/// <param name="text">Full document text to classify.</param>
|
||||
/// <param name="providedType">Caller-supplied document type hint; skips heuristic when non-empty.</param>
|
||||
/// <param name="providedTitle">Caller-supplied document title; skips title extraction when non-empty.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>A <see cref="DocumentClassification"/> with type, confidence score, and title.</returns>
|
||||
Task<DocumentClassification> ClassifyAsync(string text, string? providedType, string? providedTitle, CancellationToken ct);
|
||||
}
|
||||
|
||||
@@ -3,10 +3,46 @@ using Rag.Models.Responses;
|
||||
|
||||
namespace Api.Services.Contracts;
|
||||
|
||||
/// <summary>
|
||||
/// Core RAG (Retrieval-Augmented Generation) operations: document indexing, vector search, and retrieval.
|
||||
/// </summary>
|
||||
public interface IRagService
|
||||
{
|
||||
/// <summary>
|
||||
/// Indexes a plain-text document by classifying it, chunking the text, generating embeddings,
|
||||
/// and persisting the document and its chunks. Returns cached metadata when the text hash already exists.
|
||||
/// </summary>
|
||||
/// <param name="request">Indexing request with text, optional document type, title, and source URL.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Response with document ID, hash, type, and chunk/character counts.</returns>
|
||||
Task<IndexDocumentResponse> IndexTextAsync(IndexDocumentRequest request, CancellationToken ct);
|
||||
|
||||
/// <summary>
|
||||
/// Extracts text from a PDF file, then indexes it the same way as <see cref="IndexTextAsync"/>.
|
||||
/// Returns cached metadata when the extracted text hash already exists.
|
||||
/// </summary>
|
||||
/// <param name="file">Uploaded PDF file (must be ≤ configured max size).</param>
|
||||
/// <param name="documentType">Optional document type hint; if omitted the classifier is used.</param>
|
||||
/// <param name="title">Optional title override; if omitted the title is extracted from the text.</param>
|
||||
/// <param name="sourceUrl">Optional source URL to associate with the document.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Response with document ID, hash, type, and chunk/character counts.</returns>
|
||||
Task<IndexDocumentResponse> IndexPdfAsync(IFormFile file, string? documentType, string? title, string? sourceUrl, CancellationToken ct);
|
||||
|
||||
/// <summary>
|
||||
/// Performs a vector similarity search over indexed document chunks, groups results by document,
|
||||
/// and returns the top-K documents with their best-matching chunks.
|
||||
/// </summary>
|
||||
/// <param name="request">Search request with query text, optional document type filter, and top-K limit.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Ranked list of matching documents with scored chunk excerpts.</returns>
|
||||
Task<SearchResponse> SearchAsync(SearchRequest request, CancellationToken ct);
|
||||
|
||||
/// <summary>
|
||||
/// Retrieves full document details — including the original text — by document ID.
|
||||
/// </summary>
|
||||
/// <param name="documentId">The document's unique identifier.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Document details, or <c>null</c> if no document with that ID exists.</returns>
|
||||
Task<RagDocumentDetailsResponse?> GetDocumentAsync(string documentId, CancellationToken ct);
|
||||
}
|
||||
|
||||
@@ -1,6 +1,17 @@
|
||||
namespace Api.Services.Contracts;
|
||||
|
||||
/// <summary>
|
||||
/// Splits document text into overlapping chunks suitable for embedding and vector search.
|
||||
/// </summary>
|
||||
public interface ITextChunker
|
||||
{
|
||||
/// <summary>
|
||||
/// Divides <paramref name="text"/> into a list of chunks using a sliding window.
|
||||
/// Adjacent chunks share <paramref name="overlap"/> characters to preserve cross-boundary context.
|
||||
/// </summary>
|
||||
/// <param name="text">The full document text to chunk.</param>
|
||||
/// <param name="chunkSize">Maximum character length per chunk (clamped to 300–3000).</param>
|
||||
/// <param name="overlap">Number of trailing characters from the previous chunk to repeat at the start of the next (clamped to 0–chunkSize/2).</param>
|
||||
/// <returns>Ordered list of non-empty text chunks.</returns>
|
||||
IReadOnlyList<string> Chunk(string text, int chunkSize, int overlap);
|
||||
}
|
||||
|
||||
@@ -1,7 +1,23 @@
|
||||
namespace Api.Services.Contracts;
|
||||
|
||||
/// <summary>
|
||||
/// Extracts and normalises plain text from documents.
|
||||
/// </summary>
|
||||
public interface ITextExtractor
|
||||
{
|
||||
/// <summary>
|
||||
/// Reads all pages of a PDF stream and returns the concatenated, normalised plain text.
|
||||
/// </summary>
|
||||
/// <param name="stream">Readable stream positioned at the start of the PDF file.</param>
|
||||
/// <param name="ct">Cancellation token (checked between pages).</param>
|
||||
/// <returns>Normalised plain text extracted from the PDF.</returns>
|
||||
Task<string> ExtractPdfAsync(Stream stream, CancellationToken ct);
|
||||
|
||||
/// <summary>
|
||||
/// Collapses all whitespace sequences in <paramref name="value"/> to single spaces and trims the result.
|
||||
/// Returns an empty string for null/whitespace input.
|
||||
/// </summary>
|
||||
/// <param name="value">Raw text to normalise.</param>
|
||||
/// <returns>Whitespace-normalised text.</returns>
|
||||
string Normalize(string value);
|
||||
}
|
||||
|
||||
@@ -4,6 +4,9 @@ using Rag.Models;
|
||||
|
||||
namespace Api.Services;
|
||||
|
||||
/// <summary>
|
||||
/// Classifies documents by type using a keyword-frequency heuristic and extracts a title from the text.
|
||||
/// </summary>
|
||||
public sealed class DocumentClassifier : IDocumentClassifier
|
||||
{
|
||||
private static readonly HashSet<string> KnownTypes = new(StringComparer.OrdinalIgnoreCase)
|
||||
@@ -11,6 +14,7 @@ public sealed class DocumentClassifier : IDocumentClassifier
|
||||
"cv", "job", "article", "contract", "invoice", "product", "documentation", "unknown"
|
||||
};
|
||||
|
||||
/// <inheritdoc />
|
||||
public Task<DocumentClassification> ClassifyAsync(string text, string? providedType, string? providedTitle, CancellationToken ct)
|
||||
{
|
||||
if (!string.IsNullOrWhiteSpace(providedType))
|
||||
@@ -51,14 +55,20 @@ public sealed class DocumentClassifier : IDocumentClassifier
|
||||
});
|
||||
}
|
||||
|
||||
/// <summary>Counts how many of the given <paramref name="terms"/> appear in the lower-cased text.</summary>
|
||||
private static int Count(string lower, params string[] terms) => terms.Count(term => lower.Contains(term));
|
||||
|
||||
/// <summary>Lowercases and replaces non-alphanumeric characters with hyphens to produce a safe type slug.</summary>
|
||||
private static string NormalizeType(string value)
|
||||
{
|
||||
var cleaned = Regex.Replace(value.Trim().ToLowerInvariant(), "[^a-z0-9_-]", "-");
|
||||
return string.IsNullOrWhiteSpace(cleaned) ? "unknown" : cleaned;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns <paramref name="providedTitle"/> when available; otherwise extracts the first sentence-like
|
||||
/// fragment from the text, or falls back to a generic "{type} document" label.
|
||||
/// </summary>
|
||||
private static string BuildTitle(string? providedTitle, string text, string documentType)
|
||||
{
|
||||
if (!string.IsNullOrWhiteSpace(providedTitle)) return providedTitle.Trim();
|
||||
|
||||
@@ -11,6 +11,9 @@ using CommonHelpers;
|
||||
|
||||
namespace Api.Services;
|
||||
|
||||
/// <summary>
|
||||
/// Implements the core RAG pipeline: document classification, chunking, embedding, vector search, and retrieval.
|
||||
/// </summary>
|
||||
public sealed class RagService : IRagService
|
||||
{
|
||||
private readonly ITextExtractor _textExtractor;
|
||||
@@ -36,6 +39,7 @@ public sealed class RagService : IRagService
|
||||
_settings = options.Value;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<IndexDocumentResponse> IndexTextAsync(IndexDocumentRequest request, CancellationToken ct)
|
||||
{
|
||||
var text = _textExtractor.Normalize(request.Text ?? string.Empty);
|
||||
@@ -44,6 +48,7 @@ public sealed class RagService : IRagService
|
||||
return await IndexNormalizedTextAsync(text, request.DocumentType, request.Title, request.SourceUrl, request.Metadata, ct);
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<IndexDocumentResponse> IndexPdfAsync(IFormFile file, string? documentType, string? title, string? sourceUrl, CancellationToken ct)
|
||||
{
|
||||
if (file.Length <= 0) throw new InvalidOperationException("Uploaded file is empty.");
|
||||
@@ -57,6 +62,7 @@ public sealed class RagService : IRagService
|
||||
return await IndexNormalizedTextAsync(text, documentType, title ?? file.FileName, sourceUrl, new Dictionary<string, string> { ["fileName"] = file.FileName }, ct);
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<SearchResponse> SearchAsync(SearchRequest request, CancellationToken ct)
|
||||
{
|
||||
var query = _textExtractor.Normalize(request.QueryText);
|
||||
@@ -97,6 +103,7 @@ public sealed class RagService : IRagService
|
||||
return new SearchResponse { Results = results };
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<RagDocumentDetailsResponse?> GetDocumentAsync(string documentId, CancellationToken ct)
|
||||
{
|
||||
var document = await _repository.GetDocumentByIdAsync(documentId, ct);
|
||||
@@ -112,6 +119,11 @@ public sealed class RagService : IRagService
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Core indexing pipeline: computes a text hash for deduplication, classifies and chunks the text,
|
||||
/// generates embeddings for each chunk, and persists the document and chunks to the repository.
|
||||
/// Returns cached metadata without re-indexing when the same text hash and source URL already exist.
|
||||
/// </summary>
|
||||
private async Task<IndexDocumentResponse> IndexNormalizedTextAsync(
|
||||
string text,
|
||||
string? documentType,
|
||||
|
||||
@@ -2,8 +2,12 @@ using Api.Services.Contracts;
|
||||
|
||||
namespace Api.Services;
|
||||
|
||||
/// <summary>
|
||||
/// Splits text into overlapping fixed-size chunks using a sliding window for use in vector embedding pipelines.
|
||||
/// </summary>
|
||||
public sealed class TextChunker : ITextChunker
|
||||
{
|
||||
/// <inheritdoc />
|
||||
public IReadOnlyList<string> Chunk(string text, int chunkSize, int overlap)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(text)) return [];
|
||||
|
||||
@@ -4,8 +4,12 @@ using UglyToad.PdfPig;
|
||||
|
||||
namespace Api.Services;
|
||||
|
||||
/// <summary>
|
||||
/// Extracts and normalises plain text from PDF files using PdfPig.
|
||||
/// </summary>
|
||||
public sealed class TextExtractor : ITextExtractor
|
||||
{
|
||||
/// <inheritdoc />
|
||||
public Task<string> ExtractPdfAsync(Stream stream, CancellationToken ct)
|
||||
{
|
||||
using var document = PdfDocument.Open(stream);
|
||||
@@ -19,6 +23,7 @@ public sealed class TextExtractor : ITextExtractor
|
||||
return Task.FromResult(Normalize(builder.ToString()));
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public string Normalize(string value)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(value)) return string.Empty;
|
||||
|
||||
Reference in New Issue
Block a user