4ee4a59b5e
- EmailController: add class summary, full SwaggerResponse/ProducesResponseType for 400 and 500, and Description on SwaggerOperation - ContactController: fix terse "Failed." error message to "Could not process subscription." - FileDownloadController: remove redundant XML <response code> tags from the public action doc block; convert private-method /// <summary> to // (project convention: no XML doc on internal code) - CvMatcherService: remove two dead commented-out blocks (old email send and BuildEmailBody helper) - JobTokenService: comment the phone/contact-line regex filter in ExtractKeywords - DocumentClassifier: comment the keyword-frequency scoring approach and the confidence formula - TextChunker: comment the sliding-window step (chunkSize - overlap) - CvSearchJobTask: comment the GdprConsent = true rationale and the BuildCvFileName sanitisation logic - HtmlJobSearcher: comment GetLeftPart(UriPartial.Path) query-strip dedup Closes #26 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
70 lines
3.4 KiB
C#
70 lines
3.4 KiB
C#
using System.Text.RegularExpressions;
|
|
using Api.Services.Contracts;
|
|
using Rag.Models;
|
|
|
|
namespace Api.Services;
|
|
|
|
/// <summary>
/// Heuristic document classifier. A caller-supplied type is trusted (after normalisation);
/// otherwise the type is guessed from characteristic keywords found in the text.
/// </summary>
public sealed class DocumentClassifier : IDocumentClassifier
{
    private const string UnknownType = "unknown";

    // Upper bound, in UTF-16 code units, for a title derived from the document text.
    private const int MaxTitleLength = 120;

    // A text segment must be longer than this (after trimming) to be used as a title.
    private const int MinTitleLength = 20;

    // Characters that end a candidate title segment.
    private static readonly char[] TitleSeparators = { '.', '\n', '\r' };

    // Anything outside [a-z0-9_-] in a caller-supplied type is replaced by '-'.
    private static readonly Regex InvalidTypeChars = new("[^a-z0-9_-]", RegexOptions.Compiled);

    private static readonly HashSet<string> KnownTypes = new(StringComparer.OrdinalIgnoreCase)
    {
        "cv", "job", "article", "contract", "invoice", "product", "documentation", "unknown"
    };

    // Characteristic terms per document type. This is an array rather than a Dictionary
    // because array order is guaranteed: when two types tie on hit count, the entry that
    // appears first here wins. Dictionary enumeration order is documented as undefined.
    private static readonly (string Type, string[] Terms)[] KeywordTable =
    {
        ("cv", new[] { "curriculum vitae", "resume", "work experience", "professional experience", "education", "skills", "technologies", "linkedin", "github" }),
        ("job", new[] { "job description", "requirements", "responsibilities", "qualifications", "apply", "we are looking", "salary", "benefits", "remote", "hybrid" }),
        ("contract", new[] { "agreement", "contract", "party", "parties", "liability", "termination", "confidentiality", "governing law" }),
        ("invoice", new[] { "invoice", "vat", "subtotal", "total", "amount due", "due date", "billing" }),
        ("documentation", new[] { "api", "endpoint", "configuration", "install", "usage", "parameters", "response", "request" }),
        ("product", new[] { "features", "pricing", "sku", "product", "specification", "warranty" })
    };

    /// <summary>
    /// Classifies a document and derives a title for it.
    /// </summary>
    /// <param name="text">Document text. A null value is treated as an empty document.</param>
    /// <param name="providedType">
    /// Optional caller-supplied type. When non-blank it is normalised and used as-is, with
    /// confidence 1.0 if it is a known type other than "unknown" and 0.6 otherwise.
    /// </param>
    /// <param name="providedTitle">Optional caller-supplied title; used (trimmed) when non-blank.</param>
    /// <param name="ct">Not observed: the classification runs synchronously.</param>
    /// <returns>An already-completed task holding the classification.</returns>
    public Task<DocumentClassification> ClassifyAsync(string text, string? providedType, string? providedTitle, CancellationToken ct)
    {
        // The annotation says non-null, but a null can still arrive at runtime. Treat it as
        // an empty document instead of failing with a NullReferenceException further down.
        text ??= string.Empty;

        if (!string.IsNullOrWhiteSpace(providedType))
        {
            var normalized = NormalizeType(providedType);
            var recognized = KnownTypes.Contains(normalized) && normalized != UnknownType;

            return Task.FromResult(new DocumentClassification
            {
                DocumentType = normalized,
                Confidence = recognized ? 1.0 : 0.6,
                Title = BuildTitle(providedTitle, text, normalized)
            });
        }

        // Keyword heuristic: for every type, count how many of its characteristic terms
        // occur in the text, then pick the type with the most hits.
        var lower = text.ToLowerInvariant();
        var bestType = UnknownType;
        var bestHits = 0;

        foreach (var (type, terms) in KeywordTable)
        {
            var hits = CountHits(lower, terms);

            // Strict '>' keeps the earlier table entry when two types tie.
            if (hits > bestHits)
            {
                bestType = type;
                bestHits = hits;
            }
        }

        // Confidence baseline 0.45 + 0.08 per matched keyword term, capped at 0.95.
        // Zero hits -> 0.25 (effectively unknown).
        var confidence = bestHits <= 0 ? 0.25 : Math.Min(0.95, 0.45 + bestHits * 0.08);

        return Task.FromResult(new DocumentClassification
        {
            DocumentType = bestType,
            Confidence = confidence,
            Title = BuildTitle(providedTitle, text, bestType)
        });
    }

    // Number of distinct terms that occur at least once in the (already lower-cased) text.
    // Matching is plain ordinal substring search, so a short term such as "api" also
    // matches inside a longer word such as "capital".
    private static int CountHits(string lower, string[] terms) =>
        terms.Count(term => lower.Contains(term, StringComparison.Ordinal));

    // Trims and lower-cases a caller-supplied type and replaces every character outside
    // [a-z0-9_-] with '-'. Falls back to "unknown" if nothing is left.
    private static string NormalizeType(string value)
    {
        var cleaned = InvalidTypeChars.Replace(value.Trim().ToLowerInvariant(), "-");
        return string.IsNullOrWhiteSpace(cleaned) ? UnknownType : cleaned;
    }

    // Title preference: the caller-supplied title, else the first segment of the text
    // (split on '.', '\n', '\r') longer than MinTitleLength, else "<type> document".
    private static string BuildTitle(string? providedTitle, string text, string documentType)
    {
        if (!string.IsNullOrWhiteSpace(providedTitle))
        {
            return providedTitle.Trim();
        }

        // Scan segment by segment and stop at the first usable one, rather than splitting
        // the whole document up front only to read its first few segments.
        var start = 0;
        while (start <= text.Length)
        {
            var end = text.IndexOfAny(TitleSeparators, start);
            if (end < 0)
            {
                end = text.Length;
            }

            var segment = text[start..end].Trim();
            if (segment.Length > MinTitleLength)
            {
                return Truncate(segment);
            }

            start = end + 1;
        }

        return $"{documentType} document";
    }

    // Cuts a title to at most MaxTitleLength UTF-16 code units.
    private static string Truncate(string title)
    {
        if (title.Length <= MaxTitleLength)
        {
            return title;
        }

        // If the last kept unit is a high surrogate, its low half would be cut off and the
        // title would end in an unpaired surrogate; drop that unit as well.
        var cut = char.IsHighSurrogate(title[MaxTitleLength - 1]) ? MaxTitleLength - 1 : MaxTitleLength;
        return title[..cut];
    }
}
|