using System.Text.RegularExpressions;
using Api.Services.Contracts;
using Rag.Models;

namespace Api.Services;

/// <summary>
/// Classifies documents by type using a keyword-frequency heuristic and extracts a title from the text.
/// </summary>
/// <remarks>
/// Keyword terms are matched as ordinal substrings of the lower-cased text, so a term also
/// matches inside a longer word (for example <c>total</c> inside <c>subtotal</c>).
/// </remarks>
public sealed class DocumentClassifier : IDocumentClassifier
{
    /// <summary>Type slug used when no document type can be determined.</summary>
    private const string UnknownType = "unknown";

    /// <summary>Confidence reported for a caller-provided type that is a known, non-unknown type.</summary>
    private const double KnownProvidedConfidence = 1.0;

    /// <summary>Confidence reported for a caller-provided type that is not a known type (or is "unknown").</summary>
    private const double UnrecognizedProvidedConfidence = 0.6;

    /// <summary>Confidence reported when the heuristic matches no keyword term at all.</summary>
    private const double NoMatchConfidence = 0.25;

    /// <summary>Baseline confidence of a heuristic match, before the per-term bonus.</summary>
    private const double BaseConfidence = 0.45;

    /// <summary>Confidence added for every matched keyword term of the winning type.</summary>
    private const double ConfidencePerHit = 0.08;

    /// <summary>Upper bound for heuristic confidence.</summary>
    private const double MaxHeuristicConfidence = 0.95;

    /// <summary>A text fragment must be longer than this many characters to be used as a title.</summary>
    private const int MinTitleFragmentLength = 20;

    /// <summary>Maximum number of UTF-16 code units kept from an extracted title.</summary>
    private const int MaxTitleLength = 120;

    /// <summary>Matches every character that is not allowed in a type slug.</summary>
    private static readonly Regex UnsafeTypeCharacters = new("[^a-z0-9_-]", RegexOptions.Compiled);

    private static readonly HashSet<string> KnownTypes = new(StringComparer.OrdinalIgnoreCase)
    {
        "cv", "job", "article", "contract", "invoice", "product", "documentation", "unknown"
    };

    /// <summary>
    /// Characteristic terms per document type. The array order is significant: when several
    /// types reach the same hit count, the type listed first wins.
    /// </summary>
    private static readonly (string Type, string[] Terms)[] KeywordProfiles =
    {
        ("cv", new[]
        {
            "curriculum vitae", "resume", "work experience", "professional experience",
            "education", "skills", "technologies", "linkedin", "github"
        }),
        ("job", new[]
        {
            "job description", "requirements", "responsibilities", "qualifications", "apply",
            "we are looking", "salary", "benefits", "remote", "hybrid"
        }),
        ("contract", new[]
        {
            "agreement", "contract", "party", "parties", "liability", "termination",
            "confidentiality", "governing law"
        }),
        ("invoice", new[]
        {
            "invoice", "vat", "subtotal", "total", "amount due", "due date", "billing"
        }),
        ("documentation", new[]
        {
            "api", "endpoint", "configuration", "install", "usage", "parameters", "response", "request"
        }),
        ("product", new[]
        {
            "features", "pricing", "sku", "product", "specification", "warranty"
        })
    };

    /// <summary>
    /// Determines the document type, a confidence value and a title for the given text.
    /// </summary>
    /// <param name="text">Document text. A <c>null</c> value is treated as an empty string.</param>
    /// <param name="providedType">
    /// Optional caller-supplied type. When it is not blank it is normalized to a slug and used
    /// directly; the keyword heuristic is skipped.
    /// </param>
    /// <param name="providedTitle">Optional caller-supplied title; used (trimmed) when not blank.</param>
    /// <param name="ct">Not observed by this implementation; the result is computed synchronously.</param>
    /// <returns>
    /// A completed task holding the classification. With a provided type the confidence is
    /// 1.0 when the normalized type is a known type other than <c>unknown</c>, otherwise 0.6.
    /// With the heuristic the confidence is 0.25 when no term matches, otherwise
    /// 0.45 + 0.08 per matched term of the winning type, capped at 0.95.
    /// </returns>
    public Task<DocumentClassification> ClassifyAsync(string text, string? providedType, string? providedTitle, CancellationToken ct)
    {
        // Tolerate a missing body instead of failing with a NullReferenceException further down.
        text ??= string.Empty;

        if (!string.IsNullOrWhiteSpace(providedType))
        {
            var normalized = NormalizeType(providedType);
            var isRecognized = KnownTypes.Contains(normalized)
                && !string.Equals(normalized, UnknownType, StringComparison.Ordinal);

            return Task.FromResult(new DocumentClassification
            {
                DocumentType = normalized,
                Confidence = isRecognized ? KnownProvidedConfidence : UnrecognizedProvidedConfidence,
                Title = BuildTitle(providedTitle, text, normalized)
            });
        }

        // Keyword-frequency heuristic: count how many characteristic terms of each document
        // type occur in the text, then pick the type with the highest hit count.
        var lower = text.ToLowerInvariant();
        var bestType = UnknownType;
        var bestHits = 0;

        foreach (var (type, terms) in KeywordProfiles)
        {
            var hits = Count(lower, terms);

            // Strictly greater: on a tie the earlier profile keeps the win, which makes the
            // result independent of any collection's enumeration order.
            if (hits > bestHits)
            {
                bestHits = hits;
                bestType = type;
            }
        }

        // Zero hits means the type stays "unknown" with a low fixed confidence.
        var confidence = bestHits <= 0
            ? NoMatchConfidence
            : Math.Min(MaxHeuristicConfidence, BaseConfidence + bestHits * ConfidencePerHit);

        return Task.FromResult(new DocumentClassification
        {
            DocumentType = bestType,
            Confidence = confidence,
            Title = BuildTitle(providedTitle, text, bestType)
        });
    }

    /// <summary>
    /// Counts how many of the given <paramref name="terms"/> appear in the lower-cased text.
    /// </summary>
    /// <param name="lower">Text that has already been lower-cased.</param>
    /// <param name="terms">Lower-case terms to look for; each term counts at most once.</param>
    /// <returns>The number of terms that occur as an ordinal substring of <paramref name="lower"/>.</returns>
    private static int Count(string lower, params string[] terms) =>
        terms.Count(term => lower.Contains(term, StringComparison.Ordinal));

    /// <summary>
    /// Lowercases and replaces non-alphanumeric characters with hyphens to produce a safe type slug.
    /// </summary>
    /// <param name="value">Raw type value supplied by the caller.</param>
    /// <returns>
    /// The trimmed, lower-cased value with every character outside <c>a-z</c>, <c>0-9</c>,
    /// <c>_</c> and <c>-</c> replaced by <c>-</c>, or <c>unknown</c> when nothing remains.
    /// </returns>
    private static string NormalizeType(string value)
    {
        var cleaned = UnsafeTypeCharacters.Replace(value.Trim().ToLowerInvariant(), "-");
        return string.IsNullOrWhiteSpace(cleaned) ? UnknownType : cleaned;
    }

    /// <summary>
    /// Returns <paramref name="providedTitle"/> when available; otherwise extracts the first
    /// sentence-like fragment from the text, or falls back to a generic "{type} document" label.
    /// </summary>
    /// <param name="providedTitle">Optional caller-supplied title.</param>
    /// <param name="text">Document text to extract a title from.</param>
    /// <param name="documentType">Type slug used for the fallback label.</param>
    /// <returns>A title of at most 120 UTF-16 code units when extracted from the text.</returns>
    private static string BuildTitle(string? providedTitle, string text, string documentType)
    {
        if (!string.IsNullOrWhiteSpace(providedTitle))
        {
            return providedTitle.Trim();
        }

        // Fragments are delimited by periods and line breaks; the first one longer than
        // 20 characters (after trimming) is taken as the title.
        var fragment = text
            .Split('.', '\n', '\r')
            .Select(part => part.Trim())
            .FirstOrDefault(part => part.Length > MinTitleFragmentLength);

        if (fragment is null)
        {
            return $"{documentType} document";
        }

        if (fragment.Length <= MaxTitleLength)
        {
            return fragment;
        }

        // Do not cut between the two halves of a surrogate pair: that would leave a lone
        // high surrogate at the end of the title.
        var cut = char.IsHighSurrogate(fragment[MaxTitleLength - 1]) ? MaxTitleLength - 1 : MaxTitleLength;
        return fragment[..cut];
    }
}