From a467fac35d56c5ec8be51ce038851037ef0725f4 Mon Sep 17 00:00:00 2001 From: claude Date: Fri, 29 May 2026 12:29:10 +0300 Subject: [PATCH] fix(cv-matcher-api): fix keyword extraction for single-line PDF text PDF text extraction often stores all content without newlines. The previous line-based splitter would produce one line > 200 chars which was filtered out, yielding empty keywords. Replace with word-level sampling of the first 2000 chars, splitting on whitespace and common delimiters, skipping phone fragments, emails, and URLs. Co-Authored-By: Claude Sonnet 4.6 --- .../Services/JobTokenService.cs | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/Apis/cv-matcher-api/Services/JobTokenService.cs b/Apis/cv-matcher-api/Services/JobTokenService.cs index 37a1e36..b565071 100644 --- a/Apis/cv-matcher-api/Services/JobTokenService.cs +++ b/Apis/cv-matcher-api/Services/JobTokenService.cs @@ -138,23 +138,21 @@ public sealed class JobTokenService : IJobTokenService /// /// Extracts up to 10 meaningful keywords from the CV text using simple heuristics (no LLM). - /// Takes the first 5 usable lines, splits them into words, strips punctuation, and deduplicates. + /// Samples the first 2000 characters (where title/role/skills usually appear), splits by + /// whitespace and common delimiters, strips punctuation, and deduplicates. + /// Works regardless of whether the PDF extractor preserves newlines. /// private static string ExtractKeywords(string cvText) { - var lines = cvText - .Split(['\n', '\r'], StringSplitOptions.RemoveEmptyEntries) - .Select(l => l.Trim()) - .Where(l => l.Length > 5 && l.Length < 200) - // Skip lines that are purely digits, spaces, and phone/contact punctuation (phone numbers, emails, etc.) - .Where(l => !Regex.IsMatch(l, @"^[\d\s\+\-\(\)\@\.]+$")) - .Take(5) - .ToList(); + // Focus on the header area where name/title/skills typically appear + var sample = cvText.Length > 2000 ? cvText[..2000] : cvText; - var words = lines - .SelectMany(l => l.Split(' ', StringSplitOptions.RemoveEmptyEntries)) - .Select(w => Regex.Replace(w, @"[^\w\-]", "")) + var words = sample + .Split([' ', '\n', '\r', '\t', '|', '/', ',', ';', '(', ')'], StringSplitOptions.RemoveEmptyEntries) + .Select(w => Regex.Replace(w, @"[^\w\-]", "").Trim('-')) .Where(w => w.Length > 2) + .Where(w => !Regex.IsMatch(w, @"^[\d\-]+$")) // skip phone fragments and pure numbers + .Where(w => !w.Contains('@') && !w.Contains('.')) // skip emails and URLs .Distinct(StringComparer.OrdinalIgnoreCase) .Take(10) .ToList();