fix(cv-matcher-api): fix keyword extraction for single-line PDF text
PDF text extraction often returns all content as a single line with no newlines. The previous line-based splitter then produced one line longer than 200 characters, which its length filter discarded, yielding empty keywords. Replace it with word-level sampling of the first 2000 characters: split on whitespace and common delimiters, and skip phone fragments, emails, and URLs. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -138,23 +138,21 @@ public sealed class JobTokenService : IJobTokenService
|
|||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Extracts up to 10 meaningful keywords from the CV text using simple heuristics (no LLM).
|
/// Extracts up to 10 meaningful keywords from the CV text using simple heuristics (no LLM).
|
||||||
/// Takes the first 5 usable lines, splits them into words, strips punctuation, and deduplicates.
|
/// Samples the first 2000 characters (where title/role/skills usually appear), splits by
|
||||||
|
/// whitespace and common delimiters, strips punctuation, and deduplicates.
|
||||||
|
/// Works regardless of whether the PDF extractor preserves newlines.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
private static string ExtractKeywords(string cvText)
|
private static string ExtractKeywords(string cvText)
|
||||||
{
|
{
|
||||||
var lines = cvText
|
// Focus on the header area where name/title/skills typically appear
|
||||||
.Split(['\n', '\r'], StringSplitOptions.RemoveEmptyEntries)
|
var sample = cvText.Length > 2000 ? cvText[..2000] : cvText;
|
||||||
.Select(l => l.Trim())
|
|
||||||
.Where(l => l.Length > 5 && l.Length < 200)
|
|
||||||
// Skip lines that are purely digits, spaces, and phone/contact punctuation (phone numbers, emails, etc.)
|
|
||||||
.Where(l => !Regex.IsMatch(l, @"^[\d\s\+\-\(\)\@\.]+$"))
|
|
||||||
.Take(5)
|
|
||||||
.ToList();
|
|
||||||
|
|
||||||
var words = lines
|
var words = sample
|
||||||
.SelectMany(l => l.Split(' ', StringSplitOptions.RemoveEmptyEntries))
|
.Split([' ', '\n', '\r', '\t', '|', '/', ',', ';', '(', ')'], StringSplitOptions.RemoveEmptyEntries)
|
||||||
.Select(w => Regex.Replace(w, @"[^\w\-]", ""))
|
.Select(w => Regex.Replace(w, @"[^\w\-]", "").Trim('-'))
|
||||||
.Where(w => w.Length > 2)
|
.Where(w => w.Length > 2)
|
||||||
|
.Where(w => !Regex.IsMatch(w, @"^[\d\-]+$")) // skip phone fragments and pure numbers
|
||||||
|
.Where(w => !w.Contains('@') && !w.Contains('.')) // skip emails and URLs
|
||||||
.Distinct(StringComparer.OrdinalIgnoreCase)
|
.Distinct(StringComparer.OrdinalIgnoreCase)
|
||||||
.Take(10)
|
.Take(10)
|
||||||
.ToList();
|
.ToList();
|
||||||
|
|||||||
Reference in New Issue
Block a user