From a467fac35d56c5ec8be51ce038851037ef0725f4 Mon Sep 17 00:00:00 2001
From: claude <gelu@mihes.ro>
Date: Fri, 29 May 2026 12:29:10 +0300
Subject: [PATCH] fix(cv-matcher-api): fix keyword extraction for single-line
 PDF text

PDF text extraction often returns the whole document as a single line with no
newlines. The previous line-based splitter then produced one line of 200 or
more characters, which its length filter discarded, yielding empty keywords.
Replace it with word-level sampling of the first 2000 characters: split on
whitespace and common delimiters, and skip phone fragments, emails, and URLs.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../Services/JobTokenService.cs               | 22 +++++++++----------
 1 file changed, 10 insertions(+), 12 deletions(-)
diff --git a/Apis/cv-matcher-api/Services/JobTokenService.cs b/Apis/cv-matcher-api/Services/JobTokenService.cs
index 37a1e36..b565071 100644
--- a/Apis/cv-matcher-api/Services/JobTokenService.cs
+++ b/Apis/cv-matcher-api/Services/JobTokenService.cs
@@ -138,23 +138,21 @@ public sealed class JobTokenService : IJobTokenService
 
     /// <summary>
     /// Extracts up to 10 meaningful keywords from the CV text using simple heuristics (no LLM).
-    /// Takes the first 5 usable lines, splits them into words, strips punctuation, and deduplicates.
+    /// Samples the first 2000 characters (where title/role/skills usually appear), splits by
+    /// whitespace and common delimiters, strips punctuation, and deduplicates.
+    /// Works regardless of whether the PDF extractor preserves newlines.
     /// </summary>
     private static string ExtractKeywords(string cvText)
     {
-        var lines = cvText
-            .Split(['\n', '\r'], StringSplitOptions.RemoveEmptyEntries)
-            .Select(l => l.Trim())
-            .Where(l => l.Length > 5 && l.Length < 200)
-            // Skip lines that are purely digits, spaces, and phone/contact punctuation (phone numbers, emails, etc.)
-            .Where(l => !Regex.IsMatch(l, @"^[\d\s\+\-\(\)\@\.]+$"))
-            .Take(5)
-            .ToList();
+        // Focus on the header area where name/title/skills typically appear
+        var sample = cvText.Length > 2000 ? cvText[..2000] : cvText;
 
-        var words = lines
-            .SelectMany(l => l.Split(' ', StringSplitOptions.RemoveEmptyEntries))
-            .Select(w => Regex.Replace(w, @"[^\w\-]", ""))
+        var words = sample
+            .Split([' ', '\n', '\r', '\t', '|', '/', ',', ';', '(', ')'], StringSplitOptions.RemoveEmptyEntries)
+            .Select(w => Regex.Replace(w, @"[^\w\-]", "").Trim('-'))
             .Where(w => w.Length > 2)
+            .Where(w => !Regex.IsMatch(w, @"^[\d\-]+$"))  // skip phone fragments and pure numbers
+            .Where(w => !w.Contains('@') && !w.Contains('.'))  // skip emails and URLs
             .Distinct(StringComparer.OrdinalIgnoreCase)
             .Take(10)
             .ToList();