feat(job-search): extract keywords from LLM match call instead of heuristics
Piggybacks keyword extraction onto the existing CV-to-job LLM call — no extra API calls. The system prompt now instructs the model to return 8-12 English job-search terms (job titles, technologies, skills, domains) in a new `keywords` field alongside the existing score/summary fields.

Keywords flow: LLM JSON → JobMatchResponse.Keywords → CreateJobSearchTokenRequest → JobSearchTokenEntity.Keywords (stored comma-separated) → JobSearchSessionEntity.Keywords (copied at session-creation time, no RAG call needed).

Changes:
- Add Keywords to JobMatchResponse, CreateJobSearchTokenRequest, JobSearchTokenEntity
- IJobTokenService.CreateTokenAsync now accepts IReadOnlyList<string> keywords
- JobTokenService: store keywords on token; TriggerStartAsync reads token.Keywords instead of fetching CV text from RAG — removes IRagApiClient dependency
- Remove heuristic ExtractKeywords method
- Migration AddKeywordsToJobSearchTokens: adds Keywords column to cvSearch.JobSearchTokens
- Migration UpdateCvMatchSystemPromptKeywords: updates ai.cv-match.system-prompt seed to include keywords in the JSON shape

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -12,12 +12,13 @@ public interface IJobTokenService
|
||||
/// <param name="cvDocumentId">Identifier of the indexed CV document.</param>
|
||||
/// <param name="email">Email address of the user who will receive the results.</param>
|
||||
/// <param name="language">Preferred language for result emails (e.g. <c>"en"</c>, <c>"ro"</c>).</param>
|
||||
/// <param name="keywords">Job search keywords extracted by the LLM during the match call.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>
|
||||
/// The generated token ID to embed in the one-click job search link,
|
||||
/// or <c>null</c> when no job providers are currently enabled (link should be suppressed).
|
||||
/// </returns>
|
||||
Task<string?> CreateTokenAsync(string cvDocumentId, string email, string language, CancellationToken ct);
|
||||
Task<string?> CreateTokenAsync(string cvDocumentId, string email, string language, IReadOnlyList<string> keywords, CancellationToken ct);
|
||||
|
||||
/// <summary>
|
||||
/// Validates the token and, if valid, marks it as used and creates a <c>Pending</c> job search session.
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
using System.Text.Json;
|
||||
using System.Text.RegularExpressions;
|
||||
using Api.Clients.Api.Contracts;
|
||||
using Api.Services.Contracts;
|
||||
using CvMatcher.Models.Responses;
|
||||
using CvSearch.Data;
|
||||
@@ -16,28 +14,27 @@ namespace Api.Services;
|
||||
/// Provider configuration is read from <c>cvSearch.JobProviders</c> at session-creation time and
|
||||
/// snapshotted into <c>JobSearchSessionEntity.ProviderConfigJson</c> so subsequent config changes
|
||||
/// do not affect already-queued sessions.
|
||||
/// Keywords are extracted by the LLM during the CV-to-job match call and stored on the token,
|
||||
/// then copied to the session when the user clicks the link — no extra RAG call needed.
|
||||
/// </summary>
|
||||
public sealed class JobTokenService : IJobTokenService
|
||||
{
|
||||
private readonly CvSearchDbContext _db;
|
||||
private readonly IRagApiClient _rag;
|
||||
private readonly JobSearchSettings _settings;
|
||||
private readonly ILogger<JobTokenService> _logger;
|
||||
|
||||
public JobTokenService(
|
||||
CvSearchDbContext db,
|
||||
IRagApiClient rag,
|
||||
IOptions<JobSearchSettings> settings,
|
||||
ILogger<JobTokenService> logger)
|
||||
{
|
||||
_db = db;
|
||||
_rag = rag;
|
||||
_settings = settings.Value;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<string?> CreateTokenAsync(string cvDocumentId, string email, string language, CancellationToken ct)
|
||||
public async Task<string?> CreateTokenAsync(string cvDocumentId, string email, string language, IReadOnlyList<string> keywords, CancellationToken ct)
|
||||
{
|
||||
var hasEnabledProviders = await _db.JobProviders.AnyAsync(p => p.Enabled, ct);
|
||||
if (!hasEnabledProviders)
|
||||
@@ -52,6 +49,7 @@ public sealed class JobTokenService : IJobTokenService
|
||||
CvDocumentId = cvDocumentId,
|
||||
Email = email,
|
||||
Language = language,
|
||||
Keywords = string.Join(",", keywords),
|
||||
ExpiresAt = DateTime.UtcNow.AddDays(_settings.TokenExpiryDays),
|
||||
Used = false,
|
||||
CreatedAt = DateTime.UtcNow
|
||||
@@ -59,7 +57,7 @@ public sealed class JobTokenService : IJobTokenService
|
||||
|
||||
_db.JobSearchTokens.Add(token);
|
||||
await _db.SaveChangesAsync(ct);
|
||||
_logger.LogInformation("Job search token created. TokenId={TokenId}, CvDocumentId={CvDocumentId}", token.Id, cvDocumentId);
|
||||
_logger.LogInformation("Job search token created. TokenId={TokenId}, CvDocumentId={CvDocumentId}, Keywords={Keywords}", token.Id, cvDocumentId, token.Keywords);
|
||||
return token.Id;
|
||||
}
|
||||
|
||||
@@ -74,8 +72,7 @@ public sealed class JobTokenService : IJobTokenService
|
||||
token.Used = true;
|
||||
await _db.SaveChangesAsync(ct);
|
||||
|
||||
var cv = await _rag.GetDocumentAsync(token.CvDocumentId, ct);
|
||||
var keywords = cv is not null ? ExtractKeywords(cv.Text) : string.Empty;
|
||||
var keywords = token.Keywords;
|
||||
|
||||
var enabledProviders = await _db.JobProviders
|
||||
.Where(p => p.Enabled)
|
||||
@@ -108,10 +105,6 @@ public sealed class JobTokenService : IJobTokenService
|
||||
return StartJobSearchStatus.Started;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Maps a <see cref="JobProviderEntity"/> to the <see cref="JobProviderConfig"/> DTO used by
|
||||
/// <c>cv-search-job</c>. The <c>InitialKeywords</c> list is stored as a JSON array in the entity.
|
||||
/// </summary>
|
||||
private static JobProviderConfig ToConfig(JobProviderEntity entity)
|
||||
{
|
||||
List<string> keywords;
|
||||
@@ -136,27 +129,4 @@ public sealed class JobTokenService : IJobTokenService
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Extracts up to 10 meaningful keywords from the CV text using simple heuristics (no LLM).
|
||||
/// Samples the first 2000 characters (where title/role/skills usually appear), splits by
|
||||
/// whitespace and common delimiters, strips punctuation, and deduplicates.
|
||||
/// Works regardless of whether the PDF extractor preserves newlines.
|
||||
/// </summary>
|
||||
private static string ExtractKeywords(string cvText)
|
||||
{
|
||||
// Focus on the header area where name/title/skills typically appear
|
||||
var sample = cvText.Length > 2000 ? cvText[..2000] : cvText;
|
||||
|
||||
var words = sample
|
||||
.Split([' ', '\n', '\r', '\t', '|', '/', ',', ';', '(', ')'], StringSplitOptions.RemoveEmptyEntries)
|
||||
.Select(w => Regex.Replace(w, @"[^\w\-]", "").Trim('-'))
|
||||
.Where(w => w.Length > 2)
|
||||
.Where(w => !Regex.IsMatch(w, @"^[\d\-]+$")) // skip phone fragments and pure numbers
|
||||
.Where(w => !w.Contains('@') && !w.Contains('.')) // skip emails and URLs
|
||||
.Distinct(StringComparer.OrdinalIgnoreCase)
|
||||
.Take(10)
|
||||
.ToList();
|
||||
|
||||
return string.Join(",", words);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user