feat(job-search): extract keywords from LLM match call instead of heuristics

Piggybacks keyword extraction onto the existing CV-to-job LLM call —
no extra API calls. The system prompt now instructs the model to return
8-12 English job-search terms (job titles, technologies, skills, domains)
in a new `keywords` field alongside the existing score/summary fields.

Keywords flow: LLM JSON → JobMatchResponse.Keywords → CreateJobSearchTokenRequest →
JobSearchTokenEntity.Keywords (stored comma-separated) → JobSearchSessionEntity.Keywords
(copied at session-creation time, no RAG call needed).

Changes:
- Add Keywords to JobMatchResponse, CreateJobSearchTokenRequest, JobSearchTokenEntity
- IJobTokenService.CreateTokenAsync now accepts IReadOnlyList<string> keywords
- JobTokenService: store keywords on token; TriggerStartAsync reads token.Keywords
  instead of fetching CV text from RAG — removes IRagApiClient dependency
- Remove heuristic ExtractKeywords method
- Migration AddKeywordsToJobSearchTokens: adds Keywords column to cvSearch.JobSearchTokens
- Migration UpdateCvMatchSystemPromptKeywords: updates ai.cv-match.system-prompt seed
  to include keywords in the JSON shape

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-29 12:44:13 +03:00
parent a467fac35d
commit b78ede23cf
13 changed files with 462 additions and 39 deletions
+1 -1
View File
@@ -181,7 +181,7 @@ public sealed class CvMatcherController : ControllerBase
try
{
var tokenResp = await _jobSearchApi.CreateTokenAsync(
new CreateJobSearchTokenRequest { CvDocumentId = request.CvDocumentId, Email = request.Email, Language = language },
new CreateJobSearchTokenRequest { CvDocumentId = request.CvDocumentId, Email = request.Email, Language = language, Keywords = res.Keywords },
ct);
if (!string.IsNullOrWhiteSpace(tokenResp.TokenId))
{
@@ -5,4 +5,5 @@ public sealed class CreateJobSearchTokenRequest
public string CvDocumentId { get; set; } = string.Empty;
public string Email { get; set; } = string.Empty;
public string Language { get; set; } = "en";
public List<string> Keywords { get; set; } = [];
}
@@ -8,6 +8,7 @@
public List<string> Gaps { get; set; } = [];
public List<string> Recommendations { get; set; } = [];
public List<string> Evidence { get; set; } = [];
public List<string> Keywords { get; set; } = [];
public bool Cached { get; set; }
public string? JobDocumentId { get; set; }
public string? JobUrl { get; set; }
@@ -53,7 +53,7 @@ public sealed class JobSearchController : ControllerBase
if (string.IsNullOrWhiteSpace(request.CvDocumentId) || string.IsNullOrWhiteSpace(request.Email))
return BadRequest(new ErrorResponse { Error = "CvDocumentId and Email are required.", Code = "invalid_request" });
var tokenId = await _tokenService.CreateTokenAsync(request.CvDocumentId, request.Email, request.Language, ct);
var tokenId = await _tokenService.CreateTokenAsync(request.CvDocumentId, request.Email, request.Language, request.Keywords, ct);
return Ok(new CreateJobSearchTokenResponse { TokenId = tokenId });
}
catch (Exception ex)
@@ -12,12 +12,13 @@ public interface IJobTokenService
/// <param name="cvDocumentId">Identifier of the indexed CV document.</param>
/// <param name="email">Email address of the user who will receive the results.</param>
/// <param name="language">Preferred language for result emails (e.g. <c>"en"</c>, <c>"ro"</c>).</param>
/// <param name="keywords">Job search keywords extracted by the LLM during the match call.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>
/// The generated token ID to embed in the one-click job search link,
/// or <c>null</c> when no job providers are currently enabled (link should be suppressed).
/// </returns>
Task<string?> CreateTokenAsync(string cvDocumentId, string email, string language, CancellationToken ct);
Task<string?> CreateTokenAsync(string cvDocumentId, string email, string language, IReadOnlyList<string> keywords, CancellationToken ct);
/// <summary>
/// Validates the token and, if valid, marks it as used and creates a <c>Pending</c> job search session.
@@ -1,6 +1,4 @@
using System.Text.Json;
using System.Text.RegularExpressions;
using Api.Clients.Api.Contracts;
using Api.Services.Contracts;
using CvMatcher.Models.Responses;
using CvSearch.Data;
@@ -16,28 +14,27 @@ namespace Api.Services;
/// Provider configuration is read from <c>cvSearch.JobProviders</c> at session-creation time and
/// snapshotted into <c>JobSearchSessionEntity.ProviderConfigJson</c> so subsequent config changes
/// do not affect already-queued sessions.
/// Keywords are extracted by the LLM during the CV-to-job match call and stored on the token,
/// then copied to the session when the user clicks the link — no extra RAG call needed.
/// </summary>
public sealed class JobTokenService : IJobTokenService
{
private readonly CvSearchDbContext _db;
private readonly IRagApiClient _rag;
private readonly JobSearchSettings _settings;
private readonly ILogger<JobTokenService> _logger;
public JobTokenService(
CvSearchDbContext db,
IRagApiClient rag,
IOptions<JobSearchSettings> settings,
ILogger<JobTokenService> logger)
{
_db = db;
_rag = rag;
_settings = settings.Value;
_logger = logger;
}
/// <inheritdoc />
public async Task<string?> CreateTokenAsync(string cvDocumentId, string email, string language, CancellationToken ct)
public async Task<string?> CreateTokenAsync(string cvDocumentId, string email, string language, IReadOnlyList<string> keywords, CancellationToken ct)
{
var hasEnabledProviders = await _db.JobProviders.AnyAsync(p => p.Enabled, ct);
if (!hasEnabledProviders)
@@ -52,6 +49,7 @@ public sealed class JobTokenService : IJobTokenService
CvDocumentId = cvDocumentId,
Email = email,
Language = language,
Keywords = string.Join(",", keywords),
ExpiresAt = DateTime.UtcNow.AddDays(_settings.TokenExpiryDays),
Used = false,
CreatedAt = DateTime.UtcNow
@@ -59,7 +57,7 @@ public sealed class JobTokenService : IJobTokenService
_db.JobSearchTokens.Add(token);
await _db.SaveChangesAsync(ct);
_logger.LogInformation("Job search token created. TokenId={TokenId}, CvDocumentId={CvDocumentId}", token.Id, cvDocumentId);
_logger.LogInformation("Job search token created. TokenId={TokenId}, CvDocumentId={CvDocumentId}, Keywords={Keywords}", token.Id, cvDocumentId, token.Keywords);
return token.Id;
}
@@ -74,8 +72,7 @@ public sealed class JobTokenService : IJobTokenService
token.Used = true;
await _db.SaveChangesAsync(ct);
var cv = await _rag.GetDocumentAsync(token.CvDocumentId, ct);
var keywords = cv is not null ? ExtractKeywords(cv.Text) : string.Empty;
var keywords = token.Keywords;
var enabledProviders = await _db.JobProviders
.Where(p => p.Enabled)
@@ -108,10 +105,6 @@ public sealed class JobTokenService : IJobTokenService
return StartJobSearchStatus.Started;
}
/// <summary>
/// Maps a <see cref="JobProviderEntity"/> to the <see cref="JobProviderConfig"/> DTO used by
/// <c>cv-search-job</c>. The <c>InitialKeywords</c> list is stored as a JSON array in the entity.
/// </summary>
private static JobProviderConfig ToConfig(JobProviderEntity entity)
{
List<string> keywords;
@@ -136,27 +129,4 @@ public sealed class JobTokenService : IJobTokenService
};
}
/// <summary>
/// Extracts up to 10 meaningful keywords from the CV text using simple heuristics (no LLM).
/// Samples the first 2000 characters (where title/role/skills usually appear), splits by
/// whitespace and common delimiters, strips punctuation, and deduplicates.
/// Works regardless of whether the PDF extractor preserves newlines.
/// </summary>
private static string ExtractKeywords(string cvText)
{
// Focus on the header area where name/title/skills typically appear
var sample = cvText.Length > 2000 ? cvText[..2000] : cvText;
var words = sample
.Split([' ', '\n', '\r', '\t', '|', '/', ',', ';', '(', ')'], StringSplitOptions.RemoveEmptyEntries)
.Select(w => Regex.Replace(w, @"[^\w\-]", "").Trim('-'))
.Where(w => w.Length > 2)
.Where(w => !Regex.IsMatch(w, @"^[\d\-]+$")) // skip phone fragments and pure numbers
.Where(w => !w.Contains('@') && !w.Contains('.')) // skip emails and URLs
.Distinct(StringComparer.OrdinalIgnoreCase)
.Take(10)
.ToList();
return string.Join(",", words);
}
}
@@ -0,0 +1,130 @@
// <auto-generated />
using System;
using CvMatcher.Data;
using Microsoft.EntityFrameworkCore;
using Microsoft.EntityFrameworkCore.Infrastructure;
using Microsoft.EntityFrameworkCore.Metadata;
using Microsoft.EntityFrameworkCore.Migrations;
using Microsoft.EntityFrameworkCore.Storage.ValueConversion;
#nullable disable
namespace CvMatcher.Data.Migrations
{
[DbContext(typeof(CvMatcherDbContext))]
[Migration("20260529140000_UpdateCvMatchSystemPromptKeywords")]
partial class UpdateCvMatchSystemPromptKeywords
{
/// <inheritdoc />
protected override void BuildTargetModel(ModelBuilder modelBuilder)
{
#pragma warning disable 612, 618
modelBuilder
.HasDefaultSchema("cvMatcher")
.HasAnnotation("ProductVersion", "10.0.7")
.HasAnnotation("Relational:MaxIdentifierLength", 128);
SqlServerModelBuilderExtensions.UseIdentityColumns(modelBuilder);
modelBuilder.Entity("CvMatcher.Data.Entities.AiPromptEntity", b =>
{
b.Property<string>("Key")
.HasMaxLength(128)
.HasColumnType("nvarchar(128)");
b.Property<string>("Language")
.HasMaxLength(8)
.HasColumnType("nvarchar(8)");
b.Property<string>("Description")
.IsRequired()
.ValueGeneratedOnAdd()
.HasMaxLength(500)
.HasColumnType("nvarchar(500)")
.HasDefaultValue("");
b.Property<DateTime>("UpdatedAt")
.ValueGeneratedOnAdd()
.HasColumnType("datetime2")
.HasDefaultValueSql("SYSUTCDATETIME()");
b.Property<string>("Value")
.IsRequired()
.HasColumnType("nvarchar(max)");
b.HasKey("Key", "Language");
b.ToTable("AiPrompts", "cvMatcher");
});
modelBuilder.Entity("CvMatcher.Data.Entities.CvMatchResultEntity", b =>
{
b.Property<string>("Id")
.HasMaxLength(64)
.HasColumnType("nvarchar(64)");
b.Property<DateTime>("CreatedAt")
.ValueGeneratedOnAdd()
.HasColumnType("datetime2")
.HasDefaultValueSql("SYSUTCDATETIME()");
b.Property<string>("CvDocumentId")
.IsRequired()
.HasMaxLength(64)
.HasColumnType("nvarchar(64)");
b.Property<string>("JobDocumentId")
.IsRequired()
.HasMaxLength(64)
.HasColumnType("nvarchar(64)");
b.Property<string>("Language")
.IsRequired()
.HasColumnType("nvarchar(max)");
b.Property<string>("ResultJson")
.IsRequired()
.HasColumnType("nvarchar(max)");
b.Property<int>("Score")
.HasColumnType("int");
b.HasKey("Id");
b.HasIndex("CvDocumentId", "JobDocumentId")
.IsUnique();
b.ToTable("Results", "cvMatcher");
});
modelBuilder.Entity("CvMatcher.Data.Entities.CvMatcherChatCacheEntity", b =>
{
b.Property<string>("CacheKey")
.HasMaxLength(64)
.HasColumnType("nvarchar(64)");
b.Property<DateTime>("CreatedAt")
.ValueGeneratedOnAdd()
.HasColumnType("datetime2")
.HasDefaultValueSql("SYSUTCDATETIME()");
b.Property<string>("Model")
.IsRequired()
.HasMaxLength(120)
.HasColumnType("nvarchar(120)");
b.Property<string>("ResponseText")
.IsRequired()
.HasColumnType("nvarchar(max)");
b.Property<decimal>("Temperature")
.HasColumnType("decimal(4,2)");
b.HasKey("CacheKey");
b.ToTable("ChatCache", "cvMatcher");
});
#pragma warning restore 612, 618
}
}
}
@@ -0,0 +1,49 @@
using Microsoft.EntityFrameworkCore.Migrations;
using CvMatcher.Data;
#nullable disable
namespace CvMatcher.Data.Migrations
{
/// <inheritdoc />
public partial class UpdateCvMatchSystemPromptKeywords : Migration
{
private const string OldPrompt =
"You are a strict CV-to-job matching engine. Return JSON only. Score realistically from 0 to 100.\n" +
"Penalize missing required skills. Do not invent experience. Use concise business language.\n" +
"Respond entirely in {{languageName}} — all text fields in the JSON must be in {{languageName}}.\n" +
"JSON shape: {\"score\":number,\"summary\":\"...\",\"strengths\":[\"...\"],\"gaps\":[\"...\"],\"recommendations\":[\"...\"],\"evidence\":[\"...\"]}";
private const string NewPrompt =
"You are a strict CV-to-job matching engine. Return JSON only. Score realistically from 0 to 100.\n" +
"Penalize missing required skills. Do not invent experience. Use concise business language.\n" +
"Respond entirely in {{languageName}} — all text fields in the JSON must be in {{languageName}}.\n" +
"Also extract 8 to 12 English job search keywords from the CV — job titles, technologies, skills, and domains.\n" +
"The keywords array must always be in English regardless of {{languageName}}. Exclude names, emails, phone numbers, and locations.\n" +
"JSON shape: {\"score\":number,\"summary\":\"...\",\"strengths\":[\"...\"],\"gaps\":[\"...\"],\"recommendations\":[\"...\"],\"evidence\":[\"...\"],\"keywords\":[\"term1\",\"term2\"]}";
/// <inheritdoc />
protected override void Up(MigrationBuilder migrationBuilder)
{
migrationBuilder.UpdateData(
schema: MigrationConstants.SchemaName,
table: "AiPrompts",
keyColumns: ["Key", "Language"],
keyValues: new object[] { "ai.cv-match.system-prompt", "*" },
column: "Value",
value: NewPrompt);
}
/// <inheritdoc />
protected override void Down(MigrationBuilder migrationBuilder)
{
migrationBuilder.UpdateData(
schema: MigrationConstants.SchemaName,
table: "AiPrompts",
keyColumns: ["Key", "Language"],
keyValues: new object[] { "ai.cv-match.system-prompt", "*" },
column: "Value",
value: OldPrompt);
}
}
}
@@ -34,6 +34,7 @@ public sealed class CvSearchDbContext : DbContext
entity.Property(x => x.CvDocumentId).HasMaxLength(64).IsRequired();
entity.Property(x => x.Email).HasMaxLength(256).IsRequired();
entity.Property(x => x.Language).HasMaxLength(8).HasDefaultValue("en").IsRequired();
entity.Property(x => x.Keywords).HasMaxLength(1000).HasDefaultValue(string.Empty);
entity.Property(x => x.Used).HasDefaultValue(false);
entity.Property(x => x.CreatedAt).HasDefaultValueSql("SYSUTCDATETIME()");
});
@@ -9,4 +9,5 @@ public sealed class JobSearchTokenEntity : BaseEntity
public string Language { get; set; } = "en";
public DateTime ExpiresAt { get; set; }
public bool Used { get; set; }
public string Keywords { get; set; } = string.Empty;
}
@@ -0,0 +1,229 @@
// <auto-generated />
using System;
using CvSearch.Data;
using Microsoft.EntityFrameworkCore;
using Microsoft.EntityFrameworkCore.Infrastructure;
using Microsoft.EntityFrameworkCore.Metadata;
using Microsoft.EntityFrameworkCore.Migrations;
using Microsoft.EntityFrameworkCore.Storage.ValueConversion;
#nullable disable
namespace CvSearch.Data.Migrations
{
[DbContext(typeof(CvSearchDbContext))]
[Migration("20260529130000_AddKeywordsToJobSearchTokens")]
partial class AddKeywordsToJobSearchTokens
{
/// <inheritdoc />
protected override void BuildTargetModel(ModelBuilder modelBuilder)
{
#pragma warning disable 612, 618
modelBuilder
.HasDefaultSchema("cvSearch")
.HasAnnotation("ProductVersion", "10.0.7")
.HasAnnotation("Relational:MaxIdentifierLength", 128);
SqlServerModelBuilderExtensions.UseIdentityColumns(modelBuilder);
modelBuilder.Entity("CvSearch.Data.Entities.JobProviderEntity", b =>
{
b.Property<int>("Id")
.ValueGeneratedOnAdd()
.HasColumnType("int");
SqlServerPropertyBuilderExtensions.UseIdentityColumn(b.Property<int>("Id"));
b.Property<int>("DisplayOrder")
.ValueGeneratedOnAdd()
.HasColumnType("int")
.HasDefaultValue(0);
b.Property<bool>("Enabled")
.HasColumnType("bit");
b.Property<string>("InitialKeywordsJson")
.IsRequired()
.ValueGeneratedOnAdd()
.HasMaxLength(2000)
.HasColumnType("nvarchar(2000)")
.HasDefaultValue("[]");
b.Property<string>("JobLinkContains")
.IsRequired()
.HasMaxLength(256)
.HasColumnType("nvarchar(256)");
b.Property<int>("MaxResults")
.ValueGeneratedOnAdd()
.HasColumnType("int")
.HasDefaultValue(20);
b.Property<string>("Name")
.IsRequired()
.HasMaxLength(128)
.HasColumnType("nvarchar(128)");
b.Property<string>("SearchUrlTemplate")
.IsRequired()
.HasMaxLength(1024)
.HasColumnType("nvarchar(1024)");
b.HasKey("Id");
b.ToTable("JobProviders", "cvSearch");
});
modelBuilder.Entity("CvSearch.Data.Entities.JobSearchResultEntity", b =>
{
b.Property<string>("Id")
.HasMaxLength(64)
.HasColumnType("nvarchar(64)");
b.Property<DateTime>("CreatedAt")
.ValueGeneratedOnAdd()
.HasColumnType("datetime2")
.HasDefaultValueSql("SYSUTCDATETIME()");
b.Property<string>("JobText")
.IsRequired()
.HasColumnType("nvarchar(max)");
b.Property<string>("JobTitle")
.IsRequired()
.HasMaxLength(512)
.HasColumnType("nvarchar(512)");
b.Property<string>("JobUrl")
.IsRequired()
.HasMaxLength(2048)
.HasColumnType("nvarchar(2048)");
b.Property<string>("ProviderName")
.IsRequired()
.HasMaxLength(128)
.HasColumnType("nvarchar(128)");
b.Property<string>("ResultJson")
.IsRequired()
.HasColumnType("nvarchar(max)");
b.Property<int>("Score")
.HasColumnType("int");
b.Property<string>("SessionId")
.IsRequired()
.HasMaxLength(64)
.HasColumnType("nvarchar(64)");
b.HasKey("Id");
b.HasIndex("SessionId");
b.ToTable("JobSearchResults", "cvSearch");
});
modelBuilder.Entity("CvSearch.Data.Entities.JobSearchSessionEntity", b =>
{
b.Property<string>("Id")
.HasMaxLength(64)
.HasColumnType("nvarchar(64)");
b.Property<DateTime>("CreatedAt")
.ValueGeneratedOnAdd()
.HasColumnType("datetime2")
.HasDefaultValueSql("SYSUTCDATETIME()");
b.Property<string>("CvDocumentId")
.IsRequired()
.HasMaxLength(64)
.HasColumnType("nvarchar(64)");
b.Property<string>("Email")
.IsRequired()
.HasMaxLength(256)
.HasColumnType("nvarchar(256)");
b.Property<string>("Keywords")
.IsRequired()
.HasMaxLength(1000)
.HasColumnType("nvarchar(1000)");
b.Property<string>("Language")
.IsRequired()
.ValueGeneratedOnAdd()
.HasMaxLength(8)
.HasColumnType("nvarchar(8)")
.HasDefaultValue("en");
b.Property<string>("ProviderConfigJson")
.HasColumnType("nvarchar(max)");
b.Property<string>("Status")
.IsRequired()
.HasMaxLength(32)
.HasColumnType("nvarchar(32)");
b.Property<string>("TokenId")
.IsRequired()
.HasMaxLength(64)
.HasColumnType("nvarchar(64)");
b.HasKey("Id");
b.HasIndex("Status");
b.ToTable("JobSearchSessions", "cvSearch");
});
modelBuilder.Entity("CvSearch.Data.Entities.JobSearchTokenEntity", b =>
{
b.Property<string>("Id")
.HasMaxLength(64)
.HasColumnType("nvarchar(64)");
b.Property<DateTime>("CreatedAt")
.ValueGeneratedOnAdd()
.HasColumnType("datetime2")
.HasDefaultValueSql("SYSUTCDATETIME()");
b.Property<string>("CvDocumentId")
.IsRequired()
.HasMaxLength(64)
.HasColumnType("nvarchar(64)");
b.Property<string>("Email")
.IsRequired()
.HasMaxLength(256)
.HasColumnType("nvarchar(256)");
b.Property<DateTime>("ExpiresAt")
.HasColumnType("datetime2");
b.Property<string>("Keywords")
.IsRequired()
.ValueGeneratedOnAdd()
.HasMaxLength(1000)
.HasColumnType("nvarchar(1000)")
.HasDefaultValue("");
b.Property<string>("Language")
.IsRequired()
.ValueGeneratedOnAdd()
.HasMaxLength(8)
.HasColumnType("nvarchar(8)")
.HasDefaultValue("en");
b.Property<bool>("Used")
.ValueGeneratedOnAdd()
.HasColumnType("bit")
.HasDefaultValue(false);
b.HasKey("Id");
b.ToTable("JobSearchTokens", "cvSearch");
});
#pragma warning restore 612, 618
}
}
}
@@ -0,0 +1,33 @@
using Microsoft.EntityFrameworkCore.Migrations;
using CvSearch.Data;
#nullable disable
namespace CvSearch.Data.Migrations
{
/// <inheritdoc />
public partial class AddKeywordsToJobSearchTokens : Migration
{
/// <inheritdoc />
protected override void Up(MigrationBuilder migrationBuilder)
{
migrationBuilder.AddColumn<string>(
name: "Keywords",
schema: MigrationConstants.SchemaName,
table: "JobSearchTokens",
type: "nvarchar(1000)",
maxLength: 1000,
nullable: false,
defaultValue: "");
}
/// <inheritdoc />
protected override void Down(MigrationBuilder migrationBuilder)
{
migrationBuilder.DropColumn(
name: "Keywords",
schema: MigrationConstants.SchemaName,
table: "JobSearchTokens");
}
}
}
@@ -197,6 +197,13 @@ namespace CvSearch.Data.Migrations
b.Property<DateTime>("ExpiresAt")
.HasColumnType("datetime2");
b.Property<string>("Keywords")
.IsRequired()
.ValueGeneratedOnAdd()
.HasMaxLength(1000)
.HasColumnType("nvarchar(1000)")
.HasDefaultValue("");
b.Property<string>("Language")
.IsRequired()
.ValueGeneratedOnAdd()