a467fac35d
PDF text extraction often stores all content without newlines. The previous line-based splitter would produce one line > 200 chars which was filtered out, yielding empty keywords. Replace with word-level sampling of the first 2000 chars, splitting on whitespace and common delimiters, skipping phone fragments, emails, and URLs. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
163 lines
6.0 KiB
C#
163 lines
6.0 KiB
C#
using System.Text.Json;
|
|
using System.Text.RegularExpressions;
|
|
using Api.Clients.Api.Contracts;
|
|
using Api.Services.Contracts;
|
|
using CvMatcher.Models.Responses;
|
|
using CvSearch.Data;
|
|
using CvSearch.Data.Entities;
|
|
using CvMatcher.Models.Settings;
|
|
using Microsoft.EntityFrameworkCore;
|
|
using Microsoft.Extensions.Options;
|
|
|
|
namespace Api.Services;
|
|
|
|
/// <summary>
/// Creates and validates one-time job search tokens, and creates the corresponding search sessions.
/// Provider configuration is read from <c>cvSearch.JobProviders</c> at session-creation time and
/// snapshotted into <c>JobSearchSessionEntity.ProviderConfigJson</c> so subsequent config changes
/// do not affect already-queued sessions.
/// </summary>
public sealed class JobTokenService : IJobTokenService
{
    /// <summary>Number of leading CV characters scanned for keywords.</summary>
    private const int KeywordSampleLength = 2000;

    /// <summary>Maximum number of keywords stored on a session.</summary>
    private const int MaxKeywords = 10;

    /// <summary>Shortest cleaned token that is still accepted as a keyword.</summary>
    private const int MinKeywordLength = 3;

    /// <summary>
    /// Shared serializer options. <see cref="JsonSerializerOptions"/> caches type metadata per instance,
    /// so one instance is reused instead of allocating a new one for every call.
    /// </summary>
    private static readonly JsonSerializerOptions WebJsonOptions = new(JsonSerializerDefaults.Web);

    /// <summary>Characters that separate candidate keywords in the CV text.</summary>
    private static readonly char[] TokenSeparators = [' ', '\n', '\r', '\t', '|', '/', ',', ';', '(', ')'];

    /// <summary>Matches every character that is neither a word character nor a hyphen.</summary>
    private static readonly Regex NonKeywordChars = new(@"[^\w\-]", RegexOptions.Compiled);

    /// <summary>Matches tokens made only of digits and hyphens (phone fragments, years, plain numbers).</summary>
    private static readonly Regex DigitsAndDashesOnly = new(@"^[\d\-]+$", RegexOptions.Compiled);

    private readonly CvSearchDbContext _db;
    private readonly IRagApiClient _rag;
    private readonly JobSearchSettings _settings;
    private readonly ILogger<JobTokenService> _logger;

    /// <summary>
    /// Initializes the service with its database context, RAG client, job search settings and logger.
    /// </summary>
    /// <param name="db">Database context holding providers, tokens and sessions.</param>
    /// <param name="rag">Client used to load the CV document for keyword extraction.</param>
    /// <param name="settings">Job search settings; only the current value is captured.</param>
    /// <param name="logger">Logger for token and session events.</param>
    public JobTokenService(
        CvSearchDbContext db,
        IRagApiClient rag,
        IOptions<JobSearchSettings> settings,
        ILogger<JobTokenService> logger)
    {
        _db = db;
        _rag = rag;
        _settings = settings.Value;
        _logger = logger;
    }

    /// <inheritdoc />
    public async Task<string?> CreateTokenAsync(string cvDocumentId, string email, string language, CancellationToken ct)
    {
        var hasEnabledProviders = await _db.JobProviders.AnyAsync(p => p.Enabled, ct);
        if (!hasEnabledProviders)
        {
            _logger.LogDebug("Job search token skipped — no enabled providers in cvSearch.JobProviders");
            return null;
        }

        // Read the clock once so CreatedAt and ExpiresAt are derived from the same instant.
        var now = DateTime.UtcNow;

        var token = new JobSearchTokenEntity
        {
            Id = NewTokenId(),
            CvDocumentId = cvDocumentId,
            Email = email,
            Language = language,
            ExpiresAt = now.AddDays(_settings.TokenExpiryDays),
            Used = false,
            CreatedAt = now
        };

        _db.JobSearchTokens.Add(token);
        await _db.SaveChangesAsync(ct);
        _logger.LogInformation("Job search token created. TokenId={TokenId}, CvDocumentId={CvDocumentId}", token.Id, cvDocumentId);
        return token.Id;
    }

    /// <inheritdoc />
    public async Task<string> TriggerStartAsync(string tokenId, CancellationToken ct)
    {
        var token = await _db.JobSearchTokens.FirstOrDefaultAsync(x => x.Id == tokenId, ct);
        if (token is null) return StartJobSearchStatus.NotFound;
        if (token.Used) return StartJobSearchStatus.AlreadyUsed;
        if (token.ExpiresAt <= DateTime.UtcNow) return StartJobSearchStatus.Expired;

        // The token is consumed before the session is built, so a repeated hit on the same link
        // is rejected as early as possible.
        // NOTE(review): if the CV lookup or the session insert below fails, the token stays
        // consumed without a session - confirm this is the intended trade-off.
        token.Used = true;
        await _db.SaveChangesAsync(ct);

        // A missing CV document still produces a session, just with empty keywords.
        var cv = await _rag.GetDocumentAsync(token.CvDocumentId, ct);
        var keywords = cv is not null ? ExtractKeywords(cv.Text) : string.Empty;

        var enabledProviders = await _db.JobProviders
            .Where(p => p.Enabled)
            .OrderBy(p => p.DisplayOrder)
            .ToListAsync(ct);

        // Snapshot the provider configuration so later edits do not affect this session.
        var providerConfigJson = JsonSerializer.Serialize(
            enabledProviders.Select(ToConfig).ToList(),
            WebJsonOptions);

        var session = new JobSearchSessionEntity
        {
            Id = Guid.NewGuid().ToString("N"),
            TokenId = token.Id,
            CvDocumentId = token.CvDocumentId,
            Email = token.Email,
            Language = token.Language,
            Status = JobSearchStatus.Pending,
            Keywords = keywords,
            ProviderConfigJson = providerConfigJson,
            CreatedAt = DateTime.UtcNow
        };

        _db.JobSearchSessions.Add(session);
        await _db.SaveChangesAsync(ct);
        _logger.LogInformation(
            "Job search session created. SessionId={SessionId}, Keywords={Keywords}, Providers={Providers}",
            session.Id, keywords, string.Join(", ", enabledProviders.Select(p => p.Name)));

        return StartJobSearchStatus.Started;
    }

    /// <summary>
    /// Generates a one-time token id: 16 bytes from the cryptographic random number generator,
    /// rendered as 32 lowercase hex characters (the same shape as <c>Guid.ToString("N")</c>).
    /// The id is the only secret protecting the emailed link, so it is not derived from
    /// <see cref="Guid.NewGuid"/>, which is not recommended for cryptographic purposes.
    /// </summary>
    private static string NewTokenId() =>
        Convert.ToHexString(System.Security.Cryptography.RandomNumberGenerator.GetBytes(16)).ToLowerInvariant();

    /// <summary>
    /// Maps a <see cref="JobProviderEntity"/> to the <see cref="JobProviderConfig"/> DTO used by
    /// <c>cv-search-job</c>. The <c>InitialKeywords</c> list is stored as a JSON array in the entity.
    /// </summary>
    private JobProviderConfig ToConfig(JobProviderEntity entity)
    {
        return new JobProviderConfig
        {
            Name = entity.Name,
            Enabled = entity.Enabled,
            SearchUrlTemplate = entity.SearchUrlTemplate,
            JobLinkContains = entity.JobLinkContains,
            InitialKeywords = ParseInitialKeywords(entity),
            MaxResults = entity.MaxResults
        };
    }

    /// <summary>
    /// Parses <c>InitialKeywordsJson</c> into a list. Missing, blank or malformed JSON yields an
    /// empty list; malformed JSON is additionally logged so a broken provider row is visible.
    /// </summary>
    private List<string> ParseInitialKeywords(JobProviderEntity entity)
    {
        if (string.IsNullOrWhiteSpace(entity.InitialKeywordsJson)) return [];

        try
        {
            return JsonSerializer.Deserialize<List<string>>(entity.InitialKeywordsJson, WebJsonOptions) ?? [];
        }
        catch (JsonException ex)
        {
            _logger.LogWarning(ex,
                "Invalid InitialKeywordsJson for job provider {Provider}; using an empty keyword list",
                entity.Name);
            return [];
        }
    }

    /// <summary>
    /// Extracts up to 10 meaningful keywords from the CV text using simple heuristics (no LLM).
    /// Samples the first 2000 characters (where title/role/skills usually appear), splits by
    /// whitespace and common delimiters, drops email/URL fragments and number-only tokens,
    /// strips punctuation, and deduplicates case-insensitively.
    /// Works regardless of whether the PDF extractor preserves newlines.
    /// </summary>
    /// <param name="cvText">Raw CV text; null or blank input yields an empty result.</param>
    /// <returns>Comma-separated keywords in order of first appearance, or an empty string.</returns>
    private static string ExtractKeywords(string? cvText)
    {
        if (string.IsNullOrWhiteSpace(cvText)) return string.Empty;

        // Focus on the header area where name/title/skills typically appear.
        var sample = cvText.Length > KeywordSampleLength ? cvText[..KeywordSampleLength] : cvText;

        var words = sample
            .Split(TokenSeparators, StringSplitOptions.RemoveEmptyEntries)
            // Must run on the raw token: the punctuation strip below removes '@' and '.',
            // after which an email or host name can no longer be recognised.
            .Where(token => !LooksLikeEmailOrUrl(token))
            .Select(token => NonKeywordChars.Replace(token, "").Trim('-'))
            .Where(word => word.Length >= MinKeywordLength)
            .Where(word => !DigitsAndDashesOnly.IsMatch(word)) // skip phone fragments and pure numbers
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .Take(MaxKeywords);

        return string.Join(",", words);
    }

    /// <summary>
    /// Returns true for raw tokens that are part of an email address or URL: anything containing
    /// '@', anything with a dot inside it (host names such as <c>linkedin.com</c>), and the bare
    /// scheme fragments <c>http:</c> / <c>https:</c> left over after splitting on '/'.
    /// Dotted technology names (e.g. <c>Node.js</c>) are indistinguishable from host names and are
    /// skipped as well.
    /// </summary>
    private static bool LooksLikeEmailOrUrl(string token)
    {
        if (token.Contains('@')) return true;

        // Punctuation at the edges ("Engineer.", "Skills:") is sentence punctuation, not a URL marker.
        var core = token.Trim('.', ':', '!', '?', '"', '\'');
        if (core.Contains('.')) return true;

        return token.EndsWith(':')
            && (core.Equals("http", StringComparison.OrdinalIgnoreCase)
                || core.Equals("https", StringComparison.OrdinalIgnoreCase));
    }
}
|