diff --git a/Jobs/cv-search-job/Services/CvSearchEmailSender.cs b/Jobs/cv-search-job/Services/CvSearchEmailSender.cs index fd95104..48999c5 100644 --- a/Jobs/cv-search-job/Services/CvSearchEmailSender.cs +++ b/Jobs/cv-search-job/Services/CvSearchEmailSender.cs @@ -35,12 +35,16 @@ public sealed class CvSearchEmailSender /// Primary recipient (the user who triggered the search). /// Relative filename of the CV PDF to attach, or null. /// Ranked list of job search results to include in the email body. + /// CV keywords used to drive the job search. + /// Names of the providers that were scanned. /// Two-letter language code for template rendering. /// Cancellation token. public async Task SendResultsAsync( string toEmail, string? attachmentFileName, IReadOnlyList results, + IReadOnlyList keywords, + IReadOnlyList providerNames, string language, CancellationToken ct) { @@ -54,7 +58,7 @@ public sealed class CvSearchEmailSender if (recipients.Count == 0) return; - var htmlBody = BuildBody(results, language); + var htmlBody = BuildBody(results, keywords, providerNames, language); var subject = _emailTemplates.Render("email.search-results.subject", language, ("count", results.Count.ToString())); @@ -81,11 +85,14 @@ public sealed class CvSearchEmailSender /// /// Renders the HTML email body from the results list. /// Returns the empty-results template when no results are present. + /// Prepends a scan summary block showing the keywords and providers used. /// - private string BuildBody(IReadOnlyList results, string language) + private string BuildBody(IReadOnlyList results, IReadOnlyList keywords, IReadOnlyList providerNames, string language) { + var scanSummary = BuildScanSummary(keywords, providerNames); + if (results.Count == 0) - return _emailTemplates.Get("email.search-results.empty", language); + return scanSummary + _emailTemplates.Get("email.search-results.empty", language); var items = new System.Text.StringBuilder(); for (int i = 0; i < results.Count; i++) @@ -107,7 +114,29 @@ public sealed class CvSearchEmailSender return _emailTemplates.Render("email.search-results.body", language, ("count", results.Count.ToString()), - ("items", items.ToString())); + ("items", scanSummary + items.ToString())); + } + + /// + /// Builds the scan summary block showing the CV keywords and providers used for the search. + /// + private static string BuildScanSummary(IReadOnlyList keywords, IReadOnlyList providerNames) + { + var keywordsHtml = keywords.Count > 0 + ? string.Join("", keywords.Select(k => + $"{k}")) + : "none detected"; + + var providersText = providerNames.Count > 0 + ? string.Join(", ", providerNames) + : "none"; + + return $""" +
+
Keywords used: {keywordsHtml}
+
Providers scanned: {providersText}
+
+ """; } /// diff --git a/Jobs/cv-search-job/Services/HtmlJobSearcher.cs b/Jobs/cv-search-job/Services/HtmlJobSearcher.cs index d3dcd5d..c99fc62 100644 --- a/Jobs/cv-search-job/Services/HtmlJobSearcher.cs +++ b/Jobs/cv-search-job/Services/HtmlJobSearcher.cs @@ -44,19 +44,27 @@ public sealed class HtmlJobSearcher .ToList(); if (allKeywords.Count == 0) + { + _logger.LogWarning("Provider {Provider}: no keywords available (CV keywords empty, InitialKeywords empty), skipping", provider.Name); return []; + } var keywordsEncoded = HttpUtility.UrlEncode(string.Join(" ", allKeywords)); var searchUrl = provider.SearchUrlTemplate.Replace("{keywords}", keywordsEncoded); + _logger.LogInformation( + "Provider {Provider}: fetching {Url} | CV keywords: [{Keywords}]", + provider.Name, searchUrl, string.Join(", ", cvKeywords)); + string html; try { html = await _http.GetStringAsync(searchUrl, ct); + _logger.LogInformation("Provider {Provider}: received {Length} chars of HTML", provider.Name, html.Length); } catch (Exception ex) { - _logger.LogWarning(ex, "Failed to fetch search results from {Provider} at {Url}", provider.Name, searchUrl); + _logger.LogWarning(ex, "Provider {Provider}: HTTP fetch failed for {Url}", provider.Name, searchUrl); return []; } @@ -68,7 +76,11 @@ public sealed class HtmlJobSearcher var anchorPattern = new Regex(@"]+href=[""']([^""']+)[""'][^>]*>(.*?)", RegexOptions.IgnoreCase | RegexOptions.Singleline); - foreach (Match match in anchorPattern.Matches(html)) + var allAnchors = anchorPattern.Matches(html); + var stage1Pass = 0; + var stage2Pass = 0; + + foreach (Match match in allAnchors) { if (results.Count >= provider.MaxResults) break; @@ -78,9 +90,18 @@ public sealed class HtmlJobSearcher if (!href.Contains(provider.JobLinkContains, StringComparison.OrdinalIgnoreCase)) continue; + stage1Pass++; + // Stage 2: anchor text must contain at least one CV keyword if (!cvKeywords.Any(k => anchorText.Contains(k, StringComparison.OrdinalIgnoreCase))) + { + _logger.LogDebug( + "Provider {Provider}: stage-2 reject | href={Href} | text={Text}", + provider.Name, href, anchorText.Length > 100 ? anchorText[..100] : anchorText); continue; + } + + stage2Pass++; // Make absolute URL if (!Uri.TryCreate(href, UriKind.Absolute, out var absoluteUri)) @@ -95,7 +116,10 @@ public sealed class HtmlJobSearcher results.Add(url); } - _logger.LogInformation("Provider {Provider}: found {Count} job URLs", provider.Name, results.Count); + _logger.LogInformation( + "Provider {Provider}: {TotalAnchors} anchors found | {Stage1} passed href filter ('{LinkPattern}') | {Stage2} passed keyword filter | {Unique} unique URLs returned", + provider.Name, allAnchors.Count, stage1Pass, provider.JobLinkContains, stage2Pass, results.Count); + return results; } } diff --git a/Jobs/cv-search-job/Tasks/CvSearchJobTask.cs b/Jobs/cv-search-job/Tasks/CvSearchJobTask.cs index 76791be..eb1ce80 100644 --- a/Jobs/cv-search-job/Tasks/CvSearchJobTask.cs +++ b/Jobs/cv-search-job/Tasks/CvSearchJobTask.cs @@ -84,13 +84,35 @@ public sealed class CvSearchJobTask : IJobTask try { - var results = await RunSearchAsync(pending, db, cancellationToken); + var cvKeywords = pending.Keywords + .Split(',', StringSplitOptions.RemoveEmptyEntries) + .Select(k => k.Trim()) + .Where(k => k.Length > 0) + .ToList(); + + var providers = GetProviders(pending.ProviderConfigJson); + + _logger.LogInformation( + "Session {SessionId}: keywords=[{Keywords}] | providers=[{Providers}]", + pending.Id, + cvKeywords.Count > 0 ? string.Join(", ", cvKeywords) : "(none)", + providers.Count > 0 ? string.Join(", ", providers.Select(p => p.Name)) : "(none)"); + + var results = await RunSearchAsync(pending, cvKeywords, providers, db, cancellationToken); pending.Status = JobSearchStatus.Done; await db.SaveChangesAsync(cancellationToken); var attachmentFileName = BuildCvFileName(pending.CvDocumentId); - await _emailSender.SendResultsAsync(pending.Email, attachmentFileName, results, pending.Language, cancellationToken); + await _emailSender.SendResultsAsync( + pending.Email, + attachmentFileName, + results, + cvKeywords, + providers.Select(p => p.Name).ToList(), + pending.Language, + cancellationToken); + _logger.LogInformation("Session {SessionId} done. {Count} results sent.", pending.Id, results.Count); } catch (Exception ex) @@ -107,26 +129,27 @@ public sealed class CvSearchJobTask : IJobTask /// private async Task> RunSearchAsync( JobSearchSessionEntity session, + List cvKeywords, + List providers, CvSearchDbContext db, CancellationToken ct) { - var cvKeywords = session.Keywords - .Split(',', StringSplitOptions.RemoveEmptyEntries) - .Select(k => k.Trim()) - .Where(k => k.Length > 0) - .ToList(); + if (cvKeywords.Count == 0) + _logger.LogWarning("Session {SessionId}: keyword list is empty — scraper will rely on provider InitialKeywords only", session.Id); - var providers = GetProviders(session.ProviderConfigJson); var jobUrls = new HashSet(StringComparer.OrdinalIgnoreCase); foreach (var provider in providers) { var urls = await _searcher.SearchJobUrlsAsync(provider, cvKeywords, ct); + _logger.LogInformation("Session {SessionId}: provider {Provider} returned {Count} URLs", session.Id, provider.Name, urls.Count); foreach (var url in urls) jobUrls.Add(url); } var candidates = jobUrls.Take(_settings.MaxJobsToMatch).ToList(); - _logger.LogInformation("Session {SessionId}: {Count} candidate job URLs to match", session.Id, candidates.Count); + _logger.LogInformation( + "Session {SessionId}: {Total} unique URLs across all providers, scoring {Scoring} (cap={Cap})", + session.Id, jobUrls.Count, candidates.Count, _settings.MaxJobsToMatch); var results = new List(); @@ -143,11 +166,14 @@ public sealed class CvSearchJobTask : IJobTask }; var matchResult = await _matcherApi.MatchJobAsync(matchRequest, ct); + + _logger.LogInformation( + "Session {SessionId}: {Url} → score={Score}% (threshold={Threshold}%) {Verdict}", + session.Id, url, matchResult.Score, _settings.MinMatchScore, + matchResult.Score >= _settings.MinMatchScore ? "ACCEPTED" : "rejected"); + if (matchResult.Score < _settings.MinMatchScore) - { - _logger.LogDebug("Session {SessionId}: {Url} scored {Score}% (below threshold)", session.Id, url, matchResult.Score); continue; - } var entity = new JobSearchResultEntity { diff --git a/Jobs/cv-search-job/cv-search-job.csproj b/Jobs/cv-search-job/cv-search-job.csproj index 982a4a6..120418e 100644 --- a/Jobs/cv-search-job/cv-search-job.csproj +++ b/Jobs/cv-search-job/cv-search-job.csproj @@ -21,10 +21,10 @@ - +