Files
myAi/Jobs/cv-search-job/Tasks/CvSearchJobTask.cs
T
claude 898dd09d50 feat: add page-fetcher-api — centralised Playwright page fetcher
Introduces page-fetcher-api, a new internal ASP.NET Core service that
centralises all web-page fetching through a single Playwright (headless
Chromium) browser instance. All fetches are persisted to the pageFetcher
SQL schema for auditing.

New projects:
- Apis/page-fetcher-api-models: FetchPageRequest, FetchPageResponse, IPageFetcherApiClient
- Apis/page-fetcher-data: PageFetchDbContext, PageFetchEntity, InitialSchema migration (schema: pageFetcher)
- Apis/page-fetcher-api: PlaywrightBrowserService (singleton), PageFetcherService, PageController

Changes to existing services:
- cv-matcher-api: JobTextExtractor now calls IPageFetcherApiClient instead of HttpClient
- cv-search-job: HtmlJobSearcher uses IPageFetcherApiClient (removes inline Playwright);
  CvSearchJobTask fetches individual job pages and applies keyword pre-filter before
  LLM call; passes pre-fetched JobDescription to cv-matcher-api to skip re-fetch
- common: add PageFetcherApiSettings
- docker-compose.yml, build.yml: add new service + env vars for callers

Closes #43

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-08 17:43:56 +03:00

293 lines
12 KiB
C#

using System.Text.Json;
using CvMatcher.Models.Requests;
using CvSearch.Data;
using CvSearch.Data.Entities;
using CvMatcher.Models.Settings;
using CvSearchJob.Clients;
using CvSearchJob.Services;
using JobScheduler.Tasks;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using PageFetcher.Models;
namespace CvSearchJob.Tasks;
/// <summary>
/// Background job task that processes pending job search sessions: scrapes providers,
/// scores each URL against the CV via the matcher API, persists results, and sends the results email.
/// </summary>
public sealed class CvSearchJobTask : IJobTask
{
/// <summary>Creates the DI scope from which each execution resolves its <c>CvSearchDbContext</c>.</summary>
private readonly IServiceScopeFactory _scopeFactory;
/// <summary>Job-search settings captured at construction; this task reads <c>Enabled</c>, <c>MaxJobsToMatch</c> and <c>MinMatchScore</c>.</summary>
private readonly JobSearchSettings _settings;
/// <summary>Returns candidate job URLs and titles for a provider.</summary>
private readonly HtmlJobSearcher _searcher;
/// <summary>Client used to score a job against the session's CV.</summary>
private readonly ICvMatcherInternalApi _matcherApi;
/// <summary>Client used to fetch the text of individual job pages.</summary>
private readonly IPageFetcherApiClient _pageFetcher;
/// <summary>Sends the results email once a session has been processed.</summary>
private readonly CvSearchEmailSender _emailSender;
/// <summary>Logger for session progress, warnings and failures.</summary>
private readonly ILogger<CvSearchJobTask> _logger;
/// <summary>Task type identifier exposed through <c>IJobTask</c> (presumably used by the scheduler to select this task — confirm against the scheduler).</summary>
public string TaskType => "CvSearch";
/// <summary>
/// Stores the injected collaborators. The settings value is read once from
/// <paramref name="settings"/> and kept for the lifetime of the task.
/// </summary>
public CvSearchJobTask(
    IServiceScopeFactory scopeFactory,
    IOptions<JobSearchSettings> settings,
    HtmlJobSearcher searcher,
    ICvMatcherInternalApi matcherApi,
    IPageFetcherApiClient pageFetcher,
    CvSearchEmailSender emailSender,
    ILogger<CvSearchJobTask> logger)
{
    // Single deconstructing assignment: every right-hand value is evaluated before any field is written.
    (_scopeFactory, _settings, _searcher, _matcherApi, _pageFetcher, _emailSender, _logger) =
        (scopeFactory, settings.Value, searcher, matcherApi, pageFetcher, emailSender, logger);
}
/// <summary>
/// Called by the scheduler on each tick. Resets orphaned sessions, picks the oldest pending session,
/// runs the full search pipeline, and sends the results email.
/// Does nothing when <c>JobSearch:Enabled</c> is <c>false</c>.
/// </summary>
/// <param name="parametersSection">Scheduler-supplied parameters; not read by this task.</param>
/// <param name="cancellationToken">
/// Token forwarded to every database, search and email call. When it is cancelled mid-session the
/// cancellation is rethrown and the stored session state is left untouched.
/// </param>
public async Task ExecuteAsync(IConfiguration parametersSection, CancellationToken cancellationToken)
{
    if (!_settings.Enabled) return;

    using var scope = _scopeFactory.CreateScope();
    var db = scope.ServiceProvider.GetRequiredService<CvSearchDbContext>();

    // Recover orphaned Processing sessions (container crashed mid-run)
    // NOTE(review): staleness is measured from CreatedAt, not from when processing started, so a
    // session that waited in the queue for a while can already look "stuck" — confirm this is intended.
    var stuckCutoff = DateTime.UtcNow.AddMinutes(-10);
    var stuckSessions = await db.JobSearchSessions
        .Where(s => s.Status == JobSearchStatus.Processing && s.CreatedAt < stuckCutoff)
        .ToListAsync(cancellationToken);
    foreach (var stuck in stuckSessions)
    {
        stuck.Status = JobSearchStatus.Pending;
        _logger.LogWarning("Reset stuck session {SessionId} back to Pending", stuck.Id);
    }
    if (stuckSessions.Count > 0)
        await db.SaveChangesAsync(cancellationToken);

    // Oldest pending session first; FirstOrDefaultAsync already limits the query to one row.
    var pending = await db.JobSearchSessions
        .Where(s => s.Status == JobSearchStatus.Pending)
        .OrderBy(s => s.CreatedAt)
        .FirstOrDefaultAsync(cancellationToken);
    if (pending is null) return;

    _logger.LogInformation("Processing job search session {SessionId}", pending.Id);
    pending.Status = JobSearchStatus.Processing;
    await db.SaveChangesAsync(cancellationToken);

    try
    {
        var cvKeywords = pending.Keywords
            .Split(',', StringSplitOptions.RemoveEmptyEntries)
            .Select(k => k.Trim())
            .Where(k => k.Length > 0)
            .ToList();
        var providers = GetProviders(pending.ProviderConfigJson);
        _logger.LogInformation(
            "Session {SessionId}: keywords=[{Keywords}] | providers=[{Providers}]",
            pending.Id,
            cvKeywords.Count > 0 ? string.Join(", ", cvKeywords) : "(none)",
            providers.Count > 0 ? string.Join(", ", providers.Select(p => p.Name)) : "(none)");

        var results = await RunSearchAsync(pending, cvKeywords, providers, db, cancellationToken);

        // The session is marked Done before the email goes out; a send failure below flips it to Failed.
        pending.Status = JobSearchStatus.Done;
        await db.SaveChangesAsync(cancellationToken);

        var attachmentFileName = BuildCvFileName(pending.CvDocumentId);
        await _emailSender.SendResultsAsync(
            pending.Email,
            attachmentFileName,
            results,
            cvKeywords,
            providers.Select(p => p.Name).ToList(),
            pending.Language,
            pending.Location,
            cancellationToken);
        _logger.LogInformation("Session {SessionId} done. {Count} results sent.", pending.Id, results.Count);
    }
    catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
    {
        // Cancellation is not a session failure: do not attempt another save with a cancelled token,
        // leave the stored state as it is and let the cancellation reach the scheduler.
        _logger.LogWarning("Session {SessionId} interrupted by cancellation.", pending.Id);
        throw;
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Session {SessionId} failed.", pending.Id);
        pending.Status = JobSearchStatus.Failed;
        // Persist the failure even if the token was cancelled in the meantime; otherwise this save
        // would throw and the session would stay in Processing.
        await db.SaveChangesAsync(CancellationToken.None);
    }
}
/// <summary>
/// Runs the full search pipeline for a session: scrapes all providers, deduplicates URLs,
/// fetches each individual job page via page-fetcher-api, applies a keyword pre-filter,
/// scores passing candidates via the matcher API, and persists results that meet the minimum score threshold.
/// </summary>
/// <param name="session">Session being processed; supplies the id, location and CV document id.</param>
/// <param name="cvKeywords">CV keywords passed to the searcher and used for the pre-filter; may be empty.</param>
/// <param name="providers">Providers to scrape, also used to label each result.</param>
/// <param name="db">Context the accepted results are saved through, one save per result.</param>
/// <param name="ct">Cancellation token; once cancelled, the current failure is rethrown instead of being skipped.</param>
/// <returns>The persisted results, sorted by score in descending order.</returns>
private async Task<List<JobSearchResultEntity>> RunSearchAsync(
    JobSearchSessionEntity session,
    List<string> cvKeywords,
    List<JobProviderConfig> providers,
    CvSearchDbContext db,
    CancellationToken ct)
{
    if (cvKeywords.Count == 0)
        _logger.LogWarning("Session {SessionId}: keyword list is empty — scraper will rely on provider InitialKeywords only", session.Id);

    // url → title; case-insensitive keys so the same URL in different casing is only processed once.
    var jobCandidates = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
    foreach (var provider in providers)
    {
        var candidates = await _searcher.SearchJobUrlsAsync(provider, cvKeywords, session.Location, ct);
        _logger.LogInformation("Session {SessionId}: provider {Provider} returned {Count} candidates", session.Id, provider.Name, candidates.Count);
        foreach (var c in candidates)
            jobCandidates.TryAdd(c.Url, c.Title);
    }

    var deduped = jobCandidates.Take(_settings.MaxJobsToMatch).ToList();
    _logger.LogInformation(
        "Session {SessionId}: {Total} unique URLs across all providers, processing up to {Cap}",
        session.Id, jobCandidates.Count, deduped.Count);

    // Built once: the options are identical for every result.
    var serializerOptions = new JsonSerializerOptions(JsonSerializerDefaults.Web);
    var results = new List<JobSearchResultEntity>();
    foreach (var (url, title) in deduped)
    {
        JobSearchResultEntity? entity = null;
        try
        {
            // Fetch individual job page text via page-fetcher-api
            var fetchResponse = await _pageFetcher.FetchAsync(new FetchPageRequest
            {
                Url = url,
                WaitFor = "domcontentloaded",
                CallerService = "cv-search-job"
            }, ct);
            if (!fetchResponse.Success || string.IsNullOrWhiteSpace(fetchResponse.Text))
            {
                _logger.LogWarning("Session {SessionId}: fetch failed for {Url} — {Error}", session.Id, url, fetchResponse.Error);
                continue;
            }
            var jobText = fetchResponse.Text;

            // Keyword pre-filter: skip LLM call if no CV keyword appears in the job page text
            if (cvKeywords.Count > 0 &&
                !cvKeywords.Any(k => jobText.Contains(k, StringComparison.OrdinalIgnoreCase)))
            {
                _logger.LogInformation(
                    "Session {SessionId}: pre-filter skip | {Url} | no CV keyword found in job text",
                    session.Id, url);
                continue;
            }

            var matchRequest = new MatchJobRequest
            {
                CvDocumentId = session.CvDocumentId,
                JobUrl = url,
                // Pre-fetched text passed directly so cv-matcher-api skips re-fetching the page
                JobDescription = jobText,
                // User already gave GDPR consent when they clicked the one-time job search link
                GdprConsent = true
            };
            var matchResult = await _matcherApi.MatchJobAsync(matchRequest, ct);
            _logger.LogInformation(
                "Session {SessionId}: {Url} → score={Score}% (threshold={Threshold}%) {Verdict}",
                session.Id, url, matchResult.Score, _settings.MinMatchScore,
                matchResult.Score >= _settings.MinMatchScore ? "ACCEPTED" : "rejected");
            if (matchResult.Score < _settings.MinMatchScore)
                continue;

            // First sentence of the summary is the preferred title. Split never yields null, so a
            // blank (or missing) summary must be detected explicitly to fall back to the scraped title.
            var summaryHeadline = matchResult.Summary?.Split('.').FirstOrDefault()?.Trim();
            entity = new JobSearchResultEntity
            {
                Id = Guid.NewGuid().ToString("N"),
                SessionId = session.Id,
                ProviderName = GuessProvider(url, providers),
                JobUrl = url,
                JobTitle = string.IsNullOrWhiteSpace(summaryHeadline) ? title : summaryHeadline,
                // The fetched page text is not stored on the result row.
                JobText = string.Empty,
                Score = matchResult.Score,
                ResultJson = JsonSerializer.Serialize(matchResult, serializerOptions),
                CreatedAt = DateTime.UtcNow
            };
            db.JobSearchResults.Add(entity);
            await db.SaveChangesAsync(ct);
            results.Add(entity);
        }
        catch (Exception ex)
        {
            // A result whose save failed is still tracked as Added and would be retried by every
            // later SaveChanges (including the session status update); stop tracking it.
            if (entity is not null && db.Entry(entity).State == EntityState.Added)
                db.Entry(entity).State = EntityState.Detached;

            // Once cancellation is requested every remaining URL would fail the same way, so stop
            // here instead of logging one warning per URL.
            if (ct.IsCancellationRequested)
                throw;

            _logger.LogWarning(ex, "Session {SessionId}: match failed for {Url}", session.Id, url);
        }
    }

    results.Sort((a, b) => b.Score.CompareTo(a.Score));
    return results;
}
/// <summary>
/// Reads the provider list from the JSON snapshot stored on the session.
/// A blank snapshot, a snapshot that deserialises to <c>null</c>, or one that fails to parse
/// all yield an empty list; the blank and unparsable cases are logged as warnings.
/// </summary>
/// <param name="providerConfigJson">JSON array of provider configurations, or null/blank when absent.</param>
/// <returns>The deserialised providers, or an empty list.</returns>
private List<JobProviderConfig> GetProviders(string? providerConfigJson)
{
    if (!string.IsNullOrWhiteSpace(providerConfigJson))
    {
        try
        {
            // Web defaults: camelCase property names, case-insensitive matching.
            var webOptions = new JsonSerializerOptions(JsonSerializerDefaults.Web);
            var snapshot = JsonSerializer.Deserialize<List<JobProviderConfig>>(providerConfigJson, webOptions);
            return snapshot ?? [];
        }
        catch (Exception parseError)
        {
            _logger.LogWarning(parseError, "Failed to deserialise provider config snapshot — returning empty provider list");
            return [];
        }
    }

    _logger.LogWarning("Session has no provider config snapshot — returning empty provider list");
    return [];
}
/// <summary>
/// Picks the provider a job URL belongs to: the first provider whose non-blank
/// <c>JobLinkContains</c> fragment occurs in the URL (ignoring case) wins.
/// When none matches, the URL's host is returned, or <c>"unknown"</c> if the URL is not absolute.
/// </summary>
/// <param name="url">Job URL to classify.</param>
/// <param name="providers">Providers to test, in priority order.</param>
/// <returns>The matching provider's name, the URL host, or <c>"unknown"</c>.</returns>
private static string GuessProvider(string url, List<JobProviderConfig> providers)
{
    for (var index = 0; index < providers.Count; index++)
    {
        var candidate = providers[index];
        var fragment = candidate.JobLinkContains;

        // Providers without a link fragment can never claim a URL.
        if (string.IsNullOrWhiteSpace(fragment))
            continue;

        if (url.Contains(fragment, StringComparison.OrdinalIgnoreCase))
            return candidate.Name;
    }

    if (Uri.TryCreate(url, UriKind.Absolute, out var parsed))
        return parsed.Host;

    return "unknown";
}
/// <summary>
/// Derives the CV PDF filename from a document ID by keeping only its letters and digits
/// and appending <c>.pdf</c>. An ID with no letters or digits yields <c>cv.pdf</c>.
/// </summary>
/// <param name="cvDocumentId">Document ID the filename is derived from.</param>
/// <returns>The sanitised filename including the <c>.pdf</c> extension.</returns>
private static string BuildCvFileName(string cvDocumentId)
{
    // Drop every character that is not a letter or digit so separators and punctuation never reach the filename.
    var keptChars = cvDocumentId.Where(ch => char.IsLetterOrDigit(ch)).ToArray();
    var baseName = new string(keptChars);

    // Only letters and digits remain, so "blank" can only mean "empty" here.
    return baseName.Length == 0
        ? "cv.pdf"
        : baseName + ".pdf";
}
}