Files
myAi/Apis/cv-matcher-api/Services/JobTokenService.cs
T
claude e38f40732f
Build and Push Docker Images Staging / build (push) Successful in 5m20s
feat(providers): add headless browser scraping via Playwright for SPA job sites
ejobs.ro migrated to a Nuxt SPA - plain HTTP GET returns only the JS
bundle. This change equips cv-search-job with a headless Chromium
(Playwright 1.60) so it can fully render SPA pages before extracting
job links.

- Add UseHeadlessBrowser flag to JobProviderEntity, JobProviderConfig,
  and CvSearchDbContext; map it in JobTokenService.ToConfig so the flag
  is included in the session provider-config snapshot
- Migration: add UseHeadlessBrowser column; fix ejobs.ro search URL
  (remove /user/ prefix that caused 404) and set UseHeadlessBrowser=true
- HtmlJobSearcher: detect flag and dispatch to FetchWithPlaywrightAsync;
  plain-HTTP path is unchanged; NetworkIdle timeout falls back to partial
  content rather than failing outright
- Dockerfile: download Playwright Chromium in the SDK build stage via
  npx; copy browser binaries to the final image; install Chromium system
  libs (Ubuntu noble t64 variants)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-29 13:42:52 +03:00

134 lines
4.8 KiB
C#

using System.Text.Json;
using Api.Services.Contracts;
using CvMatcher.Models.Responses;
using CvSearch.Data;
using CvSearch.Data.Entities;
using CvMatcher.Models.Settings;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Options;
namespace Api.Services;
/// <summary>
/// Creates and validates one-time job search tokens, and creates the corresponding search sessions.
/// Provider configuration is read from <c>cvSearch.JobProviders</c> at session-creation time and
/// snapshotted into <c>JobSearchSessionEntity.ProviderConfigJson</c> so subsequent config changes
/// do not affect already-queued sessions.
/// Keywords are extracted by the LLM during the CV-to-job match call and stored on the token,
/// then copied to the session when the user clicks the link — no extra RAG call needed.
/// </summary>
public sealed class JobTokenService : IJobTokenService
{
private readonly CvSearchDbContext _db;
private readonly JobSearchSettings _settings;
private readonly ILogger<JobTokenService> _logger;
public JobTokenService(
CvSearchDbContext db,
IOptions<JobSearchSettings> settings,
ILogger<JobTokenService> logger)
{
_db = db;
_settings = settings.Value;
_logger = logger;
}
/// <inheritdoc />
public async Task<string?> CreateTokenAsync(string cvDocumentId, string email, string language, IReadOnlyList<string> keywords, CancellationToken ct)
{
var hasEnabledProviders = await _db.JobProviders.AnyAsync(p => p.Enabled, ct);
if (!hasEnabledProviders)
{
_logger.LogDebug("Job search token skipped — no enabled providers in cvSearch.JobProviders");
return null;
}
var token = new JobSearchTokenEntity
{
Id = Guid.NewGuid().ToString("N"),
CvDocumentId = cvDocumentId,
Email = email,
Language = language,
Keywords = string.Join(",", keywords),
ExpiresAt = DateTime.UtcNow.AddDays(_settings.TokenExpiryDays),
Used = false,
CreatedAt = DateTime.UtcNow
};
_db.JobSearchTokens.Add(token);
await _db.SaveChangesAsync(ct);
_logger.LogInformation("Job search token created. TokenId={TokenId}, CvDocumentId={CvDocumentId}, Keywords={Keywords}", token.Id, cvDocumentId, token.Keywords);
return token.Id;
}
/// <inheritdoc />
public async Task<string> TriggerStartAsync(string tokenId, CancellationToken ct)
{
var token = await _db.JobSearchTokens.FirstOrDefaultAsync(x => x.Id == tokenId, ct);
if (token is null) return StartJobSearchStatus.NotFound;
if (token.Used) return StartJobSearchStatus.AlreadyUsed;
if (token.ExpiresAt <= DateTime.UtcNow) return StartJobSearchStatus.Expired;
token.Used = true;
await _db.SaveChangesAsync(ct);
var keywords = token.Keywords;
var enabledProviders = await _db.JobProviders
.Where(p => p.Enabled)
.OrderBy(p => p.DisplayOrder)
.ToListAsync(ct);
var providerConfigJson = JsonSerializer.Serialize(
enabledProviders.Select(ToConfig).ToList(),
new JsonSerializerOptions(JsonSerializerDefaults.Web));
var session = new JobSearchSessionEntity
{
Id = Guid.NewGuid().ToString("N"),
TokenId = token.Id,
CvDocumentId = token.CvDocumentId,
Email = token.Email,
Language = token.Language,
Status = JobSearchStatus.Pending,
Keywords = keywords,
ProviderConfigJson = providerConfigJson,
CreatedAt = DateTime.UtcNow
};
_db.JobSearchSessions.Add(session);
await _db.SaveChangesAsync(ct);
_logger.LogInformation(
"Job search session created. SessionId={SessionId}, Keywords={Keywords}, Providers={Providers}",
session.Id, keywords, string.Join(", ", enabledProviders.Select(p => p.Name)));
return StartJobSearchStatus.Started;
}
private static JobProviderConfig ToConfig(JobProviderEntity entity)
{
List<string> keywords;
try
{
keywords = JsonSerializer.Deserialize<List<string>>(entity.InitialKeywordsJson,
new JsonSerializerOptions(JsonSerializerDefaults.Web)) ?? [];
}
catch
{
keywords = [];
}
return new JobProviderConfig
{
Name = entity.Name,
Enabled = entity.Enabled,
SearchUrlTemplate = entity.SearchUrlTemplate,
JobLinkContains = entity.JobLinkContains,
InitialKeywords = keywords,
MaxResults = entity.MaxResults,
UseHeadlessBrowser = entity.UseHeadlessBrowser
};
}
}