diff --git a/Apis/cv-matcher-api-models/Settings/JobSearchSettings.cs b/Apis/cv-matcher-api-models/Settings/JobSearchSettings.cs index e81b3a9..96ed11e 100644 --- a/Apis/cv-matcher-api-models/Settings/JobSearchSettings.cs +++ b/Apis/cv-matcher-api-models/Settings/JobSearchSettings.cs @@ -21,4 +21,6 @@ public sealed class JobProviderConfig public string JobLinkContains { get; set; } = string.Empty; public List InitialKeywords { get; set; } = []; public int MaxResults { get; set; } = 20; + /// When true the scraper uses a headless Chromium browser to render JS-heavy pages. + public bool UseHeadlessBrowser { get; set; } } diff --git a/Apis/cv-matcher-api/Services/JobTokenService.cs b/Apis/cv-matcher-api/Services/JobTokenService.cs index e2a07d8..d8856ac 100644 --- a/Apis/cv-matcher-api/Services/JobTokenService.cs +++ b/Apis/cv-matcher-api/Services/JobTokenService.cs @@ -125,7 +125,8 @@ public sealed class JobTokenService : IJobTokenService SearchUrlTemplate = entity.SearchUrlTemplate, JobLinkContains = entity.JobLinkContains, InitialKeywords = keywords, - MaxResults = entity.MaxResults + MaxResults = entity.MaxResults, + UseHeadlessBrowser = entity.UseHeadlessBrowser }; } diff --git a/Apis/cv-search-data/Data/CvSearchDbContext.cs b/Apis/cv-search-data/Data/CvSearchDbContext.cs index 6bc1133..172d22d 100644 --- a/Apis/cv-search-data/Data/CvSearchDbContext.cs +++ b/Apis/cv-search-data/Data/CvSearchDbContext.cs @@ -79,6 +79,7 @@ public sealed class CvSearchDbContext : DbContext entity.Property(x => x.InitialKeywordsJson).HasMaxLength(2000).HasDefaultValue("[]").IsRequired(); entity.Property(x => x.MaxResults).HasDefaultValue(20); entity.Property(x => x.DisplayOrder).HasDefaultValue(0); + entity.Property(x => x.UseHeadlessBrowser).HasDefaultValue(false); }); } } diff --git a/Apis/cv-search-data/Data/Entities/JobProviderEntity.cs b/Apis/cv-search-data/Data/Entities/JobProviderEntity.cs index 79e76e2..2697be4 100644 --- 
a/Apis/cv-search-data/Data/Entities/JobProviderEntity.cs +++ b/Apis/cv-search-data/Data/Entities/JobProviderEntity.cs @@ -30,4 +30,7 @@ public sealed class JobProviderEntity /// Controls display ordering in future admin UIs. public int DisplayOrder { get; set; } + + /// When true, the scraper renders the page with headless Chromium instead of a plain HTTP GET. + public bool UseHeadlessBrowser { get; set; } } diff --git a/Apis/cv-search-data/Migrations/20260529170000_AddHeadlessBrowserToProviders.Designer.cs b/Apis/cv-search-data/Migrations/20260529170000_AddHeadlessBrowserToProviders.Designer.cs new file mode 100644 index 0000000..a7b54b7 --- /dev/null +++ b/Apis/cv-search-data/Migrations/20260529170000_AddHeadlessBrowserToProviders.Designer.cs @@ -0,0 +1,234 @@ +// +using System; +using CvSearch.Data; +using Microsoft.EntityFrameworkCore; +using Microsoft.EntityFrameworkCore.Infrastructure; +using Microsoft.EntityFrameworkCore.Metadata; +using Microsoft.EntityFrameworkCore.Migrations; +using Microsoft.EntityFrameworkCore.Storage.ValueConversion; + +#nullable disable + +namespace CvSearch.Data.Migrations +{ + [DbContext(typeof(CvSearchDbContext))] + [Migration("20260529170000_AddHeadlessBrowserToProviders")] + partial class AddHeadlessBrowserToProviders + { + /// + protected override void BuildTargetModel(ModelBuilder modelBuilder) + { +#pragma warning disable 612, 618 + modelBuilder + .HasDefaultSchema("cvSearch") + .HasAnnotation("ProductVersion", "10.0.7") + .HasAnnotation("Relational:MaxIdentifierLength", 128); + + SqlServerModelBuilderExtensions.UseIdentityColumns(modelBuilder); + + modelBuilder.Entity("CvSearch.Data.Entities.JobProviderEntity", b => + { + b.Property("Id") + .ValueGeneratedOnAdd() + .HasColumnType("int"); + + SqlServerPropertyBuilderExtensions.UseIdentityColumn(b.Property("Id")); + + b.Property("DisplayOrder") + .ValueGeneratedOnAdd() + .HasColumnType("int") + .HasDefaultValue(0); + + b.Property("Enabled") + .HasColumnType("bit"); + + 
b.Property("InitialKeywordsJson") + .IsRequired() + .ValueGeneratedOnAdd() + .HasMaxLength(2000) + .HasColumnType("nvarchar(2000)") + .HasDefaultValue("[]"); + + b.Property("JobLinkContains") + .IsRequired() + .HasMaxLength(256) + .HasColumnType("nvarchar(256)"); + + b.Property("MaxResults") + .ValueGeneratedOnAdd() + .HasColumnType("int") + .HasDefaultValue(20); + + b.Property("Name") + .IsRequired() + .HasMaxLength(128) + .HasColumnType("nvarchar(128)"); + + b.Property("SearchUrlTemplate") + .IsRequired() + .HasMaxLength(1024) + .HasColumnType("nvarchar(1024)"); + + b.Property("UseHeadlessBrowser") + .ValueGeneratedOnAdd() + .HasColumnType("bit") + .HasDefaultValue(false); + + b.HasKey("Id"); + + b.ToTable("JobProviders", "cvSearch"); + }); + + modelBuilder.Entity("CvSearch.Data.Entities.JobSearchResultEntity", b => + { + b.Property("Id") + .HasMaxLength(64) + .HasColumnType("nvarchar(64)"); + + b.Property("CreatedAt") + .ValueGeneratedOnAdd() + .HasColumnType("datetime2") + .HasDefaultValueSql("SYSUTCDATETIME()"); + + b.Property("JobText") + .IsRequired() + .HasColumnType("nvarchar(max)"); + + b.Property("JobTitle") + .IsRequired() + .HasMaxLength(512) + .HasColumnType("nvarchar(512)"); + + b.Property("JobUrl") + .IsRequired() + .HasMaxLength(2048) + .HasColumnType("nvarchar(2048)"); + + b.Property("ProviderName") + .IsRequired() + .HasMaxLength(128) + .HasColumnType("nvarchar(128)"); + + b.Property("ResultJson") + .IsRequired() + .HasColumnType("nvarchar(max)"); + + b.Property("Score") + .HasColumnType("int"); + + b.Property("SessionId") + .IsRequired() + .HasMaxLength(64) + .HasColumnType("nvarchar(64)"); + + b.HasKey("Id"); + + b.HasIndex("SessionId"); + + b.ToTable("JobSearchResults", "cvSearch"); + }); + + modelBuilder.Entity("CvSearch.Data.Entities.JobSearchSessionEntity", b => + { + b.Property("Id") + .HasMaxLength(64) + .HasColumnType("nvarchar(64)"); + + b.Property("CreatedAt") + .ValueGeneratedOnAdd() + .HasColumnType("datetime2") + 
.HasDefaultValueSql("SYSUTCDATETIME()"); + + b.Property("CvDocumentId") + .IsRequired() + .HasMaxLength(64) + .HasColumnType("nvarchar(64)"); + + b.Property("Email") + .IsRequired() + .HasMaxLength(256) + .HasColumnType("nvarchar(256)"); + + b.Property("Keywords") + .IsRequired() + .HasMaxLength(1000) + .HasColumnType("nvarchar(1000)"); + + b.Property("Language") + .IsRequired() + .ValueGeneratedOnAdd() + .HasMaxLength(8) + .HasColumnType("nvarchar(8)") + .HasDefaultValue("en"); + + b.Property("ProviderConfigJson") + .HasColumnType("nvarchar(max)"); + + b.Property("Status") + .IsRequired() + .HasMaxLength(32) + .HasColumnType("nvarchar(32)"); + + b.Property("TokenId") + .IsRequired() + .HasMaxLength(64) + .HasColumnType("nvarchar(64)"); + + b.HasKey("Id"); + + b.HasIndex("Status"); + + b.ToTable("JobSearchSessions", "cvSearch"); + }); + + modelBuilder.Entity("CvSearch.Data.Entities.JobSearchTokenEntity", b => + { + b.Property("Id") + .HasMaxLength(64) + .HasColumnType("nvarchar(64)"); + + b.Property("CreatedAt") + .ValueGeneratedOnAdd() + .HasColumnType("datetime2") + .HasDefaultValueSql("SYSUTCDATETIME()"); + + b.Property("CvDocumentId") + .IsRequired() + .HasMaxLength(64) + .HasColumnType("nvarchar(64)"); + + b.Property("Email") + .IsRequired() + .HasMaxLength(256) + .HasColumnType("nvarchar(256)"); + + b.Property("ExpiresAt") + .HasColumnType("datetime2"); + + b.Property("Keywords") + .IsRequired() + .ValueGeneratedOnAdd() + .HasMaxLength(1000) + .HasColumnType("nvarchar(1000)") + .HasDefaultValue(""); + + b.Property("Language") + .IsRequired() + .ValueGeneratedOnAdd() + .HasMaxLength(8) + .HasColumnType("nvarchar(8)") + .HasDefaultValue("en"); + + b.Property("Used") + .ValueGeneratedOnAdd() + .HasColumnType("bit") + .HasDefaultValue(false); + + b.HasKey("Id"); + + b.ToTable("JobSearchTokens", "cvSearch"); + }); +#pragma warning restore 612, 618 + } + } +} diff --git a/Apis/cv-search-data/Migrations/20260529170000_AddHeadlessBrowserToProviders.cs 
using Microsoft.EntityFrameworkCore.Migrations;

#nullable disable

namespace CvSearch.Data.Migrations
{
    /// <summary>
    /// Adds the <c>UseHeadlessBrowser</c> flag to <c>JobProviders</c> and switches the ejobs.ro row over to it.
    /// </summary>
    public partial class AddHeadlessBrowserToProviders : Migration
    {
        private const string ProvidersTable = "JobProviders";
        private const string HeadlessColumn = "UseHeadlessBrowser";

        // The migration addresses the ejobs.ro row by its key value, Id = 1.
        private const int EjobsProviderId = 1;

        /// <inheritdoc />
        protected override void Up(MigrationBuilder migrationBuilder)
        {
            // The column must exist before the data update below can write to it.
            migrationBuilder.AddColumn<bool>(
                name: HeadlessColumn,
                schema: MigrationConstants.SchemaName,
                table: ProvidersTable,
                type: "bit",
                nullable: false,
                defaultValue: false);

            // ejobs.ro is a Nuxt SPA: the old /user/ URL 404s and a plain HTTP GET only
            // returns the JS bundle, so point at the working search URL and render it
            // with headless Chromium.
            RewriteEjobsProvider(
                migrationBuilder,
                searchUrlTemplate: "https://www.ejobs.ro/locuri-de-munca?q={keywords}",
                jobLinkContains: "/locuri-de-munca/",
                useHeadlessBrowser: true);
        }

        /// <inheritdoc />
        protected override void Down(MigrationBuilder migrationBuilder)
        {
            // Restore the previous row values while the column still exists, then drop it.
            RewriteEjobsProvider(
                migrationBuilder,
                searchUrlTemplate: "https://www.ejobs.ro/user/locuri-de-munca/?utm_source=myai&q={keywords}",
                jobLinkContains: "/user/locuri-de-munca/",
                useHeadlessBrowser: false);

            migrationBuilder.DropColumn(
                name: HeadlessColumn,
                schema: MigrationConstants.SchemaName,
                table: ProvidersTable);
        }

        /// <summary>
        /// Issues the <c>UpdateData</c> operation that rewrites the search URL, link filter and
        /// headless flag of the ejobs.ro provider row.
        /// </summary>
        private static void RewriteEjobsProvider(
            MigrationBuilder migrationBuilder,
            string searchUrlTemplate,
            string jobLinkContains,
            bool useHeadlessBrowser)
        {
            string[] columns = ["SearchUrlTemplate", "JobLinkContains", HeadlessColumn];
            var values = new object[] { searchUrlTemplate, jobLinkContains, useHeadlessBrowser };

            migrationBuilder.UpdateData(
                schema: MigrationConstants.SchemaName,
                table: ProvidersTable,
                keyColumn: "Id",
                keyValue: EjobsProviderId,
                columns: columns,
                values: values);
        }
    }
}
a/Apis/cv-search-data/Migrations/CvSearchDbContextModelSnapshot.cs +++ b/Apis/cv-search-data/Migrations/CvSearchDbContextModelSnapshot.cs @@ -66,6 +66,11 @@ namespace CvSearch.Data.Migrations .HasMaxLength(1024) .HasColumnType("nvarchar(1024)"); + b.Property("UseHeadlessBrowser") + .ValueGeneratedOnAdd() + .HasColumnType("bit") + .HasDefaultValue(false); + b.HasKey("Id"); b.ToTable("JobProviders", "cvSearch"); diff --git a/Directory.Packages.props b/Directory.Packages.props index 7e06527..4d7569e 100644 --- a/Directory.Packages.props +++ b/Directory.Packages.props @@ -32,6 +32,8 @@ + + diff --git a/Jobs/cv-search-job/Dockerfile b/Jobs/cv-search-job/Dockerfile index 10e7698..a9fa827 100644 --- a/Jobs/cv-search-job/Dockerfile +++ b/Jobs/cv-search-job/Dockerfile @@ -29,9 +29,28 @@ COPY Helpers/startup-helpers/ Helpers/startup-helpers/ RUN dotnet publish Jobs/cv-search-job/cv-search-job.csproj -c $BUILD_CONFIGURATION -o /app/publish /p:UseAppHost=false +# Download Playwright Chromium browser in the build stage. +# Node.js is only needed here to run npx — it is not copied to the final image. +ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright +RUN apt-get update && apt-get install -y --no-install-recommends nodejs npm \ + && npx --yes playwright@1.60.0 install chromium \ + && rm -rf /var/lib/apt/lists/* + FROM mcr.microsoft.com/dotnet/aspnet:10.0 AS final WORKDIR /app +# System libraries required by Chromium. Package names follow the distro of the aspnet:10.0 base image; libasound2t64 is the post-t64-transition name and does not exist on Debian bookworm. +RUN apt-get update && apt-get install -y --no-install-recommends \ + libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 \ + libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 \ + libgbm1 libasound2t64 libpango-1.0-0 libcairo2 libatspi2.0-0 \ + libwayland-client0 libx11-xcb1 libx11-6 libxcb1 libxext6 \ + && rm -rf /var/lib/apt/lists/* + +# Copy the Playwright Chromium browser from the build stage +ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright +COPY --from=build /ms-playwright /ms-playwright + COPY --from=build /app/publish .
ENTRYPOINT ["dotnet", "cv-search-job.dll"] diff --git a/Jobs/cv-search-job/Services/HtmlJobSearcher.cs b/Jobs/cv-search-job/Services/HtmlJobSearcher.cs index c99fc62..67ccc0f 100644 --- a/Jobs/cv-search-job/Services/HtmlJobSearcher.cs +++ b/Jobs/cv-search-job/Services/HtmlJobSearcher.cs @@ -1,6 +1,7 @@ using System.Text.RegularExpressions; using System.Web; using CvMatcher.Models.Settings; +using Microsoft.Playwright; using Microsoft.Extensions.Logging; namespace CvSearchJob.Services; @@ -9,6 +10,7 @@ namespace CvSearchJob.Services; /// Config-driven HTML scraper that fetches a provider's job listing page and extracts matching job URLs. /// Uses a two-stage anchor filter: href must contain the provider's link pattern, and anchor text must /// contain at least one CV keyword. +/// Supports both plain HTTP GET (default) and headless Chromium rendering for JS-heavy SPAs. /// public sealed class HtmlJobSearcher { @@ -28,10 +30,6 @@ public sealed class HtmlJobSearcher /// tags, applies the two-stage filter, and returns up to absolute URLs. /// Returns an empty list when the HTTP request fails rather than throwing. /// - /// Provider configuration including search URL template, link filter, and result cap. - /// Keywords extracted from the user's CV to inject into the search query. - /// Cancellation token. - /// Deduplicated list of absolute job page URLs (query string stripped). public async Task> SearchJobUrlsAsync( JobProviderConfig provider, IReadOnlyList cvKeywords, @@ -53,26 +51,25 @@ public sealed class HtmlJobSearcher var searchUrl = provider.SearchUrlTemplate.Replace("{keywords}", keywordsEncoded); _logger.LogInformation( - "Provider {Provider}: fetching {Url} | CV keywords: [{Keywords}]", - provider.Name, searchUrl, string.Join(", ", cvKeywords)); + "Provider {Provider}: fetching {Url} [{Mode}] | CV keywords: [{Keywords}]", + provider.Name, searchUrl, + provider.UseHeadlessBrowser ? 
"headless" : "http", + string.Join(", ", cvKeywords)); - string html; - try - { - html = await _http.GetStringAsync(searchUrl, ct); - _logger.LogInformation("Provider {Provider}: received {Length} chars of HTML", provider.Name, html.Length); - } - catch (Exception ex) - { - _logger.LogWarning(ex, "Provider {Provider}: HTTP fetch failed for {Url}", provider.Name, searchUrl); - return []; - } + string? html; + if (provider.UseHeadlessBrowser) + html = await FetchWithPlaywrightAsync(provider.Name, searchUrl, ct); + else + html = await FetchWithHttpAsync(provider.Name, searchUrl, ct); + + if (html is null) return []; + + _logger.LogInformation("Provider {Provider}: received {Length} chars of HTML", provider.Name, html.Length); var baseUri = new Uri(searchUrl); var results = new List(); var seen = new HashSet(StringComparer.OrdinalIgnoreCase); - // Match all anchor tags capturing href and inner text var anchorPattern = new Regex(@"]+href=[""']([^""']+)[""'][^>]*>(.*?)", RegexOptions.IgnoreCase | RegexOptions.Singleline); @@ -92,7 +89,6 @@ public sealed class HtmlJobSearcher stage1Pass++; - // Stage 2: anchor text must contain at least one CV keyword if (!cvKeywords.Any(k => anchorText.Contains(k, StringComparison.OrdinalIgnoreCase))) { _logger.LogDebug( @@ -103,14 +99,12 @@ public sealed class HtmlJobSearcher stage2Pass++; - // Make absolute URL if (!Uri.TryCreate(href, UriKind.Absolute, out var absoluteUri)) { if (!Uri.TryCreate(baseUri, href, out absoluteUri)) continue; } - // Strip query string and fragment so different tracking variants of the same URL collapse to one. 
/// <summary>
/// Downloads <paramref name="url"/> with a plain HTTP GET and returns the response body as text.
/// </summary>
/// <param name="providerName">Provider name; used only in log messages.</param>
/// <param name="url">Search URL to request.</param>
/// <param name="ct">Token forwarded to the HTTP call.</param>
/// <returns>
/// The response body, or <c>null</c> when the request throws for any reason
/// (the exception is logged as a warning and not rethrown).
/// </returns>
private async Task<string?> FetchWithHttpAsync(string providerName, string url, CancellationToken ct)
{
    try
    {
        return await _http.GetStringAsync(url, ct);
    }
    catch (Exception ex)
    {
        _logger.LogWarning(ex, "Provider {Provider}: HTTP fetch failed for {Url}", providerName, url);
        return null;
    }
}

/// <summary>
/// Loads <paramref name="url"/> in a freshly launched headless Chromium instance and returns the
/// page's HTML as reported by <c>page.ContentAsync()</c>.
/// </summary>
/// <param name="providerName">Provider name; used only in log messages.</param>
/// <param name="url">Search URL to navigate to.</param>
/// <param name="ct">
/// Cancellation token. Playwright calls take no token, so it is observed via
/// <see cref="Task.WaitAsync(CancellationToken)"/>; leaving the method then disposes the browser,
/// which tears down any navigation still in flight.
/// </param>
/// <returns>
/// The page HTML; whatever has rendered so far when the navigation times out; or <c>null</c> when
/// there is no response, the status is 400 or above, the operation is cancelled, or any other
/// exception occurs. This method does not throw.
/// </returns>
private async Task<string?> FetchWithPlaywrightAsync(string providerName, string url, CancellationToken ct)
{
    // Launching a browser is the expensive step; skip it when the caller has already given up.
    if (ct.IsCancellationRequested)
    {
        _logger.LogWarning("Provider {Provider}: Playwright fetch cancelled for {Url}", providerName, url);
        return null;
    }

    try
    {
        using var playwright = await Playwright.CreateAsync();

        // Launch is deliberately not wrapped in WaitAsync: abandoning it mid-flight would leave
        // a browser process with nothing holding a handle to dispose it.
        await using var browser = await playwright.Chromium.LaunchAsync(new BrowserTypeLaunchOptions
        {
            Headless = true,
            // NOTE(review): sandbox is disabled, presumably because the container runs Chromium as root — confirm.
            Args = ["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"]
        });

        var page = await browser.NewPageAsync().WaitAsync(ct);

        IResponse? response;
        try
        {
            // Wait for the network to go quiet so client-side rendered content is in the DOM.
            response = await page.GotoAsync(url, new PageGotoOptions
            {
                WaitUntil = WaitUntilState.NetworkIdle,
                Timeout = 30_000
            }).WaitAsync(ct);
        }
        catch (TimeoutException)
        {
            // NetworkIdle timed out — use whatever content rendered so far
            _logger.LogWarning("Provider {Provider}: Playwright NetworkIdle timeout for {Url}, using partial content", providerName, url);
            return await page.ContentAsync().WaitAsync(ct);
        }

        if (response is null || response.Status >= 400)
        {
            _logger.LogWarning("Provider {Provider}: Playwright got HTTP {Status} for {Url}", providerName, response?.Status, url);
            return null;
        }

        return await page.ContentAsync().WaitAsync(ct);
    }
    catch (OperationCanceledException) when (ct.IsCancellationRequested)
    {
        // Cancellation is reported the same way as any other failed fetch: no result, no throw.
        _logger.LogWarning("Provider {Provider}: Playwright fetch cancelled for {Url}", providerName, url);
        return null;
    }
    catch (Exception ex)
    {
        _logger.LogWarning(ex, "Provider {Provider}: Playwright fetch failed for {Url}", providerName, url);
        return null;
    }
}