feat(providers): add headless browser scraping via Playwright for SPA job sites

ejobs.ro migrated to a Nuxt SPA, so a plain HTTP GET now returns only the JS bundle. This change equips cv-search-job with a headless Chromium (Playwright 1.60) so it can fully render SPA pages before extracting job links.

- Add UseHeadlessBrowser flag to JobProviderEntity, JobProviderConfig, and CvSearchDbContext; map it in JobTokenService.ToConfig so the flag is included in the session provider-config snapshot
- Migration: add UseHeadlessBrowser column; fix the ejobs.ro search URL (remove the /user/ prefix that caused a 404) and set UseHeadlessBrowser=true
- HtmlJobSearcher: detect the flag and dispatch to FetchWithPlaywrightAsync; the plain-HTTP path is unchanged; a NetworkIdle timeout falls back to partial content rather than failing outright
- Dockerfile: download Playwright Chromium in the SDK build stage via npx; copy the browser binaries to the final image; install Chromium system libs (Ubuntu noble t64 variants)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-29 13:42:52 +03:00
parent 209325ace5
commit e38f40732f
11 changed files with 391 additions and 22 deletions
@@ -21,4 +21,6 @@ public sealed class JobProviderConfig
    public string JobLinkContains { get; set; } = string.Empty;
    public List<string> InitialKeywords { get; set; } = [];
    public int MaxResults { get; set; } = 20;
+    /// <summary>When true the scraper uses a headless Chromium browser to render JS-heavy pages.</summary>
+    public bool UseHeadlessBrowser { get; set; }
 }
@@ -125,7 +125,8 @@ public sealed class JobTokenService : IJobTokenService
            SearchUrlTemplate = entity.SearchUrlTemplate,
            JobLinkContains = entity.JobLinkContains,
            InitialKeywords = keywords,
-            MaxResults = entity.MaxResults
+            MaxResults = entity.MaxResults,
+            UseHeadlessBrowser = entity.UseHeadlessBrowser
        };
    }

@@ -79,6 +79,7 @@ public sealed class CvSearchDbContext : DbContext
            entity.Property(x => x.InitialKeywordsJson).HasMaxLength(2000).HasDefaultValue("[]").IsRequired();
            entity.Property(x => x.MaxResults).HasDefaultValue(20);
            entity.Property(x => x.DisplayOrder).HasDefaultValue(0);
+            entity.Property(x => x.UseHeadlessBrowser).HasDefaultValue(false);
        });
    }
 }
@@ -30,4 +30,7 @@ public sealed class JobProviderEntity

    /// <summary>Controls display ordering in future admin UIs.</summary>
    public int DisplayOrder { get; set; }
+
+    /// <summary>When true, the scraper renders the page with headless Chromium instead of a plain HTTP GET.</summary>
+    public bool UseHeadlessBrowser { get; set; }
 }
@@ -0,0 +1,234 @@
+// <auto-generated />
+using System;
+using CvSearch.Data;
+using Microsoft.EntityFrameworkCore;
+using Microsoft.EntityFrameworkCore.Infrastructure;
+using Microsoft.EntityFrameworkCore.Metadata;
+using Microsoft.EntityFrameworkCore.Migrations;
+using Microsoft.EntityFrameworkCore.Storage.ValueConversion;
+
+#nullable disable
+
+namespace CvSearch.Data.Migrations
+{
+    /// <summary>
+    /// Designer half of the <c>AddHeadlessBrowserToProviders</c> migration. Its
+    /// <c>BuildTargetModel</c> override describes the four <c>cvSearch</c>-schema entities
+    /// (providers, search results, search sessions, search tokens) for this migration.
+    /// </summary>
+    [DbContext(typeof(CvSearchDbContext))]
+    [Migration("20260529170000_AddHeadlessBrowserToProviders")]
+    partial class AddHeadlessBrowserToProviders
+    {
+        /// <inheritdoc />
+        protected override void BuildTargetModel(ModelBuilder modelBuilder)
+        {
+#pragma warning disable 612, 618
+            modelBuilder
+                .HasDefaultSchema("cvSearch")
+                .HasAnnotation("ProductVersion", "10.0.7")
+                .HasAnnotation("Relational:MaxIdentifierLength", 128);
+
+            SqlServerModelBuilderExtensions.UseIdentityColumns(modelBuilder);
+
+            // JobProviders: one row per scrape target (URL template, link filter, result cap).
+            modelBuilder.Entity("CvSearch.Data.Entities.JobProviderEntity", b =>
+                {
+                    b.Property<int>("Id")
+                        .ValueGeneratedOnAdd()
+                        .HasColumnType("int");
+
+                    SqlServerPropertyBuilderExtensions.UseIdentityColumn(b.Property<int>("Id"));
+
+                    b.Property<int>("DisplayOrder")
+                        .ValueGeneratedOnAdd()
+                        .HasColumnType("int")
+                        .HasDefaultValue(0);
+
+                    b.Property<bool>("Enabled")
+                        .HasColumnType("bit");
+
+                    b.Property<string>("InitialKeywordsJson")
+                        .IsRequired()
+                        .ValueGeneratedOnAdd()
+                        .HasMaxLength(2000)
+                        .HasColumnType("nvarchar(2000)")
+                        .HasDefaultValue("[]");
+
+                    b.Property<string>("JobLinkContains")
+                        .IsRequired()
+                        .HasMaxLength(256)
+                        .HasColumnType("nvarchar(256)");
+
+                    b.Property<int>("MaxResults")
+                        .ValueGeneratedOnAdd()
+                        .HasColumnType("int")
+                        .HasDefaultValue(20);
+
+                    b.Property<string>("Name")
+                        .IsRequired()
+                        .HasMaxLength(128)
+                        .HasColumnType("nvarchar(128)");
+
+                    b.Property<string>("SearchUrlTemplate")
+                        .IsRequired()
+                        .HasMaxLength(1024)
+                        .HasColumnType("nvarchar(1024)");
+
+                    // The column this migration introduces: a bit column that defaults to false.
+                    b.Property<bool>("UseHeadlessBrowser")
+                        .ValueGeneratedOnAdd()
+                        .HasColumnType("bit")
+                        .HasDefaultValue(false);
+
+                    b.HasKey("Id");
+
+                    b.ToTable("JobProviders", "cvSearch");
+                });
+
+            // JobSearchResults: scored job postings, indexed by the session they belong to.
+            modelBuilder.Entity("CvSearch.Data.Entities.JobSearchResultEntity", b =>
+                {
+                    b.Property<string>("Id")
+                        .HasMaxLength(64)
+                        .HasColumnType("nvarchar(64)");
+
+                    b.Property<DateTime>("CreatedAt")
+                        .ValueGeneratedOnAdd()
+                        .HasColumnType("datetime2")
+                        .HasDefaultValueSql("SYSUTCDATETIME()");
+
+                    b.Property<string>("JobText")
+                        .IsRequired()
+                        .HasColumnType("nvarchar(max)");
+
+                    b.Property<string>("JobTitle")
+                        .IsRequired()
+                        .HasMaxLength(512)
+                        .HasColumnType("nvarchar(512)");
+
+                    b.Property<string>("JobUrl")
+                        .IsRequired()
+                        .HasMaxLength(2048)
+                        .HasColumnType("nvarchar(2048)");
+
+                    b.Property<string>("ProviderName")
+                        .IsRequired()
+                        .HasMaxLength(128)
+                        .HasColumnType("nvarchar(128)");
+
+                    b.Property<string>("ResultJson")
+                        .IsRequired()
+                        .HasColumnType("nvarchar(max)");
+
+                    b.Property<int>("Score")
+                        .HasColumnType("int");
+
+                    b.Property<string>("SessionId")
+                        .IsRequired()
+                        .HasMaxLength(64)
+                        .HasColumnType("nvarchar(64)");
+
+                    b.HasKey("Id");
+
+                    b.HasIndex("SessionId");
+
+                    b.ToTable("JobSearchResults", "cvSearch");
+                });
+
+            // JobSearchSessions: one search run; ProviderConfigJson is the only nullable column here.
+            modelBuilder.Entity("CvSearch.Data.Entities.JobSearchSessionEntity", b =>
+                {
+                    b.Property<string>("Id")
+                        .HasMaxLength(64)
+                        .HasColumnType("nvarchar(64)");
+
+                    b.Property<DateTime>("CreatedAt")
+                        .ValueGeneratedOnAdd()
+                        .HasColumnType("datetime2")
+                        .HasDefaultValueSql("SYSUTCDATETIME()");
+
+                    b.Property<string>("CvDocumentId")
+                        .IsRequired()
+                        .HasMaxLength(64)
+                        .HasColumnType("nvarchar(64)");
+
+                    b.Property<string>("Email")
+                        .IsRequired()
+                        .HasMaxLength(256)
+                        .HasColumnType("nvarchar(256)");
+
+                    b.Property<string>("Keywords")
+                        .IsRequired()
+                        .HasMaxLength(1000)
+                        .HasColumnType("nvarchar(1000)");
+
+                    b.Property<string>("Language")
+                        .IsRequired()
+                        .ValueGeneratedOnAdd()
+                        .HasMaxLength(8)
+                        .HasColumnType("nvarchar(8)")
+                        .HasDefaultValue("en");
+
+                    b.Property<string>("ProviderConfigJson")
+                        .HasColumnType("nvarchar(max)");
+
+                    b.Property<string>("Status")
+                        .IsRequired()
+                        .HasMaxLength(32)
+                        .HasColumnType("nvarchar(32)");
+
+                    b.Property<string>("TokenId")
+                        .IsRequired()
+                        .HasMaxLength(64)
+                        .HasColumnType("nvarchar(64)");
+
+                    b.HasKey("Id");
+
+                    b.HasIndex("Status");
+
+                    b.ToTable("JobSearchSessions", "cvSearch");
+                });
+
+            // JobSearchTokens: tokens carrying an expiry timestamp and a Used flag (default false).
+            modelBuilder.Entity("CvSearch.Data.Entities.JobSearchTokenEntity", b =>
+                {
+                    b.Property<string>("Id")
+                        .HasMaxLength(64)
+                        .HasColumnType("nvarchar(64)");
+
+                    b.Property<DateTime>("CreatedAt")
+                        .ValueGeneratedOnAdd()
+                        .HasColumnType("datetime2")
+                        .HasDefaultValueSql("SYSUTCDATETIME()");
+
+                    b.Property<string>("CvDocumentId")
+                        .IsRequired()
+                        .HasMaxLength(64)
+                        .HasColumnType("nvarchar(64)");
+
+                    b.Property<string>("Email")
+                        .IsRequired()
+                        .HasMaxLength(256)
+                        .HasColumnType("nvarchar(256)");
+
+                    b.Property<DateTime>("ExpiresAt")
+                        .HasColumnType("datetime2");
+
+                    b.Property<string>("Keywords")
+                        .IsRequired()
+                        .ValueGeneratedOnAdd()
+                        .HasMaxLength(1000)
+                        .HasColumnType("nvarchar(1000)")
+                        .HasDefaultValue("");
+
+                    b.Property<string>("Language")
+                        .IsRequired()
+                        .ValueGeneratedOnAdd()
+                        .HasMaxLength(8)
+                        .HasColumnType("nvarchar(8)")
+                        .HasDefaultValue("en");
+
+                    b.Property<bool>("Used")
+                        .ValueGeneratedOnAdd()
+                        .HasColumnType("bit")
+                        .HasDefaultValue(false);
+
+                    b.HasKey("Id");
+
+                    b.ToTable("JobSearchTokens", "cvSearch");
+                });
+#pragma warning restore 612, 618
+        }
+    }
+}
@@ -0,0 +1,50 @@
+using Microsoft.EntityFrameworkCore.Migrations;
+
+#nullable disable
+
+namespace CvSearch.Data.Migrations
+{
+    /// <summary>
+    /// Adds the <c>UseHeadlessBrowser</c> column to <c>JobProviders</c> and rewrites the provider row
+    /// with Id 1 (ejobs.ro) so it uses the working search URL with headless rendering switched on.
+    /// </summary>
+    public partial class AddHeadlessBrowserToProviders : Migration
+    {
+        private const string ProvidersTable = "JobProviders";
+        private const string HeadlessColumn = "UseHeadlessBrowser";
+
+        /// <summary>Primary key of the ejobs.ro row in <c>JobProviders</c>.</summary>
+        private const int EjobsProviderId = 1;
+
+        /// <inheritdoc />
+        protected override void Up(MigrationBuilder migrationBuilder)
+        {
+            migrationBuilder.AddColumn<bool>(
+                name: HeadlessColumn,
+                schema: MigrationConstants.SchemaName,
+                table: ProvidersTable,
+                type: "bit",
+                nullable: false,
+                defaultValue: false);
+
+            // ejobs.ro is a Nuxt SPA: the former /user/ URL returns 404 and a plain HTTP GET yields
+            // only the JS bundle, so the row gets the corrected URL plus headless Chromium rendering.
+            RewriteEjobsProvider(
+                migrationBuilder,
+                searchUrlTemplate: "https://www.ejobs.ro/locuri-de-munca?q={keywords}",
+                jobLinkContains: "/locuri-de-munca/",
+                useHeadlessBrowser: true);
+        }
+
+        /// <inheritdoc />
+        protected override void Down(MigrationBuilder migrationBuilder)
+        {
+            // Restore the previous row values first, while the column still exists.
+            RewriteEjobsProvider(
+                migrationBuilder,
+                searchUrlTemplate: "https://www.ejobs.ro/user/locuri-de-munca/?utm_source=myai&q={keywords}",
+                jobLinkContains: "/user/locuri-de-munca/",
+                useHeadlessBrowser: false);
+
+            migrationBuilder.DropColumn(
+                name: HeadlessColumn,
+                schema: MigrationConstants.SchemaName,
+                table: ProvidersTable);
+        }
+
+        /// <summary>Queues an update of the ejobs.ro row's URL template, link filter and headless flag.</summary>
+        private static void RewriteEjobsProvider(
+            MigrationBuilder migrationBuilder,
+            string searchUrlTemplate,
+            string jobLinkContains,
+            bool useHeadlessBrowser)
+        {
+            migrationBuilder.UpdateData(
+                schema: MigrationConstants.SchemaName,
+                table: ProvidersTable,
+                keyColumn: "Id",
+                keyValue: EjobsProviderId,
+                columns: ["SearchUrlTemplate", "JobLinkContains", HeadlessColumn],
+                values: new object[] { searchUrlTemplate, jobLinkContains, useHeadlessBrowser });
+        }
+    }
+}
@@ -66,6 +66,11 @@ namespace CvSearch.Data.Migrations
                        .HasMaxLength(1024)
                        .HasColumnType("nvarchar(1024)");

+                    b.Property<bool>("UseHeadlessBrowser")
+                        .ValueGeneratedOnAdd()
+                        .HasColumnType("bit")
+                        .HasDefaultValue(false);
+
                    b.HasKey("Id");

                    b.ToTable("JobProviders", "cvSearch");
@@ -32,6 +32,8 @@
    <PackageVersion Include="Yarp.ReverseProxy" Version="2.3.0" />
    <PackageVersion Include="MailKit" Version="4.16.0" />
    <PackageVersion Include="PdfPig" Version="0.1.14" />
+    <!-- Browser automation -->
+    <PackageVersion Include="Microsoft.Playwright" Version="1.60.0" />
    <!-- Tooling -->
    <PackageVersion Include="Microsoft.VisualStudio.Azure.Containers.Tools.Targets" Version="1.23.0" />
  </ItemGroup>
@@ -29,9 +29,28 @@ COPY Helpers/startup-helpers/ Helpers/startup-helpers/

 RUN dotnet publish Jobs/cv-search-job/cv-search-job.csproj -c $BUILD_CONFIGURATION -o /app/publish /p:UseAppHost=false

+# Download Playwright Chromium browser in the build stage.
+# Node.js is only needed here to run npx — it is not copied to the final image.
+ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
+RUN apt-get update && apt-get install -y --no-install-recommends nodejs npm \
+    && npx --yes playwright@1.60.0 install chromium \
+    && rm -rf /var/lib/apt/lists/*
+
 FROM mcr.microsoft.com/dotnet/aspnet:10.0 AS final
 WORKDIR /app

+# System libraries required by Chromium on Ubuntu 24.04 (noble) — note the t64 package names
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 \
+    libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 \
+    libgbm1 libasound2t64 libpango-1.0-0 libcairo2 libatspi2.0-0 \
+    libwayland-client0 libx11-xcb1 libx11-6 libxcb1 libxext6 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy the Playwright Chromium browser from the build stage
+ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
+COPY --from=build /ms-playwright /ms-playwright
+
 COPY --from=build /app/publish .

 ENTRYPOINT ["dotnet", "cv-search-job.dll"]
@@ -1,6 +1,7 @@
 using System.Text.RegularExpressions;
 using System.Web;
 using CvMatcher.Models.Settings;
+using Microsoft.Playwright;
 using Microsoft.Extensions.Logging;

 namespace CvSearchJob.Services;
@@ -9,6 +10,7 @@ namespace CvSearchJob.Services;
 /// Config-driven HTML scraper that fetches a provider's job listing page and extracts matching job URLs.
 /// Uses a two-stage anchor filter: href must contain the provider's link pattern, and anchor text must
 /// contain at least one CV keyword.
+/// Supports both plain HTTP GET (default) and headless Chromium rendering for JS-heavy SPAs.
 /// </summary>
 public sealed class HtmlJobSearcher
 {
@@ -28,10 +30,6 @@ public sealed class HtmlJobSearcher
    /// tags, applies the two-stage filter, and returns up to <see cref="JobProviderConfig.MaxResults"/> absolute URLs.
    /// Returns an empty list when the HTTP request fails rather than throwing.
    /// </summary>
-    /// <param name="provider">Provider configuration including search URL template, link filter, and result cap.</param>
-    /// <param name="cvKeywords">Keywords extracted from the user's CV to inject into the search query.</param>
-    /// <param name="ct">Cancellation token.</param>
-    /// <returns>Deduplicated list of absolute job page URLs (query string stripped).</returns>
    public async Task<IReadOnlyList<string>> SearchJobUrlsAsync(
        JobProviderConfig provider,
        IReadOnlyList<string> cvKeywords,
@@ -53,26 +51,25 @@ public sealed class HtmlJobSearcher
        var searchUrl = provider.SearchUrlTemplate.Replace("{keywords}", keywordsEncoded);

        _logger.LogInformation(
-            "Provider {Provider}: fetching {Url} | CV keywords: [{Keywords}]",
-            provider.Name, searchUrl, string.Join(", ", cvKeywords));
+            "Provider {Provider}: fetching {Url} [{Mode}] | CV keywords: [{Keywords}]",
+            provider.Name, searchUrl,
+            provider.UseHeadlessBrowser ? "headless" : "http",
+            string.Join(", ", cvKeywords));

-        string html;
-        try
-        {
-            html = await _http.GetStringAsync(searchUrl, ct);
-            _logger.LogInformation("Provider {Provider}: received {Length} chars of HTML", provider.Name, html.Length);
-        }
-        catch (Exception ex)
-        {
-            _logger.LogWarning(ex, "Provider {Provider}: HTTP fetch failed for {Url}", provider.Name, searchUrl);
-            return [];
-        }
+        string? html;
+        if (provider.UseHeadlessBrowser)
+            html = await FetchWithPlaywrightAsync(provider.Name, searchUrl, ct);
+        else
+            html = await FetchWithHttpAsync(provider.Name, searchUrl, ct);
+
+        if (html is null) return [];
+
+        _logger.LogInformation("Provider {Provider}: received {Length} chars of HTML", provider.Name, html.Length);

        var baseUri = new Uri(searchUrl);
        var results = new List<string>();
        var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

-        // Match all anchor tags capturing href and inner text
        var anchorPattern = new Regex(@"<a[^>]+href=[""']([^""']+)[""'][^>]*>(.*?)</a>",
            RegexOptions.IgnoreCase | RegexOptions.Singleline);

@@ -92,7 +89,6 @@ public sealed class HtmlJobSearcher

            stage1Pass++;

-            // Stage 2: anchor text must contain at least one CV keyword
            if (!cvKeywords.Any(k => anchorText.Contains(k, StringComparison.OrdinalIgnoreCase)))
            {
                _logger.LogDebug(
@@ -103,14 +99,12 @@ public sealed class HtmlJobSearcher

            stage2Pass++;

-            // Make absolute URL
            if (!Uri.TryCreate(href, UriKind.Absolute, out var absoluteUri))
            {
                if (!Uri.TryCreate(baseUri, href, out absoluteUri))
                    continue;
            }

-            // Strip query string and fragment so different tracking variants of the same URL collapse to one.
            var url = absoluteUri.GetLeftPart(UriPartial.Path);
            if (seen.Add(url))
                results.Add(url);
@@ -122,4 +116,61 @@ public sealed class HtmlJobSearcher

        return results;
    }
+
+    /// <summary>
+    /// Requests <paramref name="url"/> through <c>_http</c> and returns the response body as text.
+    /// </summary>
+    /// <param name="providerName">Provider name, used only in the failure log entry.</param>
+    /// <param name="url">Fully expanded search URL to request.</param>
+    /// <param name="ct">Cancellation token forwarded to the HTTP call.</param>
+    /// <returns>The response body, or <c>null</c> when the request throws (the failure is logged as a warning).</returns>
+    private async Task<string?> FetchWithHttpAsync(string providerName, string url, CancellationToken ct)
+    {
+        string? body = null;
+
+        try
+        {
+            body = await _http.GetStringAsync(url, ct);
+        }
+        catch (Exception fetchError)
+        {
+            // Best effort: a failing provider must not abort the whole search run.
+            _logger.LogWarning(fetchError, "Provider {Provider}: HTTP fetch failed for {Url}", providerName, url);
+        }
+
+        return body;
+    }
+
+    /// <summary>
+    /// Renders <paramref name="url"/> in headless Chromium and returns the resulting page HTML.
+    /// </summary>
+    /// <param name="providerName">Provider name, used only in log messages.</param>
+    /// <param name="url">Fully expanded search URL to navigate to.</param>
+    /// <param name="ct">
+    /// Cancellation token. Playwright calls do not accept one, so cancellation is honoured by skipping
+    /// the launch when already signalled and by closing the browser to abort in-flight work.
+    /// </param>
+    /// <returns>
+    /// The rendered HTML (possibly partial when the network-idle wait times out), or <c>null</c> when
+    /// the operation is cancelled, navigation yields no response or an HTTP status of 400 or above,
+    /// or any other error occurs. Never throws.
+    /// </returns>
+    private async Task<string?> FetchWithPlaywrightAsync(string providerName, string url, CancellationToken ct)
+    {
+        const float NavigationTimeoutMs = 30_000;
+
+        // Launching Chromium is expensive; do not start it for work that is already cancelled.
+        if (ct.IsCancellationRequested)
+        {
+            _logger.LogInformation("Provider {Provider}: Playwright fetch cancelled for {Url}", providerName, url);
+            return null;
+        }
+
+        try
+        {
+            using var playwright = await Playwright.CreateAsync();
+            await using var browser = await playwright.Chromium.LaunchAsync(new BrowserTypeLaunchOptions
+            {
+                Headless = true,
+                Args = ["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"]
+            });
+
+            // Closing the browser makes any pending Playwright call fail fast, which is the only way
+            // to interrupt navigation. Declared after the browser so it is disposed before it.
+            using var cancelRegistration = ct.Register(() => { _ = browser.CloseAsync(); });
+
+            var page = await browser.NewPageAsync();
+
+            IResponse? response;
+            try
+            {
+                response = await page.GotoAsync(url, new PageGotoOptions
+                {
+                    WaitUntil = WaitUntilState.NetworkIdle,
+                    Timeout = NavigationTimeoutMs
+                });
+            }
+            catch (TimeoutException)
+            {
+                // NetworkIdle timed out — use whatever content rendered so far
+                _logger.LogWarning("Provider {Provider}: Playwright NetworkIdle timeout for {Url}, using partial content", providerName, url);
+                return await page.ContentAsync();
+            }
+
+            if (response is null || response.Status >= 400)
+            {
+                _logger.LogWarning("Provider {Provider}: Playwright got HTTP {Status} for {Url}", providerName, response?.Status, url);
+                return null;
+            }
+
+            return await page.ContentAsync();
+        }
+        catch (Exception) when (ct.IsCancellationRequested)
+        {
+            // The browser was closed by the cancellation callback; this is not a provider failure.
+            _logger.LogInformation("Provider {Provider}: Playwright fetch cancelled for {Url}", providerName, url);
+            return null;
+        }
+        catch (Exception ex)
+        {
+            _logger.LogWarning(ex, "Provider {Provider}: Playwright fetch failed for {Url}", providerName, url);
+            return null;
+        }
+    }
 }
@@ -13,6 +13,7 @@
    <PackageReference Include="Microsoft.Extensions.Hosting" />
    <PackageReference Include="Microsoft.EntityFrameworkCore.SqlServer" />
    <PackageReference Include="Refit.HttpClientFactory" />
+    <PackageReference Include="Microsoft.Playwright" />
  </ItemGroup>

  <ItemGroup>