feat(providers): add headless browser scraping via Playwright for SPA job sites
Build and Push Docker Images Staging / build (push) Successful in 5m20s
Build and Push Docker Images Staging / build (push) Successful in 5m20s
ejobs.ro migrated to a Nuxt SPA - plain HTTP GET returns only the JS bundle. This change equips cv-search-job with a headless Chromium (Playwright 1.60) so it can fully render SPA pages before extracting job links. - Add UseHeadlessBrowser flag to JobProviderEntity, JobProviderConfig, and CvSearchDbContext; map it in JobTokenService.ToConfig so the flag is included in the session provider-config snapshot - Migration: add UseHeadlessBrowser column; fix ejobs.ro search URL (remove /user/ prefix that caused 404) and set UseHeadlessBrowser=true - HtmlJobSearcher: detect flag and dispatch to FetchWithPlaywrightAsync; plain-HTTP path is unchanged; NetworkIdle timeout falls back to partial content rather than failing outright - Dockerfile: download Playwright Chromium in the SDK build stage via npx; copy browser binaries to the final image; install Chromium system libs (Ubuntu noble t64 variants) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -79,6 +79,7 @@ public sealed class CvSearchDbContext : DbContext
|
||||
entity.Property(x => x.InitialKeywordsJson).HasMaxLength(2000).HasDefaultValue("[]").IsRequired();
|
||||
entity.Property(x => x.MaxResults).HasDefaultValue(20);
|
||||
entity.Property(x => x.DisplayOrder).HasDefaultValue(0);
|
||||
entity.Property(x => x.UseHeadlessBrowser).HasDefaultValue(false);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -30,4 +30,7 @@ public sealed class JobProviderEntity
|
||||
|
||||
/// <summary>Controls display ordering in future admin UIs.</summary>
|
||||
public int DisplayOrder { get; set; }
|
||||
|
||||
/// <summary>When true, the scraper renders the page with headless Chromium instead of a plain HTTP GET.</summary>
|
||||
public bool UseHeadlessBrowser { get; set; }
|
||||
}
|
||||
|
||||
+234
@@ -0,0 +1,234 @@
|
||||
// <auto-generated />
|
||||
using System;
|
||||
using CvSearch.Data;
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using Microsoft.EntityFrameworkCore.Infrastructure;
|
||||
using Microsoft.EntityFrameworkCore.Metadata;
|
||||
using Microsoft.EntityFrameworkCore.Migrations;
|
||||
using Microsoft.EntityFrameworkCore.Storage.ValueConversion;
|
||||
|
||||
#nullable disable
|
||||
|
||||
namespace CvSearch.Data.Migrations
|
||||
{
|
||||
[DbContext(typeof(CvSearchDbContext))]
|
||||
[Migration("20260529170000_AddHeadlessBrowserToProviders")]
|
||||
partial class AddHeadlessBrowserToProviders
|
||||
{
|
||||
/// <inheritdoc />
|
||||
protected override void BuildTargetModel(ModelBuilder modelBuilder)
|
||||
{
|
||||
#pragma warning disable 612, 618
|
||||
modelBuilder
|
||||
.HasDefaultSchema("cvSearch")
|
||||
.HasAnnotation("ProductVersion", "10.0.7")
|
||||
.HasAnnotation("Relational:MaxIdentifierLength", 128);
|
||||
|
||||
SqlServerModelBuilderExtensions.UseIdentityColumns(modelBuilder);
|
||||
|
||||
modelBuilder.Entity("CvSearch.Data.Entities.JobProviderEntity", b =>
|
||||
{
|
||||
b.Property<int>("Id")
|
||||
.ValueGeneratedOnAdd()
|
||||
.HasColumnType("int");
|
||||
|
||||
SqlServerPropertyBuilderExtensions.UseIdentityColumn(b.Property<int>("Id"));
|
||||
|
||||
b.Property<int>("DisplayOrder")
|
||||
.ValueGeneratedOnAdd()
|
||||
.HasColumnType("int")
|
||||
.HasDefaultValue(0);
|
||||
|
||||
b.Property<bool>("Enabled")
|
||||
.HasColumnType("bit");
|
||||
|
||||
b.Property<string>("InitialKeywordsJson")
|
||||
.IsRequired()
|
||||
.ValueGeneratedOnAdd()
|
||||
.HasMaxLength(2000)
|
||||
.HasColumnType("nvarchar(2000)")
|
||||
.HasDefaultValue("[]");
|
||||
|
||||
b.Property<string>("JobLinkContains")
|
||||
.IsRequired()
|
||||
.HasMaxLength(256)
|
||||
.HasColumnType("nvarchar(256)");
|
||||
|
||||
b.Property<int>("MaxResults")
|
||||
.ValueGeneratedOnAdd()
|
||||
.HasColumnType("int")
|
||||
.HasDefaultValue(20);
|
||||
|
||||
b.Property<string>("Name")
|
||||
.IsRequired()
|
||||
.HasMaxLength(128)
|
||||
.HasColumnType("nvarchar(128)");
|
||||
|
||||
b.Property<string>("SearchUrlTemplate")
|
||||
.IsRequired()
|
||||
.HasMaxLength(1024)
|
||||
.HasColumnType("nvarchar(1024)");
|
||||
|
||||
b.Property<bool>("UseHeadlessBrowser")
|
||||
.ValueGeneratedOnAdd()
|
||||
.HasColumnType("bit")
|
||||
.HasDefaultValue(false);
|
||||
|
||||
b.HasKey("Id");
|
||||
|
||||
b.ToTable("JobProviders", "cvSearch");
|
||||
});
|
||||
|
||||
modelBuilder.Entity("CvSearch.Data.Entities.JobSearchResultEntity", b =>
|
||||
{
|
||||
b.Property<string>("Id")
|
||||
.HasMaxLength(64)
|
||||
.HasColumnType("nvarchar(64)");
|
||||
|
||||
b.Property<DateTime>("CreatedAt")
|
||||
.ValueGeneratedOnAdd()
|
||||
.HasColumnType("datetime2")
|
||||
.HasDefaultValueSql("SYSUTCDATETIME()");
|
||||
|
||||
b.Property<string>("JobText")
|
||||
.IsRequired()
|
||||
.HasColumnType("nvarchar(max)");
|
||||
|
||||
b.Property<string>("JobTitle")
|
||||
.IsRequired()
|
||||
.HasMaxLength(512)
|
||||
.HasColumnType("nvarchar(512)");
|
||||
|
||||
b.Property<string>("JobUrl")
|
||||
.IsRequired()
|
||||
.HasMaxLength(2048)
|
||||
.HasColumnType("nvarchar(2048)");
|
||||
|
||||
b.Property<string>("ProviderName")
|
||||
.IsRequired()
|
||||
.HasMaxLength(128)
|
||||
.HasColumnType("nvarchar(128)");
|
||||
|
||||
b.Property<string>("ResultJson")
|
||||
.IsRequired()
|
||||
.HasColumnType("nvarchar(max)");
|
||||
|
||||
b.Property<int>("Score")
|
||||
.HasColumnType("int");
|
||||
|
||||
b.Property<string>("SessionId")
|
||||
.IsRequired()
|
||||
.HasMaxLength(64)
|
||||
.HasColumnType("nvarchar(64)");
|
||||
|
||||
b.HasKey("Id");
|
||||
|
||||
b.HasIndex("SessionId");
|
||||
|
||||
b.ToTable("JobSearchResults", "cvSearch");
|
||||
});
|
||||
|
||||
modelBuilder.Entity("CvSearch.Data.Entities.JobSearchSessionEntity", b =>
|
||||
{
|
||||
b.Property<string>("Id")
|
||||
.HasMaxLength(64)
|
||||
.HasColumnType("nvarchar(64)");
|
||||
|
||||
b.Property<DateTime>("CreatedAt")
|
||||
.ValueGeneratedOnAdd()
|
||||
.HasColumnType("datetime2")
|
||||
.HasDefaultValueSql("SYSUTCDATETIME()");
|
||||
|
||||
b.Property<string>("CvDocumentId")
|
||||
.IsRequired()
|
||||
.HasMaxLength(64)
|
||||
.HasColumnType("nvarchar(64)");
|
||||
|
||||
b.Property<string>("Email")
|
||||
.IsRequired()
|
||||
.HasMaxLength(256)
|
||||
.HasColumnType("nvarchar(256)");
|
||||
|
||||
b.Property<string>("Keywords")
|
||||
.IsRequired()
|
||||
.HasMaxLength(1000)
|
||||
.HasColumnType("nvarchar(1000)");
|
||||
|
||||
b.Property<string>("Language")
|
||||
.IsRequired()
|
||||
.ValueGeneratedOnAdd()
|
||||
.HasMaxLength(8)
|
||||
.HasColumnType("nvarchar(8)")
|
||||
.HasDefaultValue("en");
|
||||
|
||||
b.Property<string>("ProviderConfigJson")
|
||||
.HasColumnType("nvarchar(max)");
|
||||
|
||||
b.Property<string>("Status")
|
||||
.IsRequired()
|
||||
.HasMaxLength(32)
|
||||
.HasColumnType("nvarchar(32)");
|
||||
|
||||
b.Property<string>("TokenId")
|
||||
.IsRequired()
|
||||
.HasMaxLength(64)
|
||||
.HasColumnType("nvarchar(64)");
|
||||
|
||||
b.HasKey("Id");
|
||||
|
||||
b.HasIndex("Status");
|
||||
|
||||
b.ToTable("JobSearchSessions", "cvSearch");
|
||||
});
|
||||
|
||||
modelBuilder.Entity("CvSearch.Data.Entities.JobSearchTokenEntity", b =>
|
||||
{
|
||||
b.Property<string>("Id")
|
||||
.HasMaxLength(64)
|
||||
.HasColumnType("nvarchar(64)");
|
||||
|
||||
b.Property<DateTime>("CreatedAt")
|
||||
.ValueGeneratedOnAdd()
|
||||
.HasColumnType("datetime2")
|
||||
.HasDefaultValueSql("SYSUTCDATETIME()");
|
||||
|
||||
b.Property<string>("CvDocumentId")
|
||||
.IsRequired()
|
||||
.HasMaxLength(64)
|
||||
.HasColumnType("nvarchar(64)");
|
||||
|
||||
b.Property<string>("Email")
|
||||
.IsRequired()
|
||||
.HasMaxLength(256)
|
||||
.HasColumnType("nvarchar(256)");
|
||||
|
||||
b.Property<DateTime>("ExpiresAt")
|
||||
.HasColumnType("datetime2");
|
||||
|
||||
b.Property<string>("Keywords")
|
||||
.IsRequired()
|
||||
.ValueGeneratedOnAdd()
|
||||
.HasMaxLength(1000)
|
||||
.HasColumnType("nvarchar(1000)")
|
||||
.HasDefaultValue("");
|
||||
|
||||
b.Property<string>("Language")
|
||||
.IsRequired()
|
||||
.ValueGeneratedOnAdd()
|
||||
.HasMaxLength(8)
|
||||
.HasColumnType("nvarchar(8)")
|
||||
.HasDefaultValue("en");
|
||||
|
||||
b.Property<bool>("Used")
|
||||
.ValueGeneratedOnAdd()
|
||||
.HasColumnType("bit")
|
||||
.HasDefaultValue(false);
|
||||
|
||||
b.HasKey("Id");
|
||||
|
||||
b.ToTable("JobSearchTokens", "cvSearch");
|
||||
});
|
||||
#pragma warning restore 612, 618
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,50 @@
|
||||
using Microsoft.EntityFrameworkCore.Migrations;
|
||||
|
||||
#nullable disable
|
||||
|
||||
namespace CvSearch.Data.Migrations
|
||||
{
|
||||
/// <inheritdoc />
|
||||
public partial class AddHeadlessBrowserToProviders : Migration
|
||||
{
|
||||
/// <inheritdoc />
|
||||
protected override void Up(MigrationBuilder migrationBuilder)
|
||||
{
|
||||
migrationBuilder.AddColumn<bool>(
|
||||
name: "UseHeadlessBrowser",
|
||||
schema: MigrationConstants.SchemaName,
|
||||
table: "JobProviders",
|
||||
type: "bit",
|
||||
nullable: false,
|
||||
defaultValue: false);
|
||||
|
||||
// ejobs.ro (Id=1) is a Nuxt SPA — the old /user/ URL 404s and plain HTTP GET
|
||||
// returns only the JS bundle, not actual job listings.
|
||||
// Fix: use the correct search URL and headless Chromium to render job results.
|
||||
migrationBuilder.UpdateData(
|
||||
schema: MigrationConstants.SchemaName,
|
||||
table: "JobProviders",
|
||||
keyColumn: "Id",
|
||||
keyValue: 1,
|
||||
columns: ["SearchUrlTemplate", "JobLinkContains", "UseHeadlessBrowser"],
|
||||
values: new object[] { "https://www.ejobs.ro/locuri-de-munca?q={keywords}", "/locuri-de-munca/", true });
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
protected override void Down(MigrationBuilder migrationBuilder)
|
||||
{
|
||||
migrationBuilder.UpdateData(
|
||||
schema: MigrationConstants.SchemaName,
|
||||
table: "JobProviders",
|
||||
keyColumn: "Id",
|
||||
keyValue: 1,
|
||||
columns: ["SearchUrlTemplate", "JobLinkContains", "UseHeadlessBrowser"],
|
||||
values: new object[] { "https://www.ejobs.ro/user/locuri-de-munca/?utm_source=myai&q={keywords}", "/user/locuri-de-munca/", false });
|
||||
|
||||
migrationBuilder.DropColumn(
|
||||
name: "UseHeadlessBrowser",
|
||||
schema: MigrationConstants.SchemaName,
|
||||
table: "JobProviders");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -66,6 +66,11 @@ namespace CvSearch.Data.Migrations
|
||||
.HasMaxLength(1024)
|
||||
.HasColumnType("nvarchar(1024)");
|
||||
|
||||
b.Property<bool>("UseHeadlessBrowser")
|
||||
.ValueGeneratedOnAdd()
|
||||
.HasColumnType("bit")
|
||||
.HasDefaultValue(false);
|
||||
|
||||
b.HasKey("Id");
|
||||
|
||||
b.ToTable("JobProviders", "cvSearch");
|
||||
|
||||
Reference in New Issue
Block a user