feat(providers): add headless browser scraping via Playwright for SPA job sites
Build and Push Docker Images Staging / build (push) Successful in 5m20s
Build and Push Docker Images Staging / build (push) Successful in 5m20s
ejobs.ro migrated to a Nuxt SPA - plain HTTP GET returns only the JS bundle. This change equips cv-search-job with a headless Chromium (Playwright 1.60) so it can fully render SPA pages before extracting job links. - Add UseHeadlessBrowser flag to JobProviderEntity, JobProviderConfig, and CvSearchDbContext; map it in JobTokenService.ToConfig so the flag is included in the session provider-config snapshot - Migration: add UseHeadlessBrowser column; fix ejobs.ro search URL (remove /user/ prefix that caused 404) and set UseHeadlessBrowser=true - HtmlJobSearcher: detect flag and dispatch to FetchWithPlaywrightAsync; plain-HTTP path is unchanged; NetworkIdle timeout falls back to partial content rather than failing outright - Dockerfile: download Playwright Chromium in the SDK build stage via npx; copy browser binaries to the final image; install Chromium system libs (Ubuntu noble t64 variants) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -29,9 +29,28 @@ COPY Helpers/startup-helpers/ Helpers/startup-helpers/
|
||||
|
||||
RUN dotnet publish Jobs/cv-search-job/cv-search-job.csproj -c $BUILD_CONFIGURATION -o /app/publish /p:UseAppHost=false
|
||||
|
||||
# Download Playwright Chromium browser in the build stage.
|
||||
# Node.js is only needed here to run npx — it is not copied to the final image.
|
||||
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends nodejs npm \
|
||||
&& npx --yes playwright@1.60.0 install chromium \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
FROM mcr.microsoft.com/dotnet/aspnet:10.0 AS final
|
||||
WORKDIR /app
|
||||
|
||||
# System libraries required by Chromium on Ubuntu noble (note the t64 package names, e.g. libasound2t64)
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 \
|
||||
libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 \
|
||||
libgbm1 libasound2t64 libpango-1.0-0 libcairo2 libatspi2.0-0 \
|
||||
libwayland-client0 libx11-xcb1 libx11-6 libxcb1 libxext6 \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Copy the Playwright Chromium browser from the build stage
|
||||
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
|
||||
COPY --from=build /ms-playwright /ms-playwright
|
||||
|
||||
COPY --from=build /app/publish .
|
||||
|
||||
ENTRYPOINT ["dotnet", "cv-search-job.dll"]
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
using System.Text.RegularExpressions;
|
||||
using System.Web;
|
||||
using CvMatcher.Models.Settings;
|
||||
using Microsoft.Playwright;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace CvSearchJob.Services;
|
||||
@@ -9,6 +10,7 @@ namespace CvSearchJob.Services;
|
||||
/// Config-driven HTML scraper that fetches a provider's job listing page and extracts matching job URLs.
|
||||
/// Uses a two-stage anchor filter: href must contain the provider's link pattern, and anchor text must
|
||||
/// contain at least one CV keyword.
|
||||
/// Supports both plain HTTP GET (default) and headless Chromium rendering for JS-heavy SPAs.
|
||||
/// </summary>
|
||||
public sealed class HtmlJobSearcher
|
||||
{
|
||||
@@ -28,10 +30,6 @@ public sealed class HtmlJobSearcher
|
||||
/// tags, applies the two-stage filter, and returns up to <see cref="JobProviderConfig.MaxResults"/> absolute URLs.
|
||||
/// Returns an empty list when the page fetch (plain HTTP or headless browser) fails rather than throwing.
|
||||
/// </summary>
|
||||
/// <param name="provider">Provider configuration including search URL template, link filter, and result cap.</param>
|
||||
/// <param name="cvKeywords">Keywords extracted from the user's CV to inject into the search query.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Deduplicated list of absolute job page URLs (query string stripped).</returns>
|
||||
public async Task<IReadOnlyList<string>> SearchJobUrlsAsync(
|
||||
JobProviderConfig provider,
|
||||
IReadOnlyList<string> cvKeywords,
|
||||
@@ -53,26 +51,25 @@ public sealed class HtmlJobSearcher
|
||||
var searchUrl = provider.SearchUrlTemplate.Replace("{keywords}", keywordsEncoded);
|
||||
|
||||
_logger.LogInformation(
|
||||
"Provider {Provider}: fetching {Url} | CV keywords: [{Keywords}]",
|
||||
provider.Name, searchUrl, string.Join(", ", cvKeywords));
|
||||
"Provider {Provider}: fetching {Url} [{Mode}] | CV keywords: [{Keywords}]",
|
||||
provider.Name, searchUrl,
|
||||
provider.UseHeadlessBrowser ? "headless" : "http",
|
||||
string.Join(", ", cvKeywords));
|
||||
|
||||
string html;
|
||||
try
|
||||
{
|
||||
html = await _http.GetStringAsync(searchUrl, ct);
|
||||
_logger.LogInformation("Provider {Provider}: received {Length} chars of HTML", provider.Name, html.Length);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Provider {Provider}: HTTP fetch failed for {Url}", provider.Name, searchUrl);
|
||||
return [];
|
||||
}
|
||||
string? html;
|
||||
if (provider.UseHeadlessBrowser)
|
||||
html = await FetchWithPlaywrightAsync(provider.Name, searchUrl, ct);
|
||||
else
|
||||
html = await FetchWithHttpAsync(provider.Name, searchUrl, ct);
|
||||
|
||||
if (html is null) return [];
|
||||
|
||||
_logger.LogInformation("Provider {Provider}: received {Length} chars of HTML", provider.Name, html.Length);
|
||||
|
||||
var baseUri = new Uri(searchUrl);
|
||||
var results = new List<string>();
|
||||
var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
|
||||
|
||||
// Match all anchor tags capturing href and inner text
|
||||
var anchorPattern = new Regex(@"<a[^>]+href=[""']([^""']+)[""'][^>]*>(.*?)</a>",
|
||||
RegexOptions.IgnoreCase | RegexOptions.Singleline);
|
||||
|
||||
@@ -92,7 +89,6 @@ public sealed class HtmlJobSearcher
|
||||
|
||||
stage1Pass++;
|
||||
|
||||
// Stage 2: anchor text must contain at least one CV keyword
|
||||
if (!cvKeywords.Any(k => anchorText.Contains(k, StringComparison.OrdinalIgnoreCase)))
|
||||
{
|
||||
_logger.LogDebug(
|
||||
@@ -103,14 +99,12 @@ public sealed class HtmlJobSearcher
|
||||
|
||||
stage2Pass++;
|
||||
|
||||
// Make absolute URL
|
||||
if (!Uri.TryCreate(href, UriKind.Absolute, out var absoluteUri))
|
||||
{
|
||||
if (!Uri.TryCreate(baseUri, href, out absoluteUri))
|
||||
continue;
|
||||
}
|
||||
|
||||
// Strip query string and fragment so different tracking variants of the same URL collapse to one.
|
||||
var url = absoluteUri.GetLeftPart(UriPartial.Path);
|
||||
if (seen.Add(url))
|
||||
results.Add(url);
|
||||
@@ -122,4 +116,61 @@ public sealed class HtmlJobSearcher
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
/// <summary>
/// Downloads <paramref name="url"/> with a plain HTTP GET and returns the response body as a string.
/// Any exception raised by the request is logged as a warning and reported to the caller as <c>null</c>.
/// </summary>
/// <param name="providerName">Provider name, used only in the warning log entry.</param>
/// <param name="url">Absolute URL of the page to download.</param>
/// <param name="ct">Cancellation token forwarded to the HTTP call.</param>
/// <returns>The response body, or <c>null</c> when the request threw.</returns>
private async Task<string?> FetchWithHttpAsync(string providerName, string url, CancellationToken ct)
{
    // Stays null unless the request completes successfully.
    string? body = null;

    try
    {
        body = await _http.GetStringAsync(url, ct);
    }
    catch (Exception fetchError)
    {
        // Best effort: the caller treats null as "no results from this provider".
        _logger.LogWarning(fetchError, "Provider {Provider}: HTTP fetch failed for {Url}", providerName, url);
    }

    return body;
}
|
||||
|
||||
/// <summary>
/// Loads <paramref name="url"/> in a freshly launched headless Chromium instance, waits for the
/// network to go idle, and returns the rendered page content as HTML.
/// </summary>
/// <remarks>
/// If the network-idle wait times out, whatever content the page holds at that point is returned
/// instead of failing. A missing response, an HTTP status of 400 or above, cancellation, or any other
/// exception is logged as a warning and reported to the caller as <c>null</c>.
/// </remarks>
/// <param name="providerName">Provider name, used only in log entries.</param>
/// <param name="url">Absolute URL of the page to render.</param>
/// <param name="ct">
/// Cancellation token. The Playwright calls used here take no token, so cancellation is honoured by
/// skipping the launch when already cancelled and by closing the page to abort a navigation in flight.
/// </param>
/// <returns>The rendered HTML, or <c>null</c> when the page could not be fetched.</returns>
private async Task<string?> FetchWithPlaywrightAsync(string providerName, string url, CancellationToken ct)
{
    // Upper bound, in milliseconds, for navigation plus the network-idle wait.
    const float NavigationTimeoutMs = 30_000;

    // Do not pay for a Chromium launch when the job has already been cancelled.
    if (ct.IsCancellationRequested)
    {
        _logger.LogWarning("Provider {Provider}: Playwright fetch cancelled before start for {Url}", providerName, url);
        return null;
    }

    try
    {
        using var playwright = await Playwright.CreateAsync();

        // NOTE(review): the Chromium sandbox is disabled here; presumably required because the
        // container runs as root. Confirm, since the browser renders third-party pages.
        await using var browser = await playwright.Chromium.LaunchAsync(new BrowserTypeLaunchOptions
        {
            Headless = true,
            Args = ["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"]
        });

        var page = await browser.NewPageAsync();

        // Closing the page makes a pending GotoAsync/ContentAsync throw, which the outer catch
        // turns into a null result, so cancellation no longer waits out the full timeout.
        async Task ClosePageOnCancelAsync()
        {
            try
            {
                await page.CloseAsync();
            }
            catch (Exception closeError)
            {
                // The page or browser is already gone; nothing left to abort.
                _logger.LogDebug(closeError, "Provider {Provider}: closing page on cancellation failed for {Url}", providerName, url);
            }
        }

        // Declared after the browser so it is disposed first, before the browser shuts down.
        using var cancelRegistration = ct.Register(() => _ = ClosePageOnCancelAsync());

        IResponse? response;
        try
        {
            response = await page.GotoAsync(url, new PageGotoOptions
            {
                WaitUntil = WaitUntilState.NetworkIdle,
                Timeout = NavigationTimeoutMs
            });
        }
        catch (TimeoutException)
        {
            // NetworkIdle timed out — use whatever content rendered so far
            _logger.LogWarning("Provider {Provider}: Playwright NetworkIdle timeout for {Url}, using partial content", providerName, url);
            return await page.ContentAsync();
        }

        if (response is null || response.Status >= 400)
        {
            _logger.LogWarning("Provider {Provider}: Playwright got HTTP {Status} for {Url}", providerName, response?.Status, url);
            return null;
        }

        return await page.ContentAsync();
    }
    catch (Exception ex)
    {
        // Covers launch failures, navigation errors and the page being closed on cancellation.
        _logger.LogWarning(ex, "Provider {Provider}: Playwright fetch failed for {Url}", providerName, url);
        return null;
    }
}
|
||||
}
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
<PackageReference Include="Microsoft.Extensions.Hosting" />
|
||||
<PackageReference Include="Microsoft.EntityFrameworkCore.SqlServer" />
|
||||
<PackageReference Include="Refit.HttpClientFactory" />
|
||||
<PackageReference Include="Microsoft.Playwright" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
|
||||
Reference in New Issue
Block a user