Files
myAi/Jobs/cv-search-job/Services/HtmlJobSearcher.cs
T
claude f7d856147e Escalate provider fetch failures to Error for alert emails
HTTP and Playwright fetch failures in HtmlJobSearcher are now logged at
Error so that Serilog's email sink triggers an alert when a job provider
is unreachable. Per-URL match failures remain at Warning (expected noise).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-01 22:03:46 +03:00

177 lines
6.6 KiB
C#

using System.Text.RegularExpressions;
using System.Web;
using CvMatcher.Models.Settings;
using Microsoft.Playwright;
using Microsoft.Extensions.Logging;
namespace CvSearchJob.Services;
/// <summary>
/// Config-driven HTML scraper that fetches a provider's job listing page and extracts matching job URLs.
/// Uses a two-stage anchor filter: href must contain the provider's link pattern, and anchor text must
/// contain at least one CV keyword.
/// Supports both plain HTTP GET (default) and headless Chromium rendering for JS-heavy SPAs.
/// </summary>
public sealed class HtmlJobSearcher
{
    /// <summary>
    /// Matches an anchor element; group 1 captures the quoted href value, group 2 the inner HTML.
    /// Built once instead of on every search call.
    /// </summary>
    private static readonly Regex AnchorPattern = new(
        @"<a[^>]+href=[""']([^""']+)[""'][^>]*>(.*?)</a>",
        RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled);

    /// <summary>Matches any markup tag so nested tags can be stripped from anchor text.</summary>
    private static readonly Regex TagPattern = new("<[^>]+>", RegexOptions.Compiled);

    private readonly HttpClient _http;
    private readonly ILogger<HtmlJobSearcher> _logger;

    /// <summary>
    /// Stores the dependencies and configures the supplied client with a 20 second timeout and a
    /// fixed User-Agent header.
    /// </summary>
    /// <param name="http">Client used for plain HTTP fetches; its timeout and default headers are modified here.</param>
    /// <param name="logger">Logger for progress, diagnostics and fetch failures.</param>
    public HtmlJobSearcher(HttpClient http, ILogger<HtmlJobSearcher> logger)
    {
        _http = http;
        _logger = logger;

        // NOTE(review): this mutates the injected client; presumably each instance receives its own
        // HttpClient (e.g. a typed client) - confirm against the DI registration.
        _http.Timeout = TimeSpan.FromSeconds(20);
        _http.DefaultRequestHeaders.UserAgent.ParseAdd("Mozilla/5.0 (compatible; MyAi.ro CV-Search/1.0)");
    }

    /// <summary>
    /// Fetches the provider's search result page for the combined initial + CV keywords, parses all anchor
    /// tags, applies the two-stage filter, and returns up to <see cref="JobProviderConfig.MaxResults"/> absolute URLs.
    /// Returns an empty list when the fetch fails rather than throwing.
    /// </summary>
    /// <param name="provider">Provider configuration: search URL template, link pattern, fetch mode and result cap.</param>
    /// <param name="cvKeywords">Keywords extracted from the CV; blank entries are ignored.</param>
    /// <param name="ct">Cancellation token for the fetch.</param>
    /// <returns>Distinct absolute http/https job URLs with query string and fragment removed.</returns>
    public async Task<IReadOnlyList<string>> SearchJobUrlsAsync(
        JobProviderConfig provider,
        IReadOnlyList<string> cvKeywords,
        CancellationToken ct)
    {
        var allKeywords = provider.InitialKeywords
            .Concat(cvKeywords)
            .Where(k => !string.IsNullOrWhiteSpace(k))
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();

        if (allKeywords.Count == 0)
        {
            _logger.LogWarning("Provider {Provider}: no keywords available (CV keywords empty, InitialKeywords empty), skipping", provider.Name);
            return [];
        }

        var keywordsEncoded = HttpUtility.UrlEncode(string.Join(" ", allKeywords));
        var searchUrl = provider.SearchUrlTemplate.Replace("{keywords}", keywordsEncoded);

        _logger.LogInformation(
            "Provider {Provider}: fetching {Url} [{Mode}] | CV keywords: [{Keywords}]",
            provider.Name, searchUrl,
            provider.UseHeadlessBrowser ? "headless" : "http",
            string.Join(", ", cvKeywords));

        var html = provider.UseHeadlessBrowser
            ? await FetchWithPlaywrightAsync(provider.Name, searchUrl, ct)
            : await FetchWithHttpAsync(provider.Name, searchUrl, ct);

        if (html is null)
        {
            return [];
        }

        _logger.LogInformation("Provider {Provider}: received {Length} chars of HTML", provider.Name, html.Length);

        // Blank keywords must not take part in stage 2: string.Contains("") is always true, so a
        // single empty entry would let every anchor through the keyword filter.
        var matchKeywords = cvKeywords
            .Where(k => !string.IsNullOrWhiteSpace(k))
            .ToList();

        var baseUri = new Uri(searchUrl);
        var results = new List<string>();
        var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
        var allAnchors = AnchorPattern.Matches(html);
        var stage1Pass = 0;
        var stage2Pass = 0;

        foreach (Match match in allAnchors)
        {
            if (results.Count >= provider.MaxResults)
            {
                break;
            }

            // Attribute values and text are entity-encoded in HTML ("&amp;", "&#537;", "&nbsp;"),
            // so decode before comparing. Tags are stripped first so that encoded literal angle
            // brackets in the text are not mistaken for markup.
            var href = HttpUtility.HtmlDecode(match.Groups[1].Value).Trim();
            var anchorText = HttpUtility.HtmlDecode(TagPattern.Replace(match.Groups[2].Value, " ")).Trim();

            // Stage 1: the link itself must look like a job posting for this provider.
            if (!href.Contains(provider.JobLinkContains, StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            stage1Pass++;

            // Stage 2: the visible link text must mention at least one CV keyword.
            if (!matchKeywords.Any(k => anchorText.Contains(k, StringComparison.OrdinalIgnoreCase)))
            {
                _logger.LogDebug(
                    "Provider {Provider}: stage-2 reject | href={Href} | text={Text}",
                    provider.Name, href, anchorText.Length > 100 ? anchorText[..100] : anchorText);
                continue;
            }

            stage2Pass++;

            // Always resolve against the search URL. Parsing the href with UriKind.Absolute first
            // turns root-relative links such as "/jobs/1" into file:// URIs on Unix, and
            // protocol-relative links ("//host/x") into UNC paths. The two-argument overload
            // returns an already-absolute href unchanged.
            if (!Uri.TryCreate(baseUri, href, out var absoluteUri))
            {
                continue;
            }

            // Skip mailto:, javascript:, file: and similar links that merely contain the pattern.
            if (absoluteUri.Scheme != Uri.UriSchemeHttp && absoluteUri.Scheme != Uri.UriSchemeHttps)
            {
                continue;
            }

            // Drop query string and fragment so the same posting is not returned twice.
            var url = absoluteUri.GetLeftPart(UriPartial.Path);
            if (seen.Add(url))
            {
                results.Add(url);
            }
        }

        _logger.LogInformation(
            "Provider {Provider}: {TotalAnchors} anchors found | {Stage1} passed href filter ('{LinkPattern}') | {Stage2} passed keyword filter | {Unique} unique URLs returned",
            provider.Name, allAnchors.Count, stage1Pass, provider.JobLinkContains, stage2Pass, results.Count);

        return results;
    }

    /// <summary>
    /// Downloads <paramref name="url"/> with the injected <see cref="HttpClient"/>.
    /// </summary>
    /// <returns>The response body, or <c>null</c> when the request failed or was cancelled.</returns>
    private async Task<string?> FetchWithHttpAsync(string providerName, string url, CancellationToken ct)
    {
        try
        {
            return await _http.GetStringAsync(url, ct);
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // Caller-requested cancellation is not a provider outage, so it must not be logged at
            // Error. A client-side timeout leaves ct uncancelled and falls through to the handler below.
            _logger.LogInformation("Provider {Provider}: HTTP fetch cancelled for {Url}", providerName, url);
            return null;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Provider {Provider}: HTTP fetch failed for {Url}", providerName, url);
            return null;
        }
    }

    /// <summary>
    /// Renders <paramref name="url"/> in headless Chromium and returns the resulting page HTML.
    /// If waiting for network idle times out, whatever has rendered so far is returned.
    /// </summary>
    /// <returns>The page HTML, or <c>null</c> when navigation failed, returned HTTP 400+, or was cancelled before start.</returns>
    private async Task<string?> FetchWithPlaywrightAsync(string providerName, string url, CancellationToken ct)
    {
        // The Playwright calls below take no cancellation token, so the best that can be done is to
        // avoid launching a browser for a request that is already cancelled.
        if (ct.IsCancellationRequested)
        {
            _logger.LogInformation("Provider {Provider}: Playwright fetch cancelled for {Url}", providerName, url);
            return null;
        }

        try
        {
            using var playwright = await Playwright.CreateAsync();
            await using var browser = await playwright.Chromium.LaunchAsync(new BrowserTypeLaunchOptions
            {
                Headless = true,
                Args = ["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"]
            });

            var page = await browser.NewPageAsync();
            IResponse? response;

            try
            {
                response = await page.GotoAsync(url, new PageGotoOptions
                {
                    WaitUntil = WaitUntilState.NetworkIdle,
                    Timeout = 30_000
                });
            }
            catch (TimeoutException)
            {
                // NetworkIdle timed out — use whatever content rendered so far
                _logger.LogWarning("Provider {Provider}: Playwright NetworkIdle timeout for {Url}, using partial content", providerName, url);
                return await page.ContentAsync();
            }

            if (response is null || response.Status >= 400)
            {
                // Logged at Error to match the plain HTTP path, where a non-success status surfaces
                // as an exception and is logged at Error.
                _logger.LogError("Provider {Provider}: Playwright got HTTP {Status} for {Url}", providerName, response?.Status, url);
                return null;
            }

            return await page.ContentAsync();
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Provider {Provider}: Playwright fetch failed for {Url}", providerName, url);
            return null;
        }
    }
}