Merge branch 'main' into staging
Build and Push Docker Images Staging / build (push) Successful in 28m0s

This commit is contained in:
2026-06-08 18:37:00 +03:00
44 changed files with 1148 additions and 129 deletions
+10 -1
View File
@@ -15,6 +15,7 @@ env:
WEB_IMAGE: apps/myai-web
CV_CLEANUP_JOB_IMAGE: apps/myai-cv-cleanup-job
CV_SEARCH_JOB_IMAGE: apps/myai-cv-search-job
PAGE_FETCHER_API_IMAGE: apps/myai-page-fetcher-api
IMAGE_TAG: staging
jobs:
@@ -62,6 +63,10 @@ jobs:
run: |
docker build -f Jobs/cv-search-job/Dockerfile -t "${REGISTRY_HOST}/${CV_SEARCH_JOB_IMAGE}:${IMAGE_TAG}" .
- name: Build Page Fetcher API image
run: |
docker build -f Apis/page-fetcher-api/Dockerfile -t "${REGISTRY_HOST}/${PAGE_FETCHER_API_IMAGE}:${IMAGE_TAG}" .
- name: Push API image
run: |
docker push "${REGISTRY_HOST}/${API_IMAGE}:${IMAGE_TAG}"
@@ -88,4 +93,8 @@ jobs:
- name: Push CV search job image
run: |
docker push "${REGISTRY_HOST}/${CV_SEARCH_JOB_IMAGE}:${IMAGE_TAG}"
docker push "${REGISTRY_HOST}/${CV_SEARCH_JOB_IMAGE}:${IMAGE_TAG}"
- name: Push Page Fetcher API image
run: |
docker push "${REGISTRY_HOST}/${PAGE_FETCHER_API_IMAGE}:${IMAGE_TAG}"
+2 -2
View File
@@ -5,8 +5,8 @@ using Email.Data;
using Email.Data.Repositories;
using Email.Data.Repositories.Contracts;
using Email.Data.Services;
using EmailApi.Models.Clients;
using EmailApi.Models.Settings;
using Email.Models.Clients;
using Email.Models.Settings;
using Microsoft.EntityFrameworkCore;
using Models.Settings;
using MyAi.Data;
+2 -2
View File
@@ -1,8 +1,8 @@
using Api.Services.Contracts;
using CvMatcher.Models.Responses;
using Email.Data.Services;
using EmailApi.Models.Clients;
using EmailApi.Models.Requests;
using Email.Models.Clients;
using Email.Models.Requests;
using Microsoft.Extensions.Options;
using Models.Requests;
using Models.Settings;
@@ -0,0 +1,11 @@
namespace Common.Settings;
/// <summary>
/// Options describing how a caller reaches the internal page-fetcher-api service.
/// Populated from the <c>PageFetcherApi</c> configuration section.
/// </summary>
public sealed class PageFetcherApiSettings
{
    /// <summary>
    /// Root address of the page-fetcher-api service. Empty until configured.
    /// The Refit client registration trims any trailing slash and appends exactly one.
    /// </summary>
    public string BaseUrl { get; set; } = "";

    /// <summary>
    /// Shared key sent as the <c>X-Internal-Api-Key</c> request header.
    /// The header is only added when this value is not blank.
    /// </summary>
    public string InternalApiKey { get; set; } = "";
}
+2
View File
@@ -8,6 +8,7 @@ COPY Apis/cv-search-data/cv-search-data.csproj Apis/cv-search-data/
COPY Apis/cv-matcher-data/cv-matcher-data.csproj Apis/cv-matcher-data/
COPY Apis/common/common.csproj Apis/common/
COPY Apis/cv-matcher-api-models/cv-matcher-api-models.csproj Apis/cv-matcher-api-models/
COPY Apis/page-fetcher-api-models/page-fetcher-api-models.csproj Apis/page-fetcher-api-models/
COPY Apis/myai-data/myai-data.csproj Apis/myai-data/
COPY Apis/shared-data/shared-data.csproj Apis/shared-data/
COPY Helpers/common-helpers/common-helpers.csproj Helpers/common-helpers/
@@ -20,6 +21,7 @@ COPY Apis/cv-search-data/ Apis/cv-search-data/
COPY Apis/cv-matcher-data/ Apis/cv-matcher-data/
COPY Apis/common/ Apis/common/
COPY Apis/cv-matcher-api-models/ Apis/cv-matcher-api-models/
COPY Apis/page-fetcher-api-models/ Apis/page-fetcher-api-models/
COPY Apis/myai-data/ Apis/myai-data/
COPY Apis/shared-data/ Apis/shared-data/
COPY Helpers/common-helpers/ Helpers/common-helpers/
+12 -1
View File
@@ -13,6 +13,7 @@ using Microsoft.EntityFrameworkCore;
using Refit;
using Serilog;
using Common.Settings;
using PageFetcher.Models;
using StartupHelpers;
using System.Reflection;
@@ -36,6 +37,16 @@ try
builder.Services.Configure<CvMatcher.Models.Settings.AiSettings>(builder.Configuration.GetSection("Ai"));
builder.Services.Configure<MatcherSettings>(builder.Configuration.GetSection("Matcher"));
builder.Services.Configure<JobSearchSettings>(builder.Configuration.GetSection("JobSearch"));
builder.Services.Configure<PageFetcherApiSettings>(builder.Configuration.GetSection("PageFetcherApi"));
builder.Services.AddRefitClient<IPageFetcherApiClient>()
.ConfigureHttpClient((sp, c) =>
{
var settings = sp.GetRequiredService<Microsoft.Extensions.Options.IOptions<PageFetcherApiSettings>>().Value;
c.BaseAddress = new Uri(settings.BaseUrl.TrimEnd('/') + "/");
if (!string.IsNullOrWhiteSpace(settings.InternalApiKey))
c.DefaultRequestHeaders.Add("X-Internal-Api-Key", settings.InternalApiKey);
});
builder.Services.AddRefitClient<IRefitRagApi>()
.ConfigureHttpClient((sp, c) =>
@@ -50,7 +61,7 @@ try
builder.Services.AddScoped<IRagApiClient, RagApiClient>();
builder.Services.AddHttpClient<IMatcherAiClient, MatcherAiClient>();
builder.Services.AddHttpClient<IJobTextExtractor, JobTextExtractor>();
builder.Services.AddScoped<IJobTextExtractor, JobTextExtractor>();
builder.Services.AddDbContext<CvMatcherDbContext>(options =>
{
@@ -1,26 +1,23 @@
using System.Net;
using System.Text.RegularExpressions;
using CvMatcher.Models.Settings;
using Api.Services.Contracts;
using Microsoft.Extensions.Options;
using PageFetcher.Models;
namespace Api.Services;
/// <summary>
/// Extracts normalised plain text from a job posting, either from a pasted description or by
/// fetching and stripping the HTML of the job page URL.
/// fetching the job page text via <c>page-fetcher-api</c> (headless Chromium rendering).
/// </summary>
public sealed class JobTextExtractor : IJobTextExtractor
{
private readonly HttpClient _http;
private readonly IPageFetcherApiClient _pageFetcher;
private readonly MatcherSettings _settings;
public JobTextExtractor(HttpClient http, IOptions<MatcherSettings> options)
public JobTextExtractor(IPageFetcherApiClient pageFetcher, IOptions<MatcherSettings> options)
{
_http = http;
_pageFetcher = pageFetcher;
_settings = options.Value;
_http.Timeout = TimeSpan.FromSeconds(25);
_http.DefaultRequestHeaders.UserAgent.ParseAdd("MyAi.ro CV Matcher/1.0");
}
/// <inheritdoc />
@@ -31,15 +28,18 @@ public sealed class JobTextExtractor : IJobTextExtractor
if (string.IsNullOrWhiteSpace(jobUrl)) return string.Empty;
if (!Uri.TryCreate(jobUrl, UriKind.Absolute, out var uri) || uri.Scheme is not ("http" or "https"))
{
throw new InvalidOperationException("Invalid job URL.");
}
var html = await _http.GetStringAsync(uri, ct);
html = Regex.Replace(html, "<script[\\s\\S]*?</script>", " ", RegexOptions.IgnoreCase);
html = Regex.Replace(html, "<style[\\s\\S]*?</style>", " ", RegexOptions.IgnoreCase);
html = Regex.Replace(html, "<[^>]+>", " ");
return Limit(Normalize(WebUtility.HtmlDecode(html)));
var response = await _pageFetcher.FetchAsync(new FetchPageRequest
{
Url = jobUrl,
CallerService = "cv-matcher-api"
}, ct);
if (!response.Success)
throw new InvalidOperationException($"Failed to fetch job page: {response.Error}");
return Limit(Normalize(response.Text));
}
/// <summary>Truncates text to the configured maximum character count.</summary>
@@ -82,6 +82,7 @@
<ProjectReference Include="..\cv-search-data\cv-search-data.csproj" />
<ProjectReference Include="..\cv-matcher-data\cv-matcher-data.csproj" />
<ProjectReference Include="..\common\common.csproj" />
<ProjectReference Include="..\page-fetcher-api-models\page-fetcher-api-models.csproj" />
<ProjectReference Include="..\..\Helpers\startup-helpers\startup-helpers.csproj" />
</ItemGroup>
</Project>
@@ -1,7 +1,7 @@
using EmailApi.Models.Requests;
using Email.Models.Requests;
using Refit;
namespace EmailApi.Models.Clients;
namespace Email.Models.Clients;
public interface IEmailApiClient
{
@@ -1,4 +1,4 @@
namespace EmailApi.Models.Requests;
namespace Email.Models.Requests;
public sealed class SendEmailRequest
{
@@ -1,4 +1,4 @@
namespace EmailApi.Models.Settings;
namespace Email.Models.Settings;
public sealed class EmailApiSettings
{
@@ -1,4 +1,4 @@
namespace Models.Settings;
namespace Email.Models.Settings;
public sealed class SmtpSettings
{
@@ -1,9 +1,9 @@
using EmailApi.Models.Requests;
using EmailApi.Services;
using Api.Services;
using Email.Models.Requests;
using Microsoft.AspNetCore.Mvc;
using Swashbuckle.AspNetCore.Annotations;
namespace EmailApi.Controllers;
namespace Api.Controllers;
/// <summary>
/// Internal email relay. Accepts an HTML body fragment from trusted callers
+2 -1
View File
@@ -3,8 +3,9 @@ using Email.Data;
using Email.Data.Repositories;
using Email.Data.Repositories.Contracts;
using Email.Data.Services;
using EmailApi.Services;
using Api.Services;
using Microsoft.EntityFrameworkCore;
using Email.Models.Settings;
using Models.Settings;
using Serilog;
using StartupHelpers;
@@ -1,12 +1,13 @@
using Email.Data.Services;
using EmailApi.Models.Requests;
using Email.Models.Requests;
using MailKit.Net.Smtp;
using MailKit.Security;
using Microsoft.Extensions.Options;
using MimeKit;
using Email.Models.Settings;
using Models.Settings;
namespace EmailApi.Services;
namespace Api.Services;
/// <summary>
/// Wraps an HTML body fragment in the branded HTML shell and sends the resulting email via SMTP using MailKit.
@@ -0,0 +1,20 @@
namespace PageFetcher.Models;
/// <summary>
/// Payload sent to the page-fetcher-api to have a web page rendered and returned.
/// </summary>
public sealed class FetchPageRequest
{
    /// <summary>Address of the page to fetch; must be an absolute HTTP or HTTPS URL.</summary>
    public string Url { get; set; } = "";

    /// <summary>
    /// Playwright wait condition to apply during navigation.
    /// Accepted values: <c>networkidle</c> (default), <c>domcontentloaded</c>, <c>load</c>.
    /// </summary>
    public string WaitFor { get; set; } = "networkidle";

    /// <summary>
    /// Name of the service issuing the request, kept for audit purposes
    /// (e.g. <c>cv-matcher-api</c>, <c>cv-search-job</c>).
    /// </summary>
    public string CallerService { get; set; } = "";
}
@@ -0,0 +1,25 @@
namespace PageFetcher.Models;
/// <summary>
/// Outcome of a single page-fetch operation.
/// </summary>
public sealed class FetchPageResponse
{
    /// <summary>Address the browser ended up on, i.e. after any redirects.</summary>
    public string Url { get; set; } = "";

    /// <summary>
    /// HTTP status code of the navigation response. <c>0</c> when no response was obtained,
    /// which covers both a failed fetch and a navigation timeout that still yielded partial content.
    /// </summary>
    public int StatusCode { get; set; }

    /// <summary>Complete rendered HTML as reported by Playwright.</summary>
    public string Html { get; set; } = "";

    /// <summary>Plain text derived from <see cref="Html"/> (script/style removed, whitespace normalised).</summary>
    public string Text { get; set; } = "";

    /// <summary>
    /// <c>true</c> when page content was obtained; <c>false</c> when the fetch threw.
    /// A navigation timeout is reported as success with whatever content had loaded.
    /// </summary>
    public bool Success { get; set; }

    /// <summary>Exception message describing the failure when <see cref="Success"/> is <c>false</c>.</summary>
    public string? Error { get; set; }
}
@@ -0,0 +1,16 @@
using Refit;
namespace PageFetcher.Models;
/// <summary>
/// Refit client for the internal page-fetcher-api service.
/// All calls require the <c>X-Internal-Api-Key</c> header, configured at registration time.
/// </summary>
public interface IPageFetcherApiClient
{
/// <summary>
/// Fetches a web page via headless Chromium and returns the rendered HTML and extracted plain text.
/// Issues <c>POST /api/page/fetch</c> with <paramref name="request"/> as the request body.
/// </summary>
/// <param name="request">URL to fetch plus the wait condition and the calling service's name.</param>
/// <param name="ct">Token used to cancel the HTTP call.</param>
/// <returns>
/// The fetch result; callers should check <see cref="FetchPageResponse.Success"/> before using the content.
/// NOTE(review): non-success HTTP statuses (e.g. the controller's 400 for an invalid URL) presumably
/// surface as a Refit exception rather than a response object — confirm against Refit's Task&lt;T&gt; contract.
/// </returns>
[Post("/api/page/fetch")]
Task<FetchPageResponse> FetchAsync([Body] FetchPageRequest request, CancellationToken ct = default);
}
@@ -0,0 +1,17 @@
namespace PageFetcher.Models.Settings;
/// <summary>
/// Tunable runtime options of the page-fetcher service.
/// Populated from the <c>PageFetcher</c> configuration section.
/// </summary>
public sealed class PageFetcherSettings
{
    /// <summary>
    /// Playwright wait condition used by default
    /// (<c>networkidle</c>, <c>load</c>, <c>domcontentloaded</c>).
    /// </summary>
    public string DefaultWaitFor { get; set; } = "networkidle";

    /// <summary>How long a page navigation may take, in seconds, before it times out.</summary>
    public int TimeoutSeconds { get; set; } = 30;

    /// <summary>
    /// Upper bound on the number of characters kept in the extracted text.
    /// The text extractor treats any value below 4,000 as 4,000.
    /// </summary>
    public int MaxTextChars { get; set; } = 60_000;
}
@@ -0,0 +1,12 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<AssemblyName>page-fetcher-api-models</AssemblyName>
<RootNamespace>PageFetcher.Models</RootNamespace>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Refit.HttpClientFactory" />
</ItemGroup>
</Project>
@@ -0,0 +1,47 @@
using Api.Services;
using Microsoft.AspNetCore.Mvc;
using PageFetcher.Models;
using Swashbuckle.AspNetCore.Annotations;
namespace Api.Controllers;
/// <summary>
/// HTTP entry point for page-fetch requests: validates the requested URL and delegates to
/// <see cref="PageFetcherService"/>, which renders the page with Playwright.
/// </summary>
[ApiController]
[Route("api/page")]
public sealed class PageController : ControllerBase
{
    private readonly PageFetcherService _service;
    private readonly ILogger<PageController> _logger;

    public PageController(PageFetcherService service, ILogger<PageController> logger)
    {
        _service = service;
        _logger = logger;
    }

    /// <summary>
    /// Renders a web page in headless Chromium and returns its HTML together with the extracted plain text.
    /// </summary>
    /// <param name="request">URL to fetch, wait condition and caller name.</param>
    /// <param name="ct">Request cancellation token, forwarded to the service.</param>
    /// <returns>
    /// <c>200</c> with the service's <see cref="FetchPageResponse"/>, or <c>400</c> when the URL is
    /// missing or is not an absolute HTTP(S) URL.
    /// </returns>
    // NOTE(review): only the scheme is validated here; loopback/private hosts are not rejected.
    // Confirm that is acceptable for URLs that originate from end users.
    [HttpPost("fetch")]
    [SwaggerOperation(Summary = "Fetch a web page", Description = "Navigates to the given URL using Playwright, returns rendered HTML and stripped plain text.")]
    [SwaggerResponse(StatusCodes.Status200OK, "Page fetched successfully", typeof(FetchPageResponse))]
    [SwaggerResponse(StatusCodes.Status400BadRequest, "Invalid or non-HTTP(S) URL")]
    public async Task<ActionResult<FetchPageResponse>> Fetch([FromBody] FetchPageRequest request, CancellationToken ct)
    {
        if (string.IsNullOrWhiteSpace(request.Url))
        {
            return BadRequest(new { Error = "Url is required." });
        }

        if (!IsAbsoluteHttpUrl(request.Url))
        {
            return BadRequest(new { Error = "Url must be an absolute HTTP or HTTPS URL." });
        }

        _logger.LogInformation(
            "Fetch request: {Url} | caller={Caller} | waitFor={WaitFor}",
            request.Url,
            request.CallerService,
            request.WaitFor);

        FetchPageResponse fetched = await _service.FetchAsync(request, ct);
        return Ok(fetched);
    }

    /// <summary>Tells whether <paramref name="candidate"/> parses as an absolute URI with the http or https scheme.</summary>
    private static bool IsAbsoluteHttpUrl(string candidate)
    {
        if (!Uri.TryCreate(candidate, UriKind.Absolute, out var parsed))
        {
            return false;
        }

        return parsed.Scheme == Uri.UriSchemeHttp || parsed.Scheme == Uri.UriSchemeHttps;
    }
}
+50
View File
@@ -0,0 +1,50 @@
# ---- Build stage: restore, publish, and download the Chromium browser ----
FROM mcr.microsoft.com/dotnet/sdk:10.0 AS build
ARG BUILD_CONFIGURATION=Release
WORKDIR /src
# Copy only the package manifest and project files first, so the restore step below
# can be served from the layer cache until one of these files changes.
COPY Directory.Packages.props ./
COPY Apis/page-fetcher-api/page-fetcher-api.csproj Apis/page-fetcher-api/
COPY Apis/page-fetcher-data/page-fetcher-data.csproj Apis/page-fetcher-data/
COPY Apis/page-fetcher-api-models/page-fetcher-api-models.csproj Apis/page-fetcher-api-models/
COPY Apis/common/common.csproj Apis/common/
COPY Apis/shared-data/shared-data.csproj Apis/shared-data/
COPY Helpers/startup-helpers/startup-helpers.csproj Helpers/startup-helpers/
COPY Helpers/common-helpers/common-helpers.csproj Helpers/common-helpers/
RUN dotnet restore Apis/page-fetcher-api/page-fetcher-api.csproj
# Now bring in the full sources of the API and every project it references.
COPY Apis/page-fetcher-api/ Apis/page-fetcher-api/
COPY Apis/page-fetcher-data/ Apis/page-fetcher-data/
COPY Apis/page-fetcher-api-models/ Apis/page-fetcher-api-models/
COPY Apis/common/ Apis/common/
COPY Apis/shared-data/ Apis/shared-data/
COPY Helpers/startup-helpers/ Helpers/startup-helpers/
COPY Helpers/common-helpers/ Helpers/common-helpers/
RUN dotnet publish Apis/page-fetcher-api/page-fetcher-api.csproj -c $BUILD_CONFIGURATION -o /app/publish /p:UseAppHost=false
# Download Playwright Chromium browser in the build stage.
# Node.js is only needed here to run npx — it is not copied to the final image.
# NOTE(review): the playwright version pinned below presumably has to match the Microsoft.Playwright
# NuGet version resolved through Directory.Packages.props, otherwise the .NET driver may look for a
# different Chromium build under PLAYWRIGHT_BROWSERS_PATH — confirm when bumping either one.
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
RUN apt-get update && apt-get install -y --no-install-recommends nodejs npm \
&& npx --yes playwright@1.60.0 install chromium \
&& rm -rf /var/lib/apt/lists/*
# ---- Runtime stage: ASP.NET runtime + Chromium's system libraries + the published app ----
FROM mcr.microsoft.com/dotnet/aspnet:10.0 AS final
WORKDIR /app
# System libraries required by Chromium.
# NOTE(review): package names such as libasound2t64 are post-t64-transition names
# (Ubuntu 24.04 / Debian trixie), not Debian bookworm ones — confirm which distro the
# aspnet:10.0 base image uses before editing this list.
RUN apt-get update && apt-get install -y --no-install-recommends \
libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 \
libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 \
libgbm1 libasound2t64 libpango-1.0-0 libcairo2 libatspi2.0-0 \
libwayland-client0 libx11-xcb1 libx11-6 libxcb1 libxext6 \
&& rm -rf /var/lib/apt/lists/*
# Copy the Playwright Chromium browser from the build stage
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
COPY --from=build /ms-playwright /ms-playwright
COPY --from=build /app/publish .
ENTRYPOINT ["dotnet", "page-fetcher-api.dll"]
+75
View File
@@ -0,0 +1,75 @@
using System.Reflection;
using Microsoft.EntityFrameworkCore;
using PageFetcher.Data;
using Api.Services;
using PageFetcher.Models.Settings;
using Serilog;
using StartupHelpers;
// Entry point of the page-fetcher-api service: configures logging, settings, the database context,
// the shared Playwright browser and the HTTP pipeline, applies EF Core migrations, then runs the host.
// The .env file is loaded before the builder is created so its values are visible to configuration.
StartupExtensions.LoadDotEnvFile();
const string ServiceName = "page-fetcher-api";
var appVersion = StartupExtensions.GetApplicationVersion(Assembly.GetExecutingAssembly());
try
{
var builder = WebApplication.CreateBuilder(args);
builder.ConfigureJsonSerilog(ServiceName, appVersion);
Log.Information("Starting {Service} version {AppVersion}", ServiceName, appVersion);
builder.AddAzureKeyVaultIfConfigured();
// Bind the "PageFetcher" section (wait condition, timeout, max text length).
builder.Services.Configure<PageFetcherSettings>(builder.Configuration.GetSection("PageFetcher"));
// SQL Server context; migrations live in the page-fetcher-data assembly and their history is
// tracked in the context's own schema/table.
builder.Services.AddDbContext<PageFetchDbContext>(options =>
{
var connectionString = builder.Services.GetConfiguredDbConnectionString(builder.Configuration);
options.UseSqlServer(connectionString, sql =>
{
sql.MigrationsHistoryTable(PageFetchDbContext.MigrationTableName, PageFetchDbContext.SchemaName);
sql.MigrationsAssembly("page-fetcher-data");
});
});
// Playwright browser: singleton hosted service, shared across all requests
// The hosted-service registration resolves the same singleton, so the instance that is started
// by the host is the one injected into PageFetcherService.
builder.Services.AddSingleton<PlaywrightBrowserService>();
builder.Services.AddHostedService(sp => sp.GetRequiredService<PlaywrightBrowserService>());
builder.Services.AddScoped<PageFetcherService>();
builder.Services.AddControllers();
builder.Services.AddSwaggerWithXmlComments(Assembly.GetExecutingAssembly(), "Page Fetcher API");
var app = builder.Build();
app.LogStartupDiagnostics(ServiceName);
// Middleware order matters: request logging and exception handling wrap everything that follows.
app.UseDefaultSerilogRequestLogging();
app.UseJsonExceptionHandler(ServiceName);
// NOTE(review): presumably enforces the X-Internal-Api-Key header on incoming requests — confirm in startup-helpers.
app.UseInternalApiKeyProtection();
app.UseSwaggerInDevelopment("Page Fetcher API", "PageFetcherAPI");
app.UseRouting();
app.UseAuthorization();
app.MapControllers();
// Migrations are applied synchronously before the host starts accepting requests.
Log.Information("Running EF Core migrations if any");
using (var scope = app.Services.CreateScope())
{
var db = scope.ServiceProvider.GetRequiredService<PageFetchDbContext>();
db.Database.Migrate();
}
Log.Information("{Service} startup complete. Listening for requests...", ServiceName);
app.Run();
}
catch (Exception ex)
{
// NOTE(review): the exception is logged and swallowed, so the process presumably exits with
// code 0 even after a fatal startup failure — confirm that is what the orchestrator expects.
Log.Fatal(ex, "{Service} terminated unexpectedly", ServiceName);
}
finally
{
// Flush buffered log events before the process ends.
Log.Information("Shutting down {Service}", ServiceName);
Log.CloseAndFlush();
}
@@ -0,0 +1,12 @@
{
"profiles": {
"page-fetcher-api": {
"commandName": "Project",
"launchBrowser": true,
"environmentVariables": {
"ASPNETCORE_ENVIRONMENT": "Development"
},
"applicationUrl": "https://localhost:50268;http://localhost:50269"
}
}
}
@@ -0,0 +1,144 @@
using System.Diagnostics;
using System.Net;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Options;
using Microsoft.Playwright;
using PageFetcher.Data;
using PageFetcher.Data.Entities;
using PageFetcher.Models;
using PageFetcher.Models.Settings;
namespace Api.Services;
/// <summary>
/// Fetches a web page via Playwright, extracts plain text, persists an audit record to the database,
/// and returns a <see cref="FetchPageResponse"/>.
/// </summary>
public sealed class PageFetcherService
{
    // Column widths of the PageFetches table (nvarchar(2000) / nvarchar(64) / nvarchar(2000) in the
    // InitialSchema migration). Values are clipped to these widths before saving so that an over-long
    // URL, caller name or exception message does not get the insert rejected by the database.
    private const int MaxUrlLength = 2_000;
    private const int MaxCallerServiceLength = 64;
    private const int MaxErrorMessageLength = 2_000;

    // The extracted text is never truncated below this length, whatever MaxTextChars is configured to.
    private const int MinTextChars = 4_000;

    private readonly PlaywrightBrowserService _browserService;
    private readonly PageFetchDbContext _db;
    private readonly PageFetcherSettings _settings;
    private readonly ILogger<PageFetcherService> _logger;

    public PageFetcherService(
        PlaywrightBrowserService browserService,
        PageFetchDbContext db,
        IOptions<PageFetcherSettings> settings,
        ILogger<PageFetcherService> logger)
    {
        _browserService = browserService;
        _db = db;
        _settings = settings.Value;
        _logger = logger;
    }

    /// <summary>
    /// Fetches the page at the URL given in <paramref name="request"/> using Playwright, saves the fetch
    /// record, and returns the HTML and extracted text.
    /// Returns a failed response (with <see cref="FetchPageResponse.Success"/> = false) rather than throwing
    /// on network or navigation errors. A navigation timeout is not treated as a failure: whatever content
    /// the page had rendered is returned with <see cref="FetchPageResponse.StatusCode"/> = 0.
    /// </summary>
    /// <param name="request">URL, wait condition and caller name. A blank wait condition falls back to
    /// <see cref="PageFetcherSettings.DefaultWaitFor"/>.</param>
    /// <param name="ct">Cancellation token applied to the database save.</param>
    /// <returns>The fetch outcome, successful or not.</returns>
    /// <remarks>Exceptions raised while saving the audit record are not caught and propagate to the caller.</remarks>
    public async Task<FetchPageResponse> FetchAsync(FetchPageRequest request, CancellationToken ct)
    {
        var sw = Stopwatch.StartNew();
        string html = string.Empty;
        string text = string.Empty;
        int? statusCode = null;
        bool success = false;
        string? errorMessage = null;
        string finalUrl = request.Url;

        try
        {
            var page = await _browserService.Browser.NewPageAsync();
            await using var _ = page.ConfigureAwait(false);

            var waitUntil = ResolveWaitUntil(request.WaitFor);

            IResponse? response;
            try
            {
                response = await page.GotoAsync(request.Url, new PageGotoOptions
                {
                    WaitUntil = waitUntil,
                    Timeout = _settings.TimeoutSeconds * 1_000
                });
            }
            catch (TimeoutException)
            {
                // Best effort: the wait condition was not reached in time, but the page may already
                // hold usable content, so carry on and read whatever has been rendered.
                _logger.LogWarning("Playwright {WaitUntil} timeout for {Url}, using partial content", waitUntil, request.Url);
                response = null;
            }

            statusCode = response?.Status;
            finalUrl = page.Url;
            html = await page.ContentAsync();
            text = ExtractText(html);
            success = true;

            _logger.LogInformation("Fetched {Url} → HTTP {Status} | HTML {HtmlLen} chars | text {TextLen} chars | {DurationMs} ms",
                request.Url, statusCode?.ToString() ?? "timeout", html.Length, text.Length, sw.ElapsedMilliseconds);
        }
        catch (Exception ex)
        {
            errorMessage = ex.Message;
            _logger.LogError(ex, "Failed to fetch {Url}", request.Url);
        }
        finally
        {
            sw.Stop();
        }

        // Persist fetch record. Bounded columns are clipped to their widths; the response below still
        // carries the unclipped values.
        var entity = new PageFetchEntity
        {
            Id = Guid.NewGuid().ToString("N"),
            Url = Truncate(request.Url, MaxUrlLength),
            CallerService = Truncate(request.CallerService ?? string.Empty, MaxCallerServiceLength),
            HttpStatusCode = statusCode,
            Html = html,
            Text = text,
            DurationMs = sw.ElapsedMilliseconds,
            Success = success,
            ErrorMessage = errorMessage is null ? null : Truncate(errorMessage, MaxErrorMessageLength)
        };
        _db.PageFetches.Add(entity);
        await _db.SaveChangesAsync(ct);

        return new FetchPageResponse
        {
            Url = finalUrl,
            StatusCode = statusCode ?? 0,
            Html = html,
            Text = text,
            Success = success,
            Error = errorMessage
        };
    }

    /// <summary>
    /// Maps a wait-condition name to the Playwright state. A blank <paramref name="requested"/> value falls
    /// back to <see cref="PageFetcherSettings.DefaultWaitFor"/>; names are matched case-insensitively and
    /// anything unrecognised means <see cref="WaitUntilState.NetworkIdle"/>.
    /// </summary>
    private WaitUntilState ResolveWaitUntil(string? requested)
    {
        var name = string.IsNullOrWhiteSpace(requested) ? _settings.DefaultWaitFor : requested;

        if (string.Equals(name, "load", StringComparison.OrdinalIgnoreCase))
        {
            return WaitUntilState.Load;
        }

        if (string.Equals(name, "domcontentloaded", StringComparison.OrdinalIgnoreCase))
        {
            return WaitUntilState.DOMContentLoaded;
        }

        return WaitUntilState.NetworkIdle;
    }

    /// <summary>
    /// Strips script/style blocks and all HTML tags from raw HTML, normalises whitespace,
    /// and truncates to <see cref="PageFetcherSettings.MaxTextChars"/> (but never to fewer than 4,000 characters).
    /// </summary>
    private string ExtractText(string html)
    {
        if (string.IsNullOrWhiteSpace(html)) return string.Empty;

        var text = html;
        text = Regex.Replace(text, "<script[\\s\\S]*?</script>", " ", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, "<style[\\s\\S]*?</style>", " ", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, "<[^>]+>", " ");
        text = WebUtility.HtmlDecode(text);
        // Splitting on all whitespace and re-joining collapses every whitespace run into one space.
        text = string.Join(' ', text.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries)).Trim();

        return Truncate(text, Math.Max(MinTextChars, _settings.MaxTextChars));
    }

    /// <summary>
    /// Returns <paramref name="value"/> cut to at most <paramref name="maxLength"/> characters.
    /// When the cut would land between the two halves of a surrogate pair, one character fewer is kept
    /// so the result never ends in a lone high surrogate.
    /// </summary>
    private static string Truncate(string value, int maxLength)
    {
        if (value.Length <= maxLength) return value;

        var cut = char.IsHighSurrogate(value[maxLength - 1]) ? maxLength - 1 : maxLength;
        return value[..cut];
    }
}
@@ -0,0 +1,49 @@
using Microsoft.Playwright;
namespace Api.Services;
/// <summary>
/// Hosted service that owns the Playwright driver and a single headless Chromium browser.
/// The browser is launched in <see cref="StartAsync"/>, closed in <see cref="StopAsync"/>, and handed
/// out through <see cref="Browser"/> to consumers such as <see cref="PageFetcherService"/>.
/// </summary>
public sealed class PlaywrightBrowserService : IHostedService, IAsyncDisposable
{
    private readonly ILogger<PlaywrightBrowserService> _logger;
    private IPlaywright? _playwright;
    private IBrowser? _browser;

    public PlaywrightBrowserService(ILogger<PlaywrightBrowserService> logger) => _logger = logger;

    /// <summary>
    /// The launched Chromium browser.
    /// </summary>
    /// <exception cref="InvalidOperationException">Read before <see cref="StartAsync"/> has assigned the browser.</exception>
    public IBrowser Browser
    {
        get
        {
            if (_browser is null)
            {
                throw new InvalidOperationException("Browser has not been started yet.");
            }

            return _browser;
        }
    }

    /// <summary>Creates the Playwright driver and launches headless Chromium.</summary>
    public async Task StartAsync(CancellationToken cancellationToken)
    {
        _logger.LogInformation("Launching Playwright Chromium browser...");

        var launchOptions = new BrowserTypeLaunchOptions
        {
            Headless = true,
            Args = new[] { "--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage" }
        };

        _playwright = await Playwright.CreateAsync();
        _browser = await _playwright.Chromium.LaunchAsync(launchOptions);

        _logger.LogInformation("Playwright Chromium browser launched successfully.");
    }

    /// <summary>Closes the browser if one was launched.</summary>
    public async Task StopAsync(CancellationToken cancellationToken)
    {
        _logger.LogInformation("Closing Playwright Chromium browser...");

        if (_browser is { } running)
        {
            await running.CloseAsync();
        }
    }

    /// <summary>Disposes the browser (when present) and then the Playwright driver.</summary>
    public async ValueTask DisposeAsync()
    {
        if (_browser is { } launched)
        {
            await launched.DisposeAsync();
        }

        _playwright?.Dispose();
    }
}
+73
View File
@@ -0,0 +1,73 @@
{
"Serilog": {
"Using": [
"Serilog.Sinks.Console",
"Serilog.Sinks.File"
],
"MinimumLevel": {
"Default": "Information",
"Override": {
"Microsoft.AspNetCore": "Warning",
"Microsoft.AspNetCore.Hosting": "Information",
"Microsoft.AspNetCore.Routing": "Warning",
"System.Net.Http.HttpClient": "Warning",
"PageFetcherApi": "Information"
}
},
"WriteTo": [
{
"Name": "Console",
"Args": {
"outputTemplate": "[{Timestamp:HH:mm:ss} {Level:u3}] {SourceContext}: {Message:lj}{NewLine}{Exception}"
}
},
{
"Name": "File",
"Args": {
"path": "logs/page-fetcher-api-.log",
"rollingInterval": "Day",
"retainedFileCountLimit": 30,
"outputTemplate": "{Timestamp:yyyy-MM-dd HH:mm:ss.fff zzz} [{Level:u3}] {SourceContext}: {Message:lj}{NewLine}{Exception}"
}
}
],
"Enrich": [
"FromLogContext",
"WithMachineName",
"WithEnvironmentName"
]
},
"Logging": {
"LogLevel": {
"Default": "Information",
"Microsoft.AspNetCore": "Warning",
"Microsoft.AspNetCore.Hosting": "Information",
"Microsoft.AspNetCore.Routing": "Warning",
"System.Net.Http.HttpClient": "Warning",
"PageFetcherApi": "Information"
}
},
"LogEnvironmentOnStartup": true,
"AllowedHosts": "*",
"KeyVault": {
"VaultUri": "",
"Enabled": false
},
"Database": {
"Host": "localhost",
"Port": 1433,
"Name": "MyAiDb",
"User": "sa",
"Password": "",
"TrustServerCertificate": true
},
"InternalApi": {
"ApiKey": "",
"RequireApiKey": true
},
"PageFetcher": {
"DefaultWaitFor": "networkidle",
"TimeoutSeconds": 30,
"MaxTextChars": 60000
}
}
@@ -0,0 +1,34 @@
<Project Sdk="Microsoft.NET.Sdk.Web">
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<DockerDefaultTargetOS>Linux</DockerDefaultTargetOS>
<RootNamespace>PageFetcherApi</RootNamespace>
<GenerateDocumentationFile>true</GenerateDocumentationFile>
<NoWarn>$(NoWarn);1591</NoWarn>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.Playwright" />
<PackageReference Include="Microsoft.EntityFrameworkCore.SqlServer" />
<PackageReference Include="Microsoft.EntityFrameworkCore.Design">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
<PackageReference Include="Serilog.AspNetCore" />
<PackageReference Include="Serilog.Enrichers.Environment" />
<PackageReference Include="Serilog.Sinks.Console" />
<PackageReference Include="Serilog.Sinks.File" />
<PackageReference Include="Serilog.Sinks.Email" />
<PackageReference Include="Swashbuckle.AspNetCore" />
<PackageReference Include="Swashbuckle.AspNetCore.Annotations" />
<PackageReference Include="DotNetEnv" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\page-fetcher-data\page-fetcher-data.csproj" />
<ProjectReference Include="..\page-fetcher-api-models\page-fetcher-api-models.csproj" />
<ProjectReference Include="..\common\common.csproj" />
<ProjectReference Include="..\..\Helpers\startup-helpers\startup-helpers.csproj" />
<ProjectReference Include="..\..\Helpers\common-helpers\common-helpers.csproj" />
</ItemGroup>
</Project>
@@ -0,0 +1,34 @@
using Shared.Data.Entities;
namespace PageFetcher.Data.Entities;
/// <summary>
/// One row of the page-fetch audit trail written by the page-fetcher-api.
/// Keeps the complete rendered HTML and the extracted plain text of every fetch.
/// </summary>
// NOTE(review): the key and creation timestamp are presumably inherited from BaseEntity — confirm there.
public sealed class PageFetchEntity : BaseEntity
{
    /// <summary>URL the caller asked for.</summary>
    public string Url { get; set; } = "";

    /// <summary>Service that issued the fetch (e.g. <c>cv-matcher-api</c>, <c>cv-search-job</c>).</summary>
    public string CallerService { get; set; } = "";

    /// <summary>HTTP status code of the navigation response; <c>null</c> when no response was obtained.</summary>
    public int? HttpStatusCode { get; set; }

    /// <summary>Complete rendered HTML as reported by Playwright.</summary>
    public string Html { get; set; } = "";

    /// <summary>Plain text derived from the HTML (script/style removed, whitespace normalised).</summary>
    public string Text { get; set; } = "";

    /// <summary>Elapsed time of the fetch in milliseconds.</summary>
    public long DurationMs { get; set; }

    /// <summary><c>true</c> when page content was obtained; <c>false</c> when the fetch failed.</summary>
    public bool Success { get; set; }

    /// <summary>Exception message recorded when <see cref="Success"/> is <c>false</c>.</summary>
    public string? ErrorMessage { get; set; }
}
@@ -0,0 +1,8 @@
namespace PageFetcher.Data;
/// <summary>
/// Schema and migration-history table name constants for the pageFetcher EF schema.
/// Referenced by the <c>InitialSchema</c> migration when creating the schema, table and indexes.
/// </summary>
public static class MigrationConstants
{
/// <summary>Database schema that holds the page-fetcher tables.</summary>
public const string SchemaName = "pageFetcher";
/// <summary>Name of the table that records applied EF Core migrations.</summary>
public const string MigrationTableName = "_Migrations";
}
@@ -0,0 +1,82 @@
// <auto-generated />
using System;
using Microsoft.EntityFrameworkCore;
using Microsoft.EntityFrameworkCore.Infrastructure;
using Microsoft.EntityFrameworkCore.Metadata;
using Microsoft.EntityFrameworkCore.Migrations;
using Microsoft.EntityFrameworkCore.Storage.ValueConversion;
using PageFetcher.Data;
#nullable disable
namespace PageFetcher.Data.Migrations
{
[DbContext(typeof(PageFetchDbContext))]
[Migration("20260608143523_InitialSchema")]
partial class InitialSchema
{
/// <inheritdoc />
protected override void BuildTargetModel(ModelBuilder modelBuilder)
{
#pragma warning disable 612, 618
modelBuilder
.HasDefaultSchema("pageFetcher")
.HasAnnotation("ProductVersion", "10.0.7")
.HasAnnotation("Relational:MaxIdentifierLength", 128);
SqlServerModelBuilderExtensions.UseIdentityColumns(modelBuilder);
modelBuilder.Entity("PageFetcher.Data.Entities.PageFetchEntity", b =>
{
b.Property<string>("Id")
.HasMaxLength(64)
.HasColumnType("nvarchar(64)");
b.Property<string>("CallerService")
.IsRequired()
.HasMaxLength(64)
.HasColumnType("nvarchar(64)");
b.Property<DateTime>("CreatedAt")
.ValueGeneratedOnAdd()
.HasColumnType("datetime2")
.HasDefaultValueSql("SYSUTCDATETIME()");
b.Property<long>("DurationMs")
.HasColumnType("bigint");
b.Property<string>("ErrorMessage")
.HasMaxLength(2000)
.HasColumnType("nvarchar(2000)");
b.Property<string>("Html")
.IsRequired()
.HasColumnType("nvarchar(max)");
b.Property<int?>("HttpStatusCode")
.HasColumnType("int");
b.Property<bool>("Success")
.HasColumnType("bit");
b.Property<string>("Text")
.IsRequired()
.HasColumnType("nvarchar(max)");
b.Property<string>("Url")
.IsRequired()
.HasMaxLength(2000)
.HasColumnType("nvarchar(2000)");
b.HasKey("Id");
b.HasIndex("CreatedAt");
b.HasIndex("Url");
b.ToTable("PageFetches", "pageFetcher");
});
#pragma warning restore 612, 618
}
}
}
@@ -0,0 +1,59 @@
using System;
using Microsoft.EntityFrameworkCore.Migrations;
#nullable disable
namespace PageFetcher.Data.Migrations
{
/// <summary>
/// Initial migration for the page-fetcher data store: creates the <c>PageFetches</c> table
/// (in the schema named by <c>MigrationConstants.SchemaName</c>) together with its two indexes.
/// </summary>
public partial class InitialSchema : Migration
{
/// <summary>
/// Applies the migration: ensures the schema exists, creates the <c>PageFetches</c> table,
/// then adds indexes on <c>CreatedAt</c> and <c>Url</c>.
/// </summary>
/// <param name="migrationBuilder">Builder that records the schema operations to execute.</param>
protected override void Up(MigrationBuilder migrationBuilder)
{
// The schema has to exist before a table can be created inside it.
migrationBuilder.EnsureSchema(
name: MigrationConstants.SchemaName);
migrationBuilder.CreateTable(
name: "PageFetches",
schema: MigrationConstants.SchemaName,
columns: table => new
{
// String primary key, capped at 64 characters.
Id = table.Column<string>(type: "nvarchar(64)", maxLength: 64, nullable: false),
Url = table.Column<string>(type: "nvarchar(2000)", maxLength: 2000, nullable: false),
CallerService = table.Column<string>(type: "nvarchar(64)", maxLength: 64, nullable: false),
// Nullable: a row may be stored without an HTTP status code.
HttpStatusCode = table.Column<int>(type: "int", nullable: true),
// Html and Text are non-nullable, so a value (possibly empty) is always stored.
Html = table.Column<string>(type: "nvarchar(max)", nullable: false),
Text = table.Column<string>(type: "nvarchar(max)", nullable: false),
DurationMs = table.Column<long>(type: "bigint", nullable: false),
Success = table.Column<bool>(type: "bit", nullable: false),
// Nullable and capped at 2000 characters.
ErrorMessage = table.Column<string>(type: "nvarchar(2000)", maxLength: 2000, nullable: true),
// Filled in by SQL Server (UTC) when the insert does not supply a value.
CreatedAt = table.Column<DateTime>(type: "datetime2", nullable: false, defaultValueSql: "SYSUTCDATETIME()")
},
constraints: table =>
{
table.PrimaryKey("PK_PageFetches", x => x.Id);
});
migrationBuilder.CreateIndex(
name: "IX_PageFetches_CreatedAt",
schema: MigrationConstants.SchemaName,
table: "PageFetches",
column: "CreatedAt");
// NOTE(review): an nvarchar(2000) key can reach 4000 bytes, which is above SQL Server's
// 1700-byte limit for nonclustered index keys; inserts with very long URLs may be rejected.
// Confirm whether a shorter/hashed key is needed before relying on this index.
migrationBuilder.CreateIndex(
name: "IX_PageFetches_Url",
schema: MigrationConstants.SchemaName,
table: "PageFetches",
column: "Url");
}
/// <summary>
/// Reverts the migration by dropping the <c>PageFetches</c> table.
/// The schema created by <c>Up</c> is not dropped here.
/// </summary>
/// <param name="migrationBuilder">Builder that records the schema operations to execute.</param>
protected override void Down(MigrationBuilder migrationBuilder)
{
migrationBuilder.DropTable(
name: "PageFetches",
schema: MigrationConstants.SchemaName);
}
}
}
@@ -0,0 +1,79 @@
// <auto-generated />
using System;
using Microsoft.EntityFrameworkCore;
using Microsoft.EntityFrameworkCore.Infrastructure;
using Microsoft.EntityFrameworkCore.Metadata;
using Microsoft.EntityFrameworkCore.Storage.ValueConversion;
using PageFetcher.Data;
#nullable disable
namespace PageFetcher.Data.Migrations
{
// Model snapshot for PageFetchDbContext. This file is tool-generated (see the
// auto-generated marker at the top of the file); it records the current shape of the
// model so that it can be compared with the code model. Do not edit by hand.
[DbContext(typeof(PageFetchDbContext))]
partial class PageFetchDbContextModelSnapshot : ModelSnapshot
{
// Rebuilds the recorded model: default schema "pageFetcher" and a single entity,
// PageFetchEntity, mapped to the PageFetches table.
protected override void BuildModel(ModelBuilder modelBuilder)
{
#pragma warning disable 612, 618
modelBuilder
.HasDefaultSchema("pageFetcher")
.HasAnnotation("ProductVersion", "10.0.7")
.HasAnnotation("Relational:MaxIdentifierLength", 128);
SqlServerModelBuilderExtensions.UseIdentityColumns(modelBuilder);
modelBuilder.Entity("PageFetcher.Data.Entities.PageFetchEntity", b =>
{
// Primary key (see HasKey below): string, at most 64 characters.
b.Property<string>("Id")
.HasMaxLength(64)
.HasColumnType("nvarchar(64)");
b.Property<string>("CallerService")
.IsRequired()
.HasMaxLength(64)
.HasColumnType("nvarchar(64)");
// Generated on insert from the database default SYSUTCDATETIME().
b.Property<DateTime>("CreatedAt")
.ValueGeneratedOnAdd()
.HasColumnType("datetime2")
.HasDefaultValueSql("SYSUTCDATETIME()");
b.Property<long>("DurationMs")
.HasColumnType("bigint");
// Not marked required, so the column is nullable.
b.Property<string>("ErrorMessage")
.HasMaxLength(2000)
.HasColumnType("nvarchar(2000)");
b.Property<string>("Html")
.IsRequired()
.HasColumnType("nvarchar(max)");
// Nullable int: the status code is optional.
b.Property<int?>("HttpStatusCode")
.HasColumnType("int");
b.Property<bool>("Success")
.HasColumnType("bit");
b.Property<string>("Text")
.IsRequired()
.HasColumnType("nvarchar(max)");
b.Property<string>("Url")
.IsRequired()
.HasMaxLength(2000)
.HasColumnType("nvarchar(2000)");
b.HasKey("Id");
b.HasIndex("CreatedAt");
b.HasIndex("Url");
b.ToTable("PageFetches", "pageFetcher");
});
#pragma warning restore 612, 618
}
}
}
@@ -0,0 +1,45 @@
using Microsoft.EntityFrameworkCore;
using PageFetcher.Data.Entities;
namespace PageFetcher.Data;
/// <summary>
/// Entity Framework Core context for the page-fetcher audit data.
/// All objects, including the migrations history table, are placed in the schema
/// named by <see cref="SchemaName"/>; the context exposes the <c>PageFetches</c> table.
/// </summary>
public sealed class PageFetchDbContext : DbContext
{
    /// <summary>Database schema used as the default for every mapped object.</summary>
    public const string SchemaName = MigrationConstants.SchemaName;

    /// <summary>Name of the migrations history table used by this context.</summary>
    public const string MigrationTableName = MigrationConstants.MigrationTableName;

    /// <summary>Creates the context from externally supplied options.</summary>
    /// <param name="options">Options (provider, connection, ...) configured by the host.</param>
    public PageFetchDbContext(DbContextOptions<PageFetchDbContext> options)
        : base(options)
    {
    }

    /// <summary>Rows of the <c>PageFetches</c> table.</summary>
    public DbSet<PageFetchEntity> PageFetches => Set<PageFetchEntity>();

    /// <summary>
    /// Points the SQL Server provider at this context's own migrations history table
    /// inside <see cref="SchemaName"/>.
    /// </summary>
    protected override void OnConfiguring(DbContextOptionsBuilder optionsBuilder)
    {
        base.OnConfiguring(optionsBuilder);
        optionsBuilder.UseSqlServer(
            sqlServer => sqlServer.MigrationsHistoryTable(MigrationTableName, SchemaName));
    }

    /// <summary>
    /// Maps <see cref="PageFetchEntity"/> to the <c>PageFetches</c> table: key, column
    /// lengths, required columns, the server-side creation timestamp and the lookup indexes.
    /// </summary>
    protected override void OnModelCreating(ModelBuilder modelBuilder)
    {
        modelBuilder.HasDefaultSchema(SchemaName);

        var pageFetch = modelBuilder.Entity<PageFetchEntity>();

        // Table, key and indexes.
        pageFetch.ToTable("PageFetches");
        pageFetch.HasKey(fetch => fetch.Id);
        pageFetch.HasIndex(fetch => fetch.Url);
        pageFetch.HasIndex(fetch => fetch.CreatedAt);

        // Length-limited columns.
        pageFetch.Property(fetch => fetch.Id).HasMaxLength(64);
        pageFetch.Property(fetch => fetch.Url).HasMaxLength(2000).IsRequired();
        pageFetch.Property(fetch => fetch.CallerService).HasMaxLength(64).IsRequired();
        pageFetch.Property(fetch => fetch.ErrorMessage).HasMaxLength(2000);

        // Unbounded text columns that must always carry a value.
        pageFetch.Property(fetch => fetch.Html).IsRequired();
        pageFetch.Property(fetch => fetch.Text).IsRequired();

        // The database assigns the UTC creation time when none is supplied.
        pageFetch.Property(fetch => fetch.CreatedAt).HasDefaultValueSql("SYSUTCDATETIME()");
    }
}
@@ -0,0 +1,19 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- Class library holding the EF Core data layer (root namespace PageFetcher.Data). -->
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<!-- Output assembly is named page-fetcher-data; code lives under the PageFetcher.Data namespace. -->
<AssemblyName>page-fetcher-data</AssemblyName>
<RootNamespace>PageFetcher.Data</RootNamespace>
</PropertyGroup>
<ItemGroup>
<!-- No Version attributes here: presumably resolved via central package management (confirm in Directory.Packages.props). -->
<PackageReference Include="Microsoft.EntityFrameworkCore.SqlServer" />
<!-- PrivateAssets=all keeps the Design package from flowing to projects that reference this one. -->
<PackageReference Include="Microsoft.EntityFrameworkCore.Design">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\shared-data\shared-data.csproj" />
</ItemGroup>
</Project>
+1 -1
View File
@@ -18,7 +18,7 @@
<!-- Config -->
<PackageVersion Include="DotNetEnv" Version="3.2.0" />
<!-- HTTP / Refit -->
<PackageVersion Include="Refit.HttpClientFactory" Version="10.1.6" />
<PackageVersion Include="Refit.HttpClientFactory" Version="11.0.1" />
<!-- Serilog -->
<PackageVersion Include="Serilog.AspNetCore" Version="10.0.0" />
<PackageVersion Include="Serilog.Enrichers.Environment" Version="3.0.1" />
+2
View File
@@ -9,6 +9,7 @@ COPY Apis/cv-search-data/cv-search-data.csproj Apis/cv-search-data/
COPY Apis/cv-matcher-api-models/cv-matcher-api-models.csproj Apis/cv-matcher-api-models/
COPY Apis/email-data/email-data.csproj Apis/email-data/
COPY Apis/email-api-models/email-api-models.csproj Apis/email-api-models/
COPY Apis/page-fetcher-api-models/page-fetcher-api-models.csproj Apis/page-fetcher-api-models/
COPY Apis/common/common.csproj Apis/common/
COPY Apis/myai-data/myai-data.csproj Apis/myai-data/
COPY Apis/shared-data/shared-data.csproj Apis/shared-data/
@@ -22,6 +23,7 @@ COPY Apis/cv-search-data/ Apis/cv-search-data/
COPY Apis/cv-matcher-api-models/ Apis/cv-matcher-api-models/
COPY Apis/email-data/ Apis/email-data/
COPY Apis/email-api-models/ Apis/email-api-models/
COPY Apis/page-fetcher-api-models/ Apis/page-fetcher-api-models/
COPY Apis/common/ Apis/common/
COPY Apis/myai-data/ Apis/myai-data/
COPY Apis/shared-data/ Apis/shared-data/
+15 -2
View File
@@ -7,13 +7,14 @@ using Email.Data;
using Email.Data.Repositories;
using Email.Data.Repositories.Contracts;
using Email.Data.Services;
using EmailApi.Models.Clients;
using Email.Models.Clients;
using CvSearchJob.Tasks;
using JobScheduler.Scheduling;
using JobScheduler.Tasks;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;
using PageFetcher.Models;
using Refit;
using Serilog;
using Common.Settings;
@@ -81,7 +82,19 @@ try
client.DefaultRequestHeaders.Add("X-Internal-Api-Key", key);
});
builder.Services.AddHttpClient<HtmlJobSearcher>();
// Refit client for page-fetcher-api. The base address and the internal API key header
// are read from the "PageFetcherApi" configuration section; each is applied only when
// a non-blank value is configured.
builder.Services
    .AddRefitClient<IPageFetcherApiClient>()
    .ConfigureHttpClient((services, httpClient) =>
    {
        var configuration = services.GetRequiredService<Microsoft.Extensions.Configuration.IConfiguration>();

        var configuredBaseUrl = configuration["PageFetcherApi:BaseUrl"];
        if (!string.IsNullOrWhiteSpace(configuredBaseUrl))
        {
            // Normalise to exactly one trailing slash.
            httpClient.BaseAddress = new Uri(configuredBaseUrl.TrimEnd('/') + "/");
        }

        var apiKey = configuration["PageFetcherApi:InternalApiKey"];
        if (!string.IsNullOrWhiteSpace(apiKey))
        {
            httpClient.DefaultRequestHeaders.Add("X-Internal-Api-Key", apiKey);
        }
    });
builder.Services.AddSingleton<HtmlJobSearcher>();
builder.Services.AddSingleton<CvSearchEmailSender>();
builder.Services.AddSingleton<CvSearchJobTask>();
@@ -1,8 +1,8 @@
using CvMatcher.Models.Responses;
using CvSearch.Data.Entities;
using Email.Data.Services;
using EmailApi.Models.Clients;
using EmailApi.Models.Requests;
using Email.Models.Clients;
using Email.Models.Requests;
using Microsoft.Extensions.Logging;
namespace CvSearchJob.Services;
+32 -81
View File
@@ -1,36 +1,39 @@
using System.Text.RegularExpressions;
using System.Web;
using CvMatcher.Models.Settings;
using Microsoft.Playwright;
using PageFetcher.Models;
using Microsoft.Extensions.Logging;
namespace CvSearchJob.Services;
/// <summary>
/// Config-driven HTML scraper that fetches a provider's job listing page and extracts matching job URLs.
/// Uses a two-stage anchor filter: href must contain the provider's link pattern, and anchor text must
/// contain at least one CV keyword.
/// Supports both plain HTTP GET (default) and headless Chromium rendering for JS-heavy SPAs.
/// A URL and its anchor text as scraped from a job listing search-results page.
/// </summary>
/// <param name="Url">Job URL; the searcher passes the absolute URI trimmed to its path (<c>GetLeftPart(UriPartial.Path)</c>).</param>
/// <param name="Title">Anchor text associated with <paramref name="Url"/> as extracted by the searcher.</param>
public sealed record JobCandidate(string Url, string Title);
/// <summary>
/// Config-driven HTML scraper that fetches a provider's job listing page via <c>page-fetcher-api</c>
/// and extracts matching job URL candidates.
/// Uses a two-stage anchor filter: href must contain the provider's link pattern, and (optionally)
/// anchor text must contain at least one CV keyword.
/// </summary>
public sealed class HtmlJobSearcher
{
private readonly HttpClient _http;
private readonly IPageFetcherApiClient _pageFetcher;
private readonly ILogger<HtmlJobSearcher> _logger;
public HtmlJobSearcher(HttpClient http, ILogger<HtmlJobSearcher> logger)
public HtmlJobSearcher(IPageFetcherApiClient pageFetcher, ILogger<HtmlJobSearcher> logger)
{
_http = http;
_pageFetcher = pageFetcher;
_logger = logger;
_http.Timeout = TimeSpan.FromSeconds(20);
_http.DefaultRequestHeaders.UserAgent.ParseAdd("Mozilla/5.0 (compatible; MyAi.ro CV-Search/1.0)");
}
/// <summary>
/// Fetches the provider's search result page for the combined initial + CV keywords, parses all anchor
/// tags, applies the two-stage filter, and returns up to <see cref="JobProviderConfig.MaxResults"/> absolute URLs.
/// Returns an empty list when the HTTP request fails rather than throwing.
/// Fetches the provider's search result page, parses all anchor tags, applies the two-stage filter,
/// and returns up to <see cref="JobProviderConfig.MaxResults"/> candidates (URL + title).
/// Returns an empty list when the page fetch fails rather than throwing.
/// </summary>
public async Task<IReadOnlyList<string>> SearchJobUrlsAsync(
public async Task<IReadOnlyList<JobCandidate>> SearchJobUrlsAsync(
JobProviderConfig provider,
IReadOnlyList<string> cvKeywords,
string? location,
@@ -61,24 +64,29 @@ public sealed class HtmlJobSearcher
.Replace("{location-slug}", locationSlug);
_logger.LogInformation(
"Provider {Provider}: fetching {Url} [{Mode}] | CV keywords: [{Keywords}] | Location: {Location}",
"Provider {Provider}: fetching {Url} | CV keywords: [{Keywords}] | Location: {Location}",
provider.Name, searchUrl,
provider.UseHeadlessBrowser ? "headless" : "http",
string.Join(", ", cvKeywords),
location ?? "(none)");
string? html;
if (provider.UseHeadlessBrowser)
html = await FetchWithPlaywrightAsync(provider.Name, searchUrl, ct);
else
html = await FetchWithHttpAsync(provider.Name, searchUrl, ct);
var fetchResponse = await _pageFetcher.FetchAsync(new FetchPageRequest
{
Url = searchUrl,
WaitFor = provider.UseHeadlessBrowser ? "networkidle" : "domcontentloaded",
CallerService = "cv-search-job"
}, ct);
if (html is null) return [];
if (!fetchResponse.Success || string.IsNullOrWhiteSpace(fetchResponse.Html))
{
_logger.LogWarning("Provider {Provider}: page fetch failed — {Error}", provider.Name, fetchResponse.Error);
return [];
}
var html = fetchResponse.Html;
_logger.LogInformation("Provider {Provider}: received {Length} chars of HTML", provider.Name, html.Length);
var baseUri = new Uri(searchUrl);
var results = new List<string>();
var results = new List<JobCandidate>();
var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
var anchorPattern = new Regex(@"<a[^>]+href=[""']([^""']+)[""'][^>]*>(.*?)</a>",
@@ -123,7 +131,7 @@ public sealed class HtmlJobSearcher
var url = absoluteUri.GetLeftPart(UriPartial.Path);
if (seen.Add(url))
results.Add(url);
results.Add(new JobCandidate(url, anchorText));
}
_logger.LogInformation(
@@ -132,61 +140,4 @@ public sealed class HtmlJobSearcher
return results;
}
/// <summary>
/// Downloads <paramref name="url"/> with a plain HTTP GET and returns the response body.
/// Any exception is logged against <paramref name="providerName"/> and reported to the
/// caller as <c>null</c> instead of being thrown.
/// </summary>
/// <param name="providerName">Provider name, used only in the log message.</param>
/// <param name="url">Address to download.</param>
/// <param name="ct">Token passed through to the HTTP call.</param>
/// <returns>The response body, or <c>null</c> when the request failed.</returns>
private async Task<string?> FetchWithHttpAsync(string providerName, string url, CancellationToken ct)
{
    string? body = null;

    try
    {
        body = await _http.GetStringAsync(url, ct);
    }
    catch (Exception fetchError)
    {
        _logger.LogError(fetchError, "Provider {Provider}: HTTP fetch failed for {Url}", providerName, url);
    }

    return body;
}
/// <summary>
/// Renders <paramref name="url"/> in a freshly launched headless Chromium instance and
/// returns the page's HTML content. Returns <c>null</c> when navigation yields no response,
/// an HTTP status of 400 or above, or when any other exception occurs; failures are logged
/// rather than thrown.
/// </summary>
/// <param name="providerName">Provider name, used only in log messages.</param>
/// <param name="url">Address to navigate to.</param>
/// <param name="ct">
/// NOTE(review): accepted but never referenced in the body, so cancellation is not
/// observed by this method.
/// </param>
/// <returns>The rendered HTML, possibly partial after a navigation timeout, or <c>null</c> on failure.</returns>
private async Task<string?> FetchWithPlaywrightAsync(string providerName, string url, CancellationToken ct)
{
try
{
// A new Playwright driver and browser are created for every call and disposed on exit.
using var playwright = await Playwright.CreateAsync();
await using var browser = await playwright.Chromium.LaunchAsync(new BrowserTypeLaunchOptions
{
Headless = true,
// Sandbox and /dev/shm flags — presumably needed to run Chromium inside a container; confirm.
Args = ["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"]
});
var page = await browser.NewPageAsync();
IResponse? response;
try
{
response = await page.GotoAsync(url, new PageGotoOptions
{
WaitUntil = WaitUntilState.NetworkIdle,
// Navigation timeout of 30 seconds (value is in milliseconds).
Timeout = 30_000
});
}
catch (TimeoutException)
{
// NetworkIdle timed out — use whatever content rendered so far
_logger.LogWarning("Provider {Provider}: Playwright NetworkIdle timeout for {Url}, using partial content", providerName, url);
return await page.ContentAsync();
}
// No response object or an HTTP error status: treat the fetch as failed.
if (response is null || response.Status >= 400)
{
_logger.LogWarning("Provider {Provider}: Playwright got HTTP {Status} for {Url}", providerName, response?.Status, url);
return null;
}
return await page.ContentAsync();
}
catch (Exception ex)
{
// Catch-all: launch, navigation and content errors are all logged and mapped to null.
_logger.LogError(ex, "Provider {Provider}: Playwright fetch failed for {Url}", providerName, url);
return null;
}
}
}
+44 -10
View File
@@ -11,6 +11,7 @@ using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using PageFetcher.Models;
namespace CvSearchJob.Tasks;
@@ -24,6 +25,7 @@ public sealed class CvSearchJobTask : IJobTask
private readonly JobSearchSettings _settings;
private readonly HtmlJobSearcher _searcher;
private readonly ICvMatcherInternalApi _matcherApi;
private readonly IPageFetcherApiClient _pageFetcher;
private readonly CvSearchEmailSender _emailSender;
private readonly ILogger<CvSearchJobTask> _logger;
@@ -34,6 +36,7 @@ public sealed class CvSearchJobTask : IJobTask
IOptions<JobSearchSettings> settings,
HtmlJobSearcher searcher,
ICvMatcherInternalApi matcherApi,
IPageFetcherApiClient pageFetcher,
CvSearchEmailSender emailSender,
ILogger<CvSearchJobTask> logger)
{
@@ -41,6 +44,7 @@ public sealed class CvSearchJobTask : IJobTask
_settings = settings.Value;
_searcher = searcher;
_matcherApi = matcherApi;
_pageFetcher = pageFetcher;
_emailSender = emailSender;
_logger = logger;
}
@@ -126,7 +130,8 @@ public sealed class CvSearchJobTask : IJobTask
/// <summary>
/// Runs the full search pipeline for a session: scrapes all providers, deduplicates URLs,
/// scores each candidate via the matcher API, and persists results that meet the minimum score threshold.
/// fetches each individual job page via page-fetcher-api, applies a keyword pre-filter,
/// scores passing candidates via the matcher API, and persists results that meet the minimum score threshold.
/// </summary>
private async Task<List<JobSearchResultEntity>> RunSearchAsync(
JobSearchSessionEntity session,
@@ -138,30 +143,59 @@ public sealed class CvSearchJobTask : IJobTask
if (cvKeywords.Count == 0)
_logger.LogWarning("Session {SessionId}: keyword list is empty — scraper will rely on provider InitialKeywords only", session.Id);
var jobUrls = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
var jobCandidates = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase); // url → title
foreach (var provider in providers)
{
var urls = await _searcher.SearchJobUrlsAsync(provider, cvKeywords, session.Location, ct);
_logger.LogInformation("Session {SessionId}: provider {Provider} returned {Count} URLs", session.Id, provider.Name, urls.Count);
foreach (var url in urls) jobUrls.Add(url);
var candidates = await _searcher.SearchJobUrlsAsync(provider, cvKeywords, session.Location, ct);
_logger.LogInformation("Session {SessionId}: provider {Provider} returned {Count} candidates", session.Id, provider.Name, candidates.Count);
foreach (var c in candidates)
jobCandidates.TryAdd(c.Url, c.Title);
}
var candidates = jobUrls.Take(_settings.MaxJobsToMatch).ToList();
var deduped = jobCandidates.Take(_settings.MaxJobsToMatch).ToList();
_logger.LogInformation(
"Session {SessionId}: {Total} unique URLs across all providers, scoring {Scoring} (cap={Cap})",
session.Id, jobUrls.Count, candidates.Count, _settings.MaxJobsToMatch);
"Session {SessionId}: {Total} unique URLs across all providers, processing up to {Cap}",
session.Id, jobCandidates.Count, deduped.Count);
var results = new List<JobSearchResultEntity>();
foreach (var url in candidates)
foreach (var (url, title) in deduped)
{
try
{
// Fetch individual job page text via page-fetcher-api
var fetchResponse = await _pageFetcher.FetchAsync(new FetchPageRequest
{
Url = url,
WaitFor = "domcontentloaded",
CallerService = "cv-search-job"
}, ct);
if (!fetchResponse.Success || string.IsNullOrWhiteSpace(fetchResponse.Text))
{
_logger.LogWarning("Session {SessionId}: fetch failed for {Url} — {Error}", session.Id, url, fetchResponse.Error);
continue;
}
var jobText = fetchResponse.Text;
// Keyword pre-filter: skip LLM call if no CV keyword appears in the job page text
if (cvKeywords.Count > 0 &&
!cvKeywords.Any(k => jobText.Contains(k, StringComparison.OrdinalIgnoreCase)))
{
_logger.LogInformation(
"Session {SessionId}: pre-filter skip | {Url} | no CV keyword found in job text",
session.Id, url);
continue;
}
var matchRequest = new MatchJobRequest
{
CvDocumentId = session.CvDocumentId,
JobUrl = url,
// Pre-fetched text passed directly so cv-matcher-api skips re-fetching the page
JobDescription = jobText,
// User already gave GDPR consent when they clicked the one-time job search link
GdprConsent = true
};
@@ -182,7 +216,7 @@ public sealed class CvSearchJobTask : IJobTask
SessionId = session.Id,
ProviderName = GuessProvider(url, providers),
JobUrl = url,
JobTitle = matchResult.Summary.Split('.').FirstOrDefault()?.Trim() ?? "Job",
JobTitle = matchResult.Summary.Split('.').FirstOrDefault()?.Trim() ?? title,
JobText = string.Empty,
Score = matchResult.Score,
ResultJson = JsonSerializer.Serialize(matchResult, new JsonSerializerOptions(JsonSerializerDefaults.Web)),
+1 -1
View File
@@ -13,7 +13,6 @@
<PackageReference Include="Microsoft.Extensions.Hosting" />
<PackageReference Include="Microsoft.EntityFrameworkCore.SqlServer" />
<PackageReference Include="Refit.HttpClientFactory" />
<PackageReference Include="Microsoft.Playwright" />
</ItemGroup>
<ItemGroup>
@@ -26,6 +25,7 @@
<ProjectReference Include="..\..\Apis\cv-search-data\cv-search-data.csproj" />
<ProjectReference Include="..\..\Apis\common\common.csproj" />
<ProjectReference Include="..\..\Apis\email-data\email-data.csproj" />
<ProjectReference Include="..\..\Apis\page-fetcher-api-models\page-fetcher-api-models.csproj" />
<ProjectReference Include="..\..\Helpers\startup-helpers\startup-helpers.csproj" />
<ProjectReference Include="..\job-scheduler\job-scheduler.csproj" />
</ItemGroup>
+38
View File
@@ -72,6 +72,9 @@ services:
- RagApi__BaseUrl=${RagApi__BaseUrl:-http://rag-api:8080}
- RagApi__InternalApiKey=${RagApi__InternalApiKey:-}
- PageFetcherApi__BaseUrl=${PageFetcherApi__BaseUrl:-http://page-fetcher-api:8080}
- PageFetcherApi__InternalApiKey=${PageFetcherApi__InternalApiKey:-}
- Ai__Provider=${Ai__Provider:-OpenAI}
- Ai__OpenAI__ApiKey=${Ai__OpenAI__ApiKey:-}
- Ai__OpenAI__ChatModel=${Ai__OpenAI__ChatModel:-gpt-4o-mini}
@@ -266,6 +269,9 @@ services:
- EmailApi__BaseUrl=${EmailApi__BaseUrl:-http://email-api:8080}
- EmailApi__InternalApiKey=${EmailApi__InternalApiKey:-}
- PageFetcherApi__BaseUrl=${PageFetcherApi__BaseUrl:-http://page-fetcher-api:8080}
- PageFetcherApi__InternalApiKey=${PageFetcherApi__InternalApiKey:-}
- FileStorage__Path=${FileStorage__Path:-Files}
- JobSearch__Enabled=${JobSearch__Enabled:-true}
@@ -293,6 +299,38 @@ services:
labels:
- "com.centurylinklabs.watchtower.enable=true"
# Internal page-fetcher-api service. Other services in this file reach it at
# http://page-fetcher-api:8080 by default (see their PageFetcherApi__BaseUrl entries).
page-fetcher-api:
image: registry.easysoft.ro/apps/myai-page-fetcher-api:${IMAGE_TAG:-staging}
container_name: myai-page-fetcher-api
environment:
- ASPNETCORE_ENVIRONMENT=${ASPNETCORE_ENVIRONMENT:-Staging}
- ASPNETCORE_URLS=${ASPNETCORE_URLS:-http://+:8080}
- APP_ENVIRONMENT_NAME=${APP_ENVIRONMENT_NAME:-myai.staging}
# Database connection settings; the password has no default and must be supplied by the environment.
- Database__Host=${Database__Host:-sqlserver}
- Database__Port=${Database__Port:-1433}
- Database__Name=${Database__Name:-MyAiDb}
- Database__User=${Database__User:-sa}
- Database__Password=${Database__Password:-}
- Database__TrustServerCertificate=${Database__TrustServerCertificate:-true}
# The key this service expects is taken from PageFetcherApi__InternalApiKey, the same
# variable handed to the calling services, so both sides share one value.
# NOTE(review): the key defaults to empty while RequireApiKey is true — confirm how the
# service behaves when no key is configured.
- InternalApi__ApiKey=${PageFetcherApi__InternalApiKey:-}
- InternalApi__RequireApiKey=true
# SMTP settings for Serilog e-mail logging; all default to empty except the port.
- SerilogEmail__From=${SerilogEmail__From:-}
- SerilogEmail__To=${SerilogEmail__To:-}
- SerilogEmail__Host=${SerilogEmail__Host:-}
- SerilogEmail__Port=${SerilogEmail__Port:-587}
- SerilogEmail__UserName=${SerilogEmail__UserName:-}
- SerilogEmail__Password=${SerilogEmail__Password:-}
volumes:
# Container logs in /app/logs are persisted on the host under LOGS_PATH.
- ${LOGS_PATH:-/opt/myai/logs}/page-fetcher-api:/app/logs
networks:
- myai-network
restart: unless-stopped
labels:
# Opts this container in to Watchtower-managed updates.
- "com.centurylinklabs.watchtower.enable=true"
web:
image: registry.easysoft.ro/apps/myai-web:${IMAGE_TAG:-staging}
container_name: myai-web
+45
View File
@@ -63,6 +63,12 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "email-api", "Apis\email-api
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "email-data", "Apis\email-data\email-data.csproj", "{C1D2E3F4-A5B6-4789-CDEF-012345678ABC}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "page-fetcher-api-models", "Apis\page-fetcher-api-models\page-fetcher-api-models.csproj", "{4F1A669E-C8AF-428F-87E7-3E0A213DD20B}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "page-fetcher-data", "Apis\page-fetcher-data\page-fetcher-data.csproj", "{06F803CD-329D-40C2-B62D-0F14E137D3C7}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "page-fetcher-api", "Apis\page-fetcher-api\page-fetcher-api.csproj", "{FC5A722A-7B12-459E-AB9F-0A724797783E}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -357,6 +363,42 @@ Global
{C1D2E3F4-A5B6-4789-CDEF-012345678ABC}.Release|x64.Build.0 = Release|Any CPU
{C1D2E3F4-A5B6-4789-CDEF-012345678ABC}.Release|x86.ActiveCfg = Release|Any CPU
{C1D2E3F4-A5B6-4789-CDEF-012345678ABC}.Release|x86.Build.0 = Release|Any CPU
{4F1A669E-C8AF-428F-87E7-3E0A213DD20B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{4F1A669E-C8AF-428F-87E7-3E0A213DD20B}.Debug|Any CPU.Build.0 = Debug|Any CPU
{4F1A669E-C8AF-428F-87E7-3E0A213DD20B}.Debug|x64.ActiveCfg = Debug|Any CPU
{4F1A669E-C8AF-428F-87E7-3E0A213DD20B}.Debug|x64.Build.0 = Debug|Any CPU
{4F1A669E-C8AF-428F-87E7-3E0A213DD20B}.Debug|x86.ActiveCfg = Debug|Any CPU
{4F1A669E-C8AF-428F-87E7-3E0A213DD20B}.Debug|x86.Build.0 = Debug|Any CPU
{4F1A669E-C8AF-428F-87E7-3E0A213DD20B}.Release|Any CPU.ActiveCfg = Release|Any CPU
{4F1A669E-C8AF-428F-87E7-3E0A213DD20B}.Release|Any CPU.Build.0 = Release|Any CPU
{4F1A669E-C8AF-428F-87E7-3E0A213DD20B}.Release|x64.ActiveCfg = Release|Any CPU
{4F1A669E-C8AF-428F-87E7-3E0A213DD20B}.Release|x64.Build.0 = Release|Any CPU
{4F1A669E-C8AF-428F-87E7-3E0A213DD20B}.Release|x86.ActiveCfg = Release|Any CPU
{4F1A669E-C8AF-428F-87E7-3E0A213DD20B}.Release|x86.Build.0 = Release|Any CPU
{06F803CD-329D-40C2-B62D-0F14E137D3C7}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{06F803CD-329D-40C2-B62D-0F14E137D3C7}.Debug|Any CPU.Build.0 = Debug|Any CPU
{06F803CD-329D-40C2-B62D-0F14E137D3C7}.Debug|x64.ActiveCfg = Debug|Any CPU
{06F803CD-329D-40C2-B62D-0F14E137D3C7}.Debug|x64.Build.0 = Debug|Any CPU
{06F803CD-329D-40C2-B62D-0F14E137D3C7}.Debug|x86.ActiveCfg = Debug|Any CPU
{06F803CD-329D-40C2-B62D-0F14E137D3C7}.Debug|x86.Build.0 = Debug|Any CPU
{06F803CD-329D-40C2-B62D-0F14E137D3C7}.Release|Any CPU.ActiveCfg = Release|Any CPU
{06F803CD-329D-40C2-B62D-0F14E137D3C7}.Release|Any CPU.Build.0 = Release|Any CPU
{06F803CD-329D-40C2-B62D-0F14E137D3C7}.Release|x64.ActiveCfg = Release|Any CPU
{06F803CD-329D-40C2-B62D-0F14E137D3C7}.Release|x64.Build.0 = Release|Any CPU
{06F803CD-329D-40C2-B62D-0F14E137D3C7}.Release|x86.ActiveCfg = Release|Any CPU
{06F803CD-329D-40C2-B62D-0F14E137D3C7}.Release|x86.Build.0 = Release|Any CPU
{FC5A722A-7B12-459E-AB9F-0A724797783E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{FC5A722A-7B12-459E-AB9F-0A724797783E}.Debug|Any CPU.Build.0 = Debug|Any CPU
{FC5A722A-7B12-459E-AB9F-0A724797783E}.Debug|x64.ActiveCfg = Debug|Any CPU
{FC5A722A-7B12-459E-AB9F-0A724797783E}.Debug|x64.Build.0 = Debug|Any CPU
{FC5A722A-7B12-459E-AB9F-0A724797783E}.Debug|x86.ActiveCfg = Debug|Any CPU
{FC5A722A-7B12-459E-AB9F-0A724797783E}.Debug|x86.Build.0 = Debug|Any CPU
{FC5A722A-7B12-459E-AB9F-0A724797783E}.Release|Any CPU.ActiveCfg = Release|Any CPU
{FC5A722A-7B12-459E-AB9F-0A724797783E}.Release|Any CPU.Build.0 = Release|Any CPU
{FC5A722A-7B12-459E-AB9F-0A724797783E}.Release|x64.ActiveCfg = Release|Any CPU
{FC5A722A-7B12-459E-AB9F-0A724797783E}.Release|x64.Build.0 = Release|Any CPU
{FC5A722A-7B12-459E-AB9F-0A724797783E}.Release|x86.ActiveCfg = Release|Any CPU
{FC5A722A-7B12-459E-AB9F-0A724797783E}.Release|x86.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@@ -385,6 +427,9 @@ Global
{BE44B4EB-9AB9-4D81-A9BF-5CF2832BEEE5} = {A9B8C7D6-E5F4-4321-ABCD-FEDCBA987654}
{434119EA-2FFC-4433-9B8E-1E6D94006413} = {0FE6558F-2157-47F2-A835-558416CE0E2B}
{C1D2E3F4-A5B6-4789-CDEF-012345678ABC} = {D4E5F6A7-B8C9-4012-3456-789ABCDEF012}
{4F1A669E-C8AF-428F-87E7-3E0A213DD20B} = {A9B8C7D6-E5F4-4321-ABCD-FEDCBA987654}
{06F803CD-329D-40C2-B62D-0F14E137D3C7} = {D4E5F6A7-B8C9-4012-3456-789ABCDEF012}
{FC5A722A-7B12-459E-AB9F-0A724797783E} = {0FE6558F-2157-47F2-A835-558416CE0E2B}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {6246A67B-299E-4E64-8DBE-1A66771E7C67}