From 898dd09d50913ea481d46df452b4800895b72e9f Mon Sep 17 00:00:00 2001 From: claude Date: Mon, 8 Jun 2026 17:43:56 +0300 Subject: [PATCH] =?UTF-8?q?feat:=20add=20page-fetcher-api=20=E2=80=94=20ce?= =?UTF-8?q?ntralised=20Playwright=20page=20fetcher?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces page-fetcher-api, a new internal ASP.NET Core service that centralises all web-page fetching through a single Playwright (headless Chromium) browser instance. All fetches are persisted to the pageFetcher SQL schema for auditing. New projects: - Apis/page-fetcher-api-models: FetchPageRequest, FetchPageResponse, IPageFetcherApiClient - Apis/page-fetcher-data: PageFetchDbContext, PageFetchEntity, InitialSchema migration (schema: pageFetcher) - Apis/page-fetcher-api: PlaywrightBrowserService (singleton), PageFetcherService, PageController Changes to existing services: - cv-matcher-api: JobTextExtractor now calls IPageFetcherApiClient instead of HttpClient - cv-search-job: HtmlJobSearcher uses IPageFetcherApiClient (removes inline Playwright); CvSearchJobTask fetches individual job pages and applies keyword pre-filter before LLM call; passes pre-fetched JobDescription to cv-matcher-api to skip re-fetch - common: add PageFetcherApiSettings - docker-compose.yml, build.yml: add new service + env vars for callers Closes #43 Co-Authored-By: Claude Sonnet 4.6 --- .gitea/workflows/build.yml | 11 +- .../common/Settings/PageFetcherApiSettings.cs | 11 ++ Apis/cv-matcher-api/Program.cs | 13 +- .../Services/JobTextExtractor.cs | 30 ++-- Apis/cv-matcher-api/cv-matcher-api.csproj | 1 + .../FetchPageRequest.cs | 20 +++ .../FetchPageResponse.cs | 25 +++ .../IPageFetcherApiClient.cs | 16 ++ .../page-fetcher-api-models.csproj | 12 ++ .../Controllers/PageController.cs | 47 ++++++ Apis/page-fetcher-api/Dockerfile | 50 ++++++ Apis/page-fetcher-api/Program.cs | 74 +++++++++ .../Properties/launchSettings.json | 12 ++ 
.../Services/PageFetcherService.cs | 143 ++++++++++++++++++ .../Services/PageFetcherSettings.cs | 17 +++ .../Services/PlaywrightBrowserService.cs | 49 ++++++ Apis/page-fetcher-api/appsettings.json | 73 +++++++++ Apis/page-fetcher-api/page-fetcher-api.csproj | 34 +++++ .../Data/Entities/PageFetchEntity.cs | 34 +++++ Apis/page-fetcher-data/MigrationConstants.cs | 8 + .../20260608143523_InitialSchema.Designer.cs | 82 ++++++++++ .../20260608143523_InitialSchema.cs | 59 ++++++++ .../PageFetchDbContextModelSnapshot.cs | 79 ++++++++++ Apis/page-fetcher-data/PageFetchDbContext.cs | 45 ++++++ .../page-fetcher-data.csproj | 19 +++ Jobs/cv-search-job/Program.cs | 15 +- .../cv-search-job/Services/HtmlJobSearcher.cs | 113 ++++---------- Jobs/cv-search-job/Tasks/CvSearchJobTask.cs | 54 +++++-- Jobs/cv-search-job/cv-search-job.csproj | 2 +- docker-compose/docker-compose.yml | 38 +++++ myAi.sln | 45 ++++++ 31 files changed, 1121 insertions(+), 110 deletions(-) create mode 100644 Apis/common/Settings/PageFetcherApiSettings.cs create mode 100644 Apis/page-fetcher-api-models/FetchPageRequest.cs create mode 100644 Apis/page-fetcher-api-models/FetchPageResponse.cs create mode 100644 Apis/page-fetcher-api-models/IPageFetcherApiClient.cs create mode 100644 Apis/page-fetcher-api-models/page-fetcher-api-models.csproj create mode 100644 Apis/page-fetcher-api/Controllers/PageController.cs create mode 100644 Apis/page-fetcher-api/Dockerfile create mode 100644 Apis/page-fetcher-api/Program.cs create mode 100644 Apis/page-fetcher-api/Properties/launchSettings.json create mode 100644 Apis/page-fetcher-api/Services/PageFetcherService.cs create mode 100644 Apis/page-fetcher-api/Services/PageFetcherSettings.cs create mode 100644 Apis/page-fetcher-api/Services/PlaywrightBrowserService.cs create mode 100644 Apis/page-fetcher-api/appsettings.json create mode 100644 Apis/page-fetcher-api/page-fetcher-api.csproj create mode 100644 Apis/page-fetcher-data/Data/Entities/PageFetchEntity.cs create mode 
100644 Apis/page-fetcher-data/MigrationConstants.cs create mode 100644 Apis/page-fetcher-data/Migrations/20260608143523_InitialSchema.Designer.cs create mode 100644 Apis/page-fetcher-data/Migrations/20260608143523_InitialSchema.cs create mode 100644 Apis/page-fetcher-data/Migrations/PageFetchDbContextModelSnapshot.cs create mode 100644 Apis/page-fetcher-data/PageFetchDbContext.cs create mode 100644 Apis/page-fetcher-data/page-fetcher-data.csproj diff --git a/.gitea/workflows/build.yml b/.gitea/workflows/build.yml index 77e6f41..850a436 100644 --- a/.gitea/workflows/build.yml +++ b/.gitea/workflows/build.yml @@ -15,6 +15,7 @@ env: WEB_IMAGE: apps/myai-web CV_CLEANUP_JOB_IMAGE: apps/myai-cv-cleanup-job CV_SEARCH_JOB_IMAGE: apps/myai-cv-search-job + PAGE_FETCHER_API_IMAGE: apps/myai-page-fetcher-api IMAGE_TAG: staging jobs: @@ -62,6 +63,10 @@ jobs: run: | docker build -f Jobs/cv-search-job/Dockerfile -t "${REGISTRY_HOST}/${CV_SEARCH_JOB_IMAGE}:${IMAGE_TAG}" . + - name: Build Page Fetcher API image + run: | + docker build -f Apis/page-fetcher-api/Dockerfile -t "${REGISTRY_HOST}/${PAGE_FETCHER_API_IMAGE}:${IMAGE_TAG}" . + - name: Push API image run: | docker push "${REGISTRY_HOST}/${API_IMAGE}:${IMAGE_TAG}" @@ -88,4 +93,8 @@ jobs: - name: Push CV search job image run: | - docker push "${REGISTRY_HOST}/${CV_SEARCH_JOB_IMAGE}:${IMAGE_TAG}" \ No newline at end of file + docker push "${REGISTRY_HOST}/${CV_SEARCH_JOB_IMAGE}:${IMAGE_TAG}" + + - name: Push Page Fetcher API image + run: | + docker push "${REGISTRY_HOST}/${PAGE_FETCHER_API_IMAGE}:${IMAGE_TAG}" \ No newline at end of file diff --git a/Apis/common/Settings/PageFetcherApiSettings.cs b/Apis/common/Settings/PageFetcherApiSettings.cs new file mode 100644 index 0000000..c6367ac --- /dev/null +++ b/Apis/common/Settings/PageFetcherApiSettings.cs @@ -0,0 +1,11 @@ +namespace Common.Settings; + +/// +/// Connection settings for the internal page-fetcher-api service. +/// Bound from the PageFetcherApi configuration section. 
+/// +public sealed class PageFetcherApiSettings +{ + public string BaseUrl { get; set; } = string.Empty; + public string InternalApiKey { get; set; } = string.Empty; +} diff --git a/Apis/cv-matcher-api/Program.cs b/Apis/cv-matcher-api/Program.cs index f247251..0fd642c 100644 --- a/Apis/cv-matcher-api/Program.cs +++ b/Apis/cv-matcher-api/Program.cs @@ -13,6 +13,7 @@ using Microsoft.EntityFrameworkCore; using Refit; using Serilog; using Common.Settings; +using PageFetcher.Models; using StartupHelpers; using System.Reflection; @@ -36,6 +37,16 @@ try builder.Services.Configure(builder.Configuration.GetSection("Ai")); builder.Services.Configure(builder.Configuration.GetSection("Matcher")); builder.Services.Configure(builder.Configuration.GetSection("JobSearch")); + builder.Services.Configure(builder.Configuration.GetSection("PageFetcherApi")); + + builder.Services.AddRefitClient() + .ConfigureHttpClient((sp, c) => + { + var settings = sp.GetRequiredService>().Value; + c.BaseAddress = new Uri(settings.BaseUrl.TrimEnd('/') + "/"); + if (!string.IsNullOrWhiteSpace(settings.InternalApiKey)) + c.DefaultRequestHeaders.Add("X-Internal-Api-Key", settings.InternalApiKey); + }); builder.Services.AddRefitClient() .ConfigureHttpClient((sp, c) => @@ -50,7 +61,7 @@ try builder.Services.AddScoped(); builder.Services.AddHttpClient(); - builder.Services.AddHttpClient(); + builder.Services.AddScoped(); builder.Services.AddDbContext(options => { diff --git a/Apis/cv-matcher-api/Services/JobTextExtractor.cs b/Apis/cv-matcher-api/Services/JobTextExtractor.cs index f8e806b..c16b201 100644 --- a/Apis/cv-matcher-api/Services/JobTextExtractor.cs +++ b/Apis/cv-matcher-api/Services/JobTextExtractor.cs @@ -1,26 +1,23 @@ -using System.Net; -using System.Text.RegularExpressions; using CvMatcher.Models.Settings; using Api.Services.Contracts; using Microsoft.Extensions.Options; +using PageFetcher.Models; namespace Api.Services; /// /// Extracts normalised plain text from a job posting, either from a 
pasted description or by -/// fetching and stripping the HTML of the job page URL. +/// fetching the job page text via page-fetcher-api (headless Chromium rendering). /// public sealed class JobTextExtractor : IJobTextExtractor { - private readonly HttpClient _http; + private readonly IPageFetcherApiClient _pageFetcher; private readonly MatcherSettings _settings; - public JobTextExtractor(HttpClient http, IOptions options) + public JobTextExtractor(IPageFetcherApiClient pageFetcher, IOptions options) { - _http = http; + _pageFetcher = pageFetcher; _settings = options.Value; - _http.Timeout = TimeSpan.FromSeconds(25); - _http.DefaultRequestHeaders.UserAgent.ParseAdd("MyAi.ro CV Matcher/1.0"); } /// @@ -31,15 +28,18 @@ public sealed class JobTextExtractor : IJobTextExtractor if (string.IsNullOrWhiteSpace(jobUrl)) return string.Empty; if (!Uri.TryCreate(jobUrl, UriKind.Absolute, out var uri) || uri.Scheme is not ("http" or "https")) - { throw new InvalidOperationException("Invalid job URL."); - } - var html = await _http.GetStringAsync(uri, ct); - html = Regex.Replace(html, "", " ", RegexOptions.IgnoreCase); - html = Regex.Replace(html, "", " ", RegexOptions.IgnoreCase); - html = Regex.Replace(html, "<[^>]+>", " "); - return Limit(Normalize(WebUtility.HtmlDecode(html))); + var response = await _pageFetcher.FetchAsync(new FetchPageRequest + { + Url = jobUrl, + CallerService = "cv-matcher-api" + }, ct); + + if (!response.Success) + throw new InvalidOperationException($"Failed to fetch job page: {response.Error}"); + + return Limit(Normalize(response.Text)); } /// Truncates text to the configured maximum character count. 
diff --git a/Apis/cv-matcher-api/cv-matcher-api.csproj b/Apis/cv-matcher-api/cv-matcher-api.csproj index f56e350..505514a 100644 --- a/Apis/cv-matcher-api/cv-matcher-api.csproj +++ b/Apis/cv-matcher-api/cv-matcher-api.csproj @@ -82,6 +82,7 @@ + diff --git a/Apis/page-fetcher-api-models/FetchPageRequest.cs b/Apis/page-fetcher-api-models/FetchPageRequest.cs new file mode 100644 index 0000000..7a24faa --- /dev/null +++ b/Apis/page-fetcher-api-models/FetchPageRequest.cs @@ -0,0 +1,20 @@ +namespace PageFetcher.Models; + +/// +/// Request to fetch a web page via the page-fetcher-api. +/// +public sealed class FetchPageRequest +{ + /// Absolute HTTP or HTTPS URL to fetch. + public string Url { get; set; } = string.Empty; + + /// + /// Playwright wait condition. Accepted values: networkidle (default), domcontentloaded, load. + /// + public string WaitFor { get; set; } = "networkidle"; + + /// + /// Identifies the calling service for audit purposes (e.g. cv-matcher-api, cv-search-job). + /// + public string CallerService { get; set; } = string.Empty; +} diff --git a/Apis/page-fetcher-api-models/FetchPageResponse.cs b/Apis/page-fetcher-api-models/FetchPageResponse.cs new file mode 100644 index 0000000..e4e17f8 --- /dev/null +++ b/Apis/page-fetcher-api-models/FetchPageResponse.cs @@ -0,0 +1,25 @@ +namespace PageFetcher.Models; + +/// +/// Result of a page fetch operation. +/// +public sealed class FetchPageResponse +{ + /// Final URL after any redirects. + public string Url { get; set; } = string.Empty; + + /// HTTP status code returned by the page. 0 when no response was received (network failure or navigation timeout). + public int StatusCode { get; set; } + + /// Full rendered HTML as returned by Playwright. + public string Html { get; set; } = string.Empty; + + /// Plain text extracted from the HTML (script/style stripped, whitespace normalised). + public string Text { get; set; } = string.Empty; + + /// Whether the fetch succeeded. false when the fetch failed with an error; a navigation timeout still reports true with the partially rendered content. 
+ public bool Success { get; set; } + + /// Exception message when is false. + public string? Error { get; set; } +} diff --git a/Apis/page-fetcher-api-models/IPageFetcherApiClient.cs b/Apis/page-fetcher-api-models/IPageFetcherApiClient.cs new file mode 100644 index 0000000..bfed7b4 --- /dev/null +++ b/Apis/page-fetcher-api-models/IPageFetcherApiClient.cs @@ -0,0 +1,16 @@ +using Refit; + +namespace PageFetcher.Models; + +/// +/// Refit client for the internal page-fetcher-api service. +/// All calls require the X-Internal-Api-Key header, configured at registration time. +/// +public interface IPageFetcherApiClient +{ + /// + /// Fetches a web page via headless Chromium and returns the rendered HTML and extracted plain text. + /// + [Post("/api/page/fetch")] + Task FetchAsync([Body] FetchPageRequest request, CancellationToken ct = default); +} diff --git a/Apis/page-fetcher-api-models/page-fetcher-api-models.csproj b/Apis/page-fetcher-api-models/page-fetcher-api-models.csproj new file mode 100644 index 0000000..da460be --- /dev/null +++ b/Apis/page-fetcher-api-models/page-fetcher-api-models.csproj @@ -0,0 +1,12 @@ + + + net10.0 + enable + enable + page-fetcher-api-models + PageFetcher.Models + + + + + diff --git a/Apis/page-fetcher-api/Controllers/PageController.cs b/Apis/page-fetcher-api/Controllers/PageController.cs new file mode 100644 index 0000000..99ca641 --- /dev/null +++ b/Apis/page-fetcher-api/Controllers/PageController.cs @@ -0,0 +1,47 @@ +using Microsoft.AspNetCore.Mvc; +using PageFetcher.Models; +using PageFetcherApi.Services; +using Swashbuckle.AspNetCore.Annotations; + +namespace PageFetcherApi.Controllers; + +/// +/// Handles page-fetch requests: navigates to the URL via Playwright and returns rendered HTML and extracted text. 
+/// +[ApiController] +[Route("api/page")] +public sealed class PageController : ControllerBase +{ + private readonly PageFetcherService _service; + private readonly ILogger _logger; + + public PageController(PageFetcherService service, ILogger logger) + { + _service = service; + _logger = logger; + } + + /// + /// Fetches a web page via headless Chromium. + /// Returns rendered HTML and extracted plain text. + /// + [HttpPost("fetch")] + [SwaggerOperation(Summary = "Fetch a web page", Description = "Navigates to the given URL using Playwright, returns rendered HTML and stripped plain text.")] + [SwaggerResponse(StatusCodes.Status200OK, "Page fetched successfully", typeof(FetchPageResponse))] + [SwaggerResponse(StatusCodes.Status400BadRequest, "Invalid or non-HTTP(S) URL")] + public async Task> Fetch([FromBody] FetchPageRequest request, CancellationToken ct) + { + if (string.IsNullOrWhiteSpace(request.Url)) + return BadRequest(new { Error = "Url is required." }); + + if (!Uri.TryCreate(request.Url, UriKind.Absolute, out var uri) || + (uri.Scheme != Uri.UriSchemeHttp && uri.Scheme != Uri.UriSchemeHttps)) + return BadRequest(new { Error = "Url must be an absolute HTTP or HTTPS URL." 
}); + + _logger.LogInformation("Fetch request: {Url} | caller={Caller} | waitFor={WaitFor}", + request.Url, request.CallerService, request.WaitFor); + + var result = await _service.FetchAsync(request, ct); + return Ok(result); + } +} diff --git a/Apis/page-fetcher-api/Dockerfile b/Apis/page-fetcher-api/Dockerfile new file mode 100644 index 0000000..3f9b8b2 --- /dev/null +++ b/Apis/page-fetcher-api/Dockerfile @@ -0,0 +1,50 @@ +FROM mcr.microsoft.com/dotnet/sdk:10.0 AS build +ARG BUILD_CONFIGURATION=Release +WORKDIR /src +COPY Directory.Packages.props ./ + +COPY Apis/page-fetcher-api/page-fetcher-api.csproj Apis/page-fetcher-api/ +COPY Apis/page-fetcher-data/page-fetcher-data.csproj Apis/page-fetcher-data/ +COPY Apis/page-fetcher-api-models/page-fetcher-api-models.csproj Apis/page-fetcher-api-models/ +COPY Apis/common/common.csproj Apis/common/ +COPY Apis/shared-data/shared-data.csproj Apis/shared-data/ +COPY Helpers/startup-helpers/startup-helpers.csproj Helpers/startup-helpers/ +COPY Helpers/common-helpers/common-helpers.csproj Helpers/common-helpers/ + +RUN dotnet restore Apis/page-fetcher-api/page-fetcher-api.csproj + +COPY Apis/page-fetcher-api/ Apis/page-fetcher-api/ +COPY Apis/page-fetcher-data/ Apis/page-fetcher-data/ +COPY Apis/page-fetcher-api-models/ Apis/page-fetcher-api-models/ +COPY Apis/common/ Apis/common/ +COPY Apis/shared-data/ Apis/shared-data/ +COPY Helpers/startup-helpers/ Helpers/startup-helpers/ +COPY Helpers/common-helpers/ Helpers/common-helpers/ + +RUN dotnet publish Apis/page-fetcher-api/page-fetcher-api.csproj -c $BUILD_CONFIGURATION -o /app/publish /p:UseAppHost=false + +# Download Playwright Chromium browser in the build stage. +# Node.js is only needed here to run npx — it is not copied to the final image. 
+ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright +RUN apt-get update && apt-get install -y --no-install-recommends nodejs npm \ + && npx --yes playwright@1.60.0 install chromium \ + && rm -rf /var/lib/apt/lists/* + +FROM mcr.microsoft.com/dotnet/aspnet:10.0 AS final +WORKDIR /app + +# System libraries required by Chromium. Package names (e.g. libasound2t64) follow the Ubuntu 24.04 / Debian trixie naming, not Debian bookworm +RUN apt-get update && apt-get install -y --no-install-recommends \ + libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 \ + libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 \ + libgbm1 libasound2t64 libpango-1.0-0 libcairo2 libatspi2.0-0 \ + libwayland-client0 libx11-xcb1 libx11-6 libxcb1 libxext6 \ + && rm -rf /var/lib/apt/lists/* + +# Copy the Playwright Chromium browser from the build stage +ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright +COPY --from=build /ms-playwright /ms-playwright + +COPY --from=build /app/publish . + +ENTRYPOINT ["dotnet", "page-fetcher-api.dll"] diff --git a/Apis/page-fetcher-api/Program.cs b/Apis/page-fetcher-api/Program.cs new file mode 100644 index 0000000..2a97344 --- /dev/null +++ b/Apis/page-fetcher-api/Program.cs @@ -0,0 +1,74 @@ +using System.Reflection; +using Microsoft.EntityFrameworkCore; +using PageFetcher.Data; +using PageFetcherApi.Services; +using Serilog; +using StartupHelpers; + +StartupExtensions.LoadDotEnvFile(); + +const string ServiceName = "page-fetcher-api"; +var appVersion = StartupExtensions.GetApplicationVersion(Assembly.GetExecutingAssembly()); + +try +{ + var builder = WebApplication.CreateBuilder(args); + + builder.ConfigureJsonSerilog(ServiceName, appVersion); + Log.Information("Starting {Service} version {AppVersion}", ServiceName, appVersion); + + builder.AddAzureKeyVaultIfConfigured(); + + builder.Services.Configure(builder.Configuration.GetSection("PageFetcher")); + + builder.Services.AddDbContext(options => + { + var connectionString = builder.Services.GetConfiguredDbConnectionString(builder.Configuration); 
options.UseSqlServer(connectionString, sql => + { + sql.MigrationsHistoryTable(PageFetchDbContext.MigrationTableName, PageFetchDbContext.SchemaName); + sql.MigrationsAssembly("page-fetcher-data"); + }); + }); + + // Playwright browser: singleton hosted service, shared across all requests + builder.Services.AddSingleton(); + builder.Services.AddHostedService(sp => sp.GetRequiredService()); + + builder.Services.AddScoped(); + + builder.Services.AddControllers(); + builder.Services.AddSwaggerWithXmlComments(Assembly.GetExecutingAssembly(), "Page Fetcher API"); + + var app = builder.Build(); + + app.LogStartupDiagnostics(ServiceName); + + app.UseDefaultSerilogRequestLogging(); + app.UseJsonExceptionHandler(ServiceName); + app.UseInternalApiKeyProtection(); + app.UseSwaggerInDevelopment("Page Fetcher API", "PageFetcherAPI"); + + app.UseRouting(); + app.UseAuthorization(); + app.MapControllers(); + + Log.Information("Running EF Core migrations if any"); + using (var scope = app.Services.CreateScope()) + { + var db = scope.ServiceProvider.GetRequiredService(); + db.Database.Migrate(); + } + + Log.Information("{Service} startup complete. 
Listening for requests...", ServiceName); + app.Run(); +} +catch (Exception ex) +{ + Log.Fatal(ex, "{Service} terminated unexpectedly", ServiceName); +} +finally +{ + Log.Information("Shutting down {Service}", ServiceName); + Log.CloseAndFlush(); +} diff --git a/Apis/page-fetcher-api/Properties/launchSettings.json b/Apis/page-fetcher-api/Properties/launchSettings.json new file mode 100644 index 0000000..c9995ec --- /dev/null +++ b/Apis/page-fetcher-api/Properties/launchSettings.json @@ -0,0 +1,12 @@ +{ + "profiles": { + "page-fetcher-api": { + "commandName": "Project", + "launchBrowser": true, + "environmentVariables": { + "ASPNETCORE_ENVIRONMENT": "Development" + }, + "applicationUrl": "https://localhost:50268;http://localhost:50269" + } + } +} \ No newline at end of file diff --git a/Apis/page-fetcher-api/Services/PageFetcherService.cs b/Apis/page-fetcher-api/Services/PageFetcherService.cs new file mode 100644 index 0000000..7dae58b --- /dev/null +++ b/Apis/page-fetcher-api/Services/PageFetcherService.cs @@ -0,0 +1,143 @@ +using System.Diagnostics; +using System.Net; +using System.Text.RegularExpressions; +using Microsoft.Extensions.Options; +using Microsoft.Playwright; +using PageFetcher.Data; +using PageFetcher.Data.Entities; +using PageFetcher.Models; + +namespace PageFetcherApi.Services; + +/// +/// Fetches a web page via Playwright, extracts plain text, persists the result to the database, +/// and returns a . 
+/// +public sealed class PageFetcherService +{ + private readonly PlaywrightBrowserService _browserService; + private readonly PageFetchDbContext _db; + private readonly PageFetcherSettings _settings; + private readonly ILogger _logger; + + public PageFetcherService( + PlaywrightBrowserService browserService, + PageFetchDbContext db, + IOptions settings, + ILogger logger) + { + _browserService = browserService; + _db = db; + _settings = settings.Value; + _logger = logger; + } + + /// + /// Fetches the page at using Playwright, saves the fetch record, + /// and returns the HTML and extracted text. + /// Returns a failed response (with = false) rather than throwing + /// on network or navigation errors. + /// + public async Task FetchAsync(FetchPageRequest request, CancellationToken ct) + { + var sw = Stopwatch.StartNew(); + string html = string.Empty; + string text = string.Empty; + int? statusCode = null; + bool success = false; + string? errorMessage = null; + string finalUrl = request.Url; + + try + { + var page = await _browserService.Browser.NewPageAsync(); + await using var _ = page.ConfigureAwait(false); + + var waitUntil = request.WaitFor?.ToLowerInvariant() switch + { + "load" => WaitUntilState.Load, + "domcontentloaded" => WaitUntilState.DOMContentLoaded, + _ => WaitUntilState.NetworkIdle + }; + + IResponse? response; + try + { + response = await page.GotoAsync(request.Url, new PageGotoOptions + { + WaitUntil = waitUntil, + Timeout = _settings.TimeoutSeconds * 1_000 + }); + } + catch (TimeoutException) + { + _logger.LogWarning("Playwright NetworkIdle timeout for {Url}, using partial content", request.Url); + response = null; + } + + statusCode = response?.Status; + finalUrl = page.Url; + html = await page.ContentAsync(); + text = ExtractText(html); + success = true; + + _logger.LogInformation("Fetched {Url} → HTTP {Status} | HTML {HtmlLen} chars | text {TextLen} chars | {DurationMs} ms", + request.Url, statusCode?.ToString() ?? 
"timeout", html.Length, text.Length, sw.ElapsedMilliseconds); + } + catch (Exception ex) + { + errorMessage = ex.Message; + _logger.LogError(ex, "Failed to fetch {Url}", request.Url); + } + finally + { + sw.Stop(); + } + + // Persist fetch record + var entity = new PageFetchEntity + { + Id = Guid.NewGuid().ToString("N"), + Url = request.Url, + CallerService = request.CallerService ?? string.Empty, + HttpStatusCode = statusCode, + Html = html, + Text = text, + DurationMs = sw.ElapsedMilliseconds, + Success = success, + ErrorMessage = errorMessage + }; + + _db.PageFetches.Add(entity); + await _db.SaveChangesAsync(ct); + + return new FetchPageResponse + { + Url = finalUrl, + StatusCode = statusCode ?? 0, + Html = html, + Text = text, + Success = success, + Error = errorMessage + }; + } + + /// + /// Strips script/style blocks and all HTML tags from raw HTML, normalises whitespace, + /// and truncates to . + /// + private string ExtractText(string html) + { + if (string.IsNullOrWhiteSpace(html)) return string.Empty; + + var text = html; + text = Regex.Replace(text, "", " ", RegexOptions.IgnoreCase); + text = Regex.Replace(text, "", " ", RegexOptions.IgnoreCase); + text = Regex.Replace(text, "<[^>]+>", " "); + text = WebUtility.HtmlDecode(text); + text = string.Join(' ', text.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries)).Trim(); + + var max = Math.Max(4_000, _settings.MaxTextChars); + return text.Length <= max ? text : text[..max]; + } +} diff --git a/Apis/page-fetcher-api/Services/PageFetcherSettings.cs b/Apis/page-fetcher-api/Services/PageFetcherSettings.cs new file mode 100644 index 0000000..00621bb --- /dev/null +++ b/Apis/page-fetcher-api/Services/PageFetcherSettings.cs @@ -0,0 +1,17 @@ +namespace PageFetcherApi.Services; + +/// +/// Runtime settings for the page-fetcher service. +/// Bound from the PageFetcher configuration section. 
+/// +public sealed class PageFetcherSettings +{ + /// Default Playwright wait condition (networkidle, load, domcontentloaded). NOTE(review): PageFetcherService in this change never reads this value; it falls back to networkidle directly. + public string DefaultWaitFor { get; set; } = "networkidle"; + + /// Page navigation timeout in seconds. + public int TimeoutSeconds { get; set; } = 30; + + /// Maximum characters stored/returned in the extracted text field; values below 4,000 are treated as 4,000. + public int MaxTextChars { get; set; } = 60_000; +} diff --git a/Apis/page-fetcher-api/Services/PlaywrightBrowserService.cs b/Apis/page-fetcher-api/Services/PlaywrightBrowserService.cs new file mode 100644 index 0000000..144dc73 --- /dev/null +++ b/Apis/page-fetcher-api/Services/PlaywrightBrowserService.cs @@ -0,0 +1,49 @@ +using Microsoft.Playwright; + +namespace PageFetcherApi.Services; + +/// +/// Singleton hosted service that owns the Playwright Chromium browser process for the lifetime of the application. +/// Launches the browser once at startup and exposes it for injection into <see cref="PageFetcherService"/>. +/// +public sealed class PlaywrightBrowserService : IHostedService, IAsyncDisposable +{ + private IPlaywright? _playwright; + private IBrowser? _browser; + private readonly ILogger _logger; + + public PlaywrightBrowserService(ILogger logger) + { + _logger = logger; + } + + /// The running Chromium browser instance. Available after <see cref="StartAsync"/> completes. + public IBrowser Browser => _browser ?? 
throw new InvalidOperationException("Browser has not been started yet."); + + /// + public async Task StartAsync(CancellationToken cancellationToken) + { + _logger.LogInformation("Launching Playwright Chromium browser..."); + _playwright = await Playwright.CreateAsync(); + _browser = await _playwright.Chromium.LaunchAsync(new BrowserTypeLaunchOptions + { + Headless = true, + Args = ["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"] + }); + _logger.LogInformation("Playwright Chromium browser launched successfully."); + } + + /// + public async Task StopAsync(CancellationToken cancellationToken) + { + _logger.LogInformation("Closing Playwright Chromium browser..."); + if (_browser is not null) await _browser.CloseAsync(); + } + + /// + public async ValueTask DisposeAsync() + { + if (_browser is not null) await _browser.DisposeAsync(); + _playwright?.Dispose(); + } +} diff --git a/Apis/page-fetcher-api/appsettings.json b/Apis/page-fetcher-api/appsettings.json new file mode 100644 index 0000000..35a63ae --- /dev/null +++ b/Apis/page-fetcher-api/appsettings.json @@ -0,0 +1,73 @@ +{ + "Serilog": { + "Using": [ + "Serilog.Sinks.Console", + "Serilog.Sinks.File" + ], + "MinimumLevel": { + "Default": "Information", + "Override": { + "Microsoft.AspNetCore": "Warning", + "Microsoft.AspNetCore.Hosting": "Information", + "Microsoft.AspNetCore.Routing": "Warning", + "System.Net.Http.HttpClient": "Warning", + "PageFetcherApi": "Information" + } + }, + "WriteTo": [ + { + "Name": "Console", + "Args": { + "outputTemplate": "[{Timestamp:HH:mm:ss} {Level:u3}] {SourceContext}: {Message:lj}{NewLine}{Exception}" + } + }, + { + "Name": "File", + "Args": { + "path": "logs/page-fetcher-api-.log", + "rollingInterval": "Day", + "retainedFileCountLimit": 30, + "outputTemplate": "{Timestamp:yyyy-MM-dd HH:mm:ss.fff zzz} [{Level:u3}] {SourceContext}: {Message:lj}{NewLine}{Exception}" + } + } + ], + "Enrich": [ + "FromLogContext", + "WithMachineName", + "WithEnvironmentName" + 
] + }, + "Logging": { + "LogLevel": { + "Default": "Information", + "Microsoft.AspNetCore": "Warning", + "Microsoft.AspNetCore.Hosting": "Information", + "Microsoft.AspNetCore.Routing": "Warning", + "System.Net.Http.HttpClient": "Warning", + "PageFetcherApi": "Information" + } + }, + "LogEnvironmentOnStartup": true, + "AllowedHosts": "*", + "KeyVault": { + "VaultUri": "", + "Enabled": false + }, + "Database": { + "Host": "localhost", + "Port": 1433, + "Name": "MyAiDb", + "User": "sa", + "Password": "", + "TrustServerCertificate": true + }, + "InternalApi": { + "ApiKey": "", + "RequireApiKey": true + }, + "PageFetcher": { + "DefaultWaitFor": "networkidle", + "TimeoutSeconds": 30, + "MaxTextChars": 60000 + } +} diff --git a/Apis/page-fetcher-api/page-fetcher-api.csproj b/Apis/page-fetcher-api/page-fetcher-api.csproj new file mode 100644 index 0000000..40123f4 --- /dev/null +++ b/Apis/page-fetcher-api/page-fetcher-api.csproj @@ -0,0 +1,34 @@ + + + net10.0 + enable + enable + Linux + PageFetcherApi + true + $(NoWarn);1591 + + + + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + + + + + + + + + + + + + + + + diff --git a/Apis/page-fetcher-data/Data/Entities/PageFetchEntity.cs b/Apis/page-fetcher-data/Data/Entities/PageFetchEntity.cs new file mode 100644 index 0000000..96ef3ef --- /dev/null +++ b/Apis/page-fetcher-data/Data/Entities/PageFetchEntity.cs @@ -0,0 +1,34 @@ +using Shared.Data.Entities; + +namespace PageFetcher.Data.Entities; + +/// +/// Audit record of a single page-fetch operation performed by the page-fetcher-api. +/// Stores the full rendered HTML and extracted plain text for every URL fetched. +/// +public sealed class PageFetchEntity : BaseEntity +{ + /// The URL that was requested. + public string Url { get; set; } = string.Empty; + + /// Name of the service that requested the fetch (e.g. cv-matcher-api, cv-search-job). 
+ public string CallerService { get; set; } = string.Empty; + + /// HTTP status code returned by the remote server. null when no response was received (network failure or navigation timeout). + public int? HttpStatusCode { get; set; } + + /// Full rendered HTML as returned by Playwright. + public string Html { get; set; } = string.Empty; + + /// Plain text extracted from the HTML (script/style stripped, whitespace normalised). + public string Text { get; set; } = string.Empty; + + /// Playwright round-trip time in milliseconds. + public long DurationMs { get; set; } + + /// true when page content was captured, including partial content after a navigation timeout; false when the fetch failed with an error. + public bool Success { get; set; } + + /// Exception message when <see cref="Success"/> is false. + public string? ErrorMessage { get; set; } +} diff --git a/Apis/page-fetcher-data/MigrationConstants.cs b/Apis/page-fetcher-data/MigrationConstants.cs new file mode 100644 index 0000000..9f4d74b --- /dev/null +++ b/Apis/page-fetcher-data/MigrationConstants.cs @@ -0,0 +1,8 @@ +namespace PageFetcher.Data; + +/// Schema and migration-history table name constants for the pageFetcher EF schema. 
+public static class MigrationConstants +{ + public const string SchemaName = "pageFetcher"; + public const string MigrationTableName = "_Migrations"; +} diff --git a/Apis/page-fetcher-data/Migrations/20260608143523_InitialSchema.Designer.cs b/Apis/page-fetcher-data/Migrations/20260608143523_InitialSchema.Designer.cs new file mode 100644 index 0000000..a036246 --- /dev/null +++ b/Apis/page-fetcher-data/Migrations/20260608143523_InitialSchema.Designer.cs @@ -0,0 +1,82 @@ +// +using System; +using Microsoft.EntityFrameworkCore; +using Microsoft.EntityFrameworkCore.Infrastructure; +using Microsoft.EntityFrameworkCore.Metadata; +using Microsoft.EntityFrameworkCore.Migrations; +using Microsoft.EntityFrameworkCore.Storage.ValueConversion; +using PageFetcher.Data; + +#nullable disable + +namespace PageFetcher.Data.Migrations +{ + [DbContext(typeof(PageFetchDbContext))] + [Migration("20260608143523_InitialSchema")] + partial class InitialSchema + { + /// + protected override void BuildTargetModel(ModelBuilder modelBuilder) + { +#pragma warning disable 612, 618 + modelBuilder + .HasDefaultSchema("pageFetcher") + .HasAnnotation("ProductVersion", "10.0.7") + .HasAnnotation("Relational:MaxIdentifierLength", 128); + + SqlServerModelBuilderExtensions.UseIdentityColumns(modelBuilder); + + modelBuilder.Entity("PageFetcher.Data.Entities.PageFetchEntity", b => + { + b.Property("Id") + .HasMaxLength(64) + .HasColumnType("nvarchar(64)"); + + b.Property("CallerService") + .IsRequired() + .HasMaxLength(64) + .HasColumnType("nvarchar(64)"); + + b.Property("CreatedAt") + .ValueGeneratedOnAdd() + .HasColumnType("datetime2") + .HasDefaultValueSql("SYSUTCDATETIME()"); + + b.Property("DurationMs") + .HasColumnType("bigint"); + + b.Property("ErrorMessage") + .HasMaxLength(2000) + .HasColumnType("nvarchar(2000)"); + + b.Property("Html") + .IsRequired() + .HasColumnType("nvarchar(max)"); + + b.Property("HttpStatusCode") + .HasColumnType("int"); + + b.Property("Success") + .HasColumnType("bit"); 
+ + b.Property("Text") + .IsRequired() + .HasColumnType("nvarchar(max)"); + + b.Property("Url") + .IsRequired() + .HasMaxLength(2000) + .HasColumnType("nvarchar(2000)"); + + b.HasKey("Id"); + + b.HasIndex("CreatedAt"); + + b.HasIndex("Url"); + + b.ToTable("PageFetches", "pageFetcher"); + }); +#pragma warning restore 612, 618 + } + } +} diff --git a/Apis/page-fetcher-data/Migrations/20260608143523_InitialSchema.cs b/Apis/page-fetcher-data/Migrations/20260608143523_InitialSchema.cs new file mode 100644 index 0000000..7f23daa --- /dev/null +++ b/Apis/page-fetcher-data/Migrations/20260608143523_InitialSchema.cs @@ -0,0 +1,59 @@ +using System; +using Microsoft.EntityFrameworkCore.Migrations; + +#nullable disable + +namespace PageFetcher.Data.Migrations +{ + /// + public partial class InitialSchema : Migration + { + /// + protected override void Up(MigrationBuilder migrationBuilder) + { + migrationBuilder.EnsureSchema( + name: MigrationConstants.SchemaName); + + migrationBuilder.CreateTable( + name: "PageFetches", + schema: MigrationConstants.SchemaName, + columns: table => new + { + Id = table.Column(type: "nvarchar(64)", maxLength: 64, nullable: false), + Url = table.Column(type: "nvarchar(2000)", maxLength: 2000, nullable: false), + CallerService = table.Column(type: "nvarchar(64)", maxLength: 64, nullable: false), + HttpStatusCode = table.Column(type: "int", nullable: true), + Html = table.Column(type: "nvarchar(max)", nullable: false), + Text = table.Column(type: "nvarchar(max)", nullable: false), + DurationMs = table.Column(type: "bigint", nullable: false), + Success = table.Column(type: "bit", nullable: false), + ErrorMessage = table.Column(type: "nvarchar(2000)", maxLength: 2000, nullable: true), + CreatedAt = table.Column(type: "datetime2", nullable: false, defaultValueSql: "SYSUTCDATETIME()") + }, + constraints: table => + { + table.PrimaryKey("PK_PageFetches", x => x.Id); + }); + + migrationBuilder.CreateIndex( + name: "IX_PageFetches_CreatedAt", + schema: 
MigrationConstants.SchemaName, + table: "PageFetches", + column: "CreatedAt"); + + migrationBuilder.CreateIndex( + name: "IX_PageFetches_Url", + schema: MigrationConstants.SchemaName, + table: "PageFetches", + column: "Url"); + } + + /// + protected override void Down(MigrationBuilder migrationBuilder) + { + migrationBuilder.DropTable( + name: "PageFetches", + schema: MigrationConstants.SchemaName); + } + } +} diff --git a/Apis/page-fetcher-data/Migrations/PageFetchDbContextModelSnapshot.cs b/Apis/page-fetcher-data/Migrations/PageFetchDbContextModelSnapshot.cs new file mode 100644 index 0000000..dd72094 --- /dev/null +++ b/Apis/page-fetcher-data/Migrations/PageFetchDbContextModelSnapshot.cs @@ -0,0 +1,79 @@ +// +using System; +using Microsoft.EntityFrameworkCore; +using Microsoft.EntityFrameworkCore.Infrastructure; +using Microsoft.EntityFrameworkCore.Metadata; +using Microsoft.EntityFrameworkCore.Storage.ValueConversion; +using PageFetcher.Data; + +#nullable disable + +namespace PageFetcher.Data.Migrations +{ + [DbContext(typeof(PageFetchDbContext))] + partial class PageFetchDbContextModelSnapshot : ModelSnapshot + { + protected override void BuildModel(ModelBuilder modelBuilder) + { +#pragma warning disable 612, 618 + modelBuilder + .HasDefaultSchema("pageFetcher") + .HasAnnotation("ProductVersion", "10.0.7") + .HasAnnotation("Relational:MaxIdentifierLength", 128); + + SqlServerModelBuilderExtensions.UseIdentityColumns(modelBuilder); + + modelBuilder.Entity("PageFetcher.Data.Entities.PageFetchEntity", b => + { + b.Property("Id") + .HasMaxLength(64) + .HasColumnType("nvarchar(64)"); + + b.Property("CallerService") + .IsRequired() + .HasMaxLength(64) + .HasColumnType("nvarchar(64)"); + + b.Property("CreatedAt") + .ValueGeneratedOnAdd() + .HasColumnType("datetime2") + .HasDefaultValueSql("SYSUTCDATETIME()"); + + b.Property("DurationMs") + .HasColumnType("bigint"); + + b.Property("ErrorMessage") + .HasMaxLength(2000) + .HasColumnType("nvarchar(2000)"); + + 
b.Property("Html") + .IsRequired() + .HasColumnType("nvarchar(max)"); + + b.Property("HttpStatusCode") + .HasColumnType("int"); + + b.Property("Success") + .HasColumnType("bit"); + + b.Property("Text") + .IsRequired() + .HasColumnType("nvarchar(max)"); + + b.Property("Url") + .IsRequired() + .HasMaxLength(2000) + .HasColumnType("nvarchar(2000)"); + + b.HasKey("Id"); + + b.HasIndex("CreatedAt"); + + b.HasIndex("Url"); + + b.ToTable("PageFetches", "pageFetcher"); + }); +#pragma warning restore 612, 618 + } + } +} diff --git a/Apis/page-fetcher-data/PageFetchDbContext.cs b/Apis/page-fetcher-data/PageFetchDbContext.cs new file mode 100644 index 0000000..5f9538f --- /dev/null +++ b/Apis/page-fetcher-data/PageFetchDbContext.cs @@ -0,0 +1,45 @@ +using Microsoft.EntityFrameworkCore; +using PageFetcher.Data.Entities; + +namespace PageFetcher.Data; + +/// +/// EF Core DbContext for the pageFetcher schema. +/// Owns the PageFetches audit table. +/// +public sealed class PageFetchDbContext : DbContext +{ + public const string SchemaName = MigrationConstants.SchemaName; + public const string MigrationTableName = MigrationConstants.MigrationTableName; + + public PageFetchDbContext(DbContextOptions options) : base(options) { } + + public DbSet PageFetches => Set(); + + protected override void OnConfiguring(DbContextOptionsBuilder optionsBuilder) + { + base.OnConfiguring(optionsBuilder); + optionsBuilder.UseSqlServer(x => x.MigrationsHistoryTable(MigrationTableName, SchemaName)); + } + + protected override void OnModelCreating(ModelBuilder modelBuilder) + { + modelBuilder.HasDefaultSchema(SchemaName); + + modelBuilder.Entity(entity => + { + entity.ToTable("PageFetches"); + entity.HasKey(x => x.Id); + entity.Property(x => x.Id).HasMaxLength(64); + entity.Property(x => x.Url).HasMaxLength(2000).IsRequired(); + entity.Property(x => x.CallerService).HasMaxLength(64).IsRequired(); + entity.Property(x => x.Html).IsRequired(); + entity.Property(x => x.Text).IsRequired(); + 
entity.Property(x => x.ErrorMessage).HasMaxLength(2000); + entity.Property(x => x.CreatedAt).HasDefaultValueSql("SYSUTCDATETIME()"); + + entity.HasIndex(x => x.Url); + entity.HasIndex(x => x.CreatedAt); + }); + } +} diff --git a/Apis/page-fetcher-data/page-fetcher-data.csproj b/Apis/page-fetcher-data/page-fetcher-data.csproj new file mode 100644 index 0000000..2bf865a --- /dev/null +++ b/Apis/page-fetcher-data/page-fetcher-data.csproj @@ -0,0 +1,19 @@ + + + net10.0 + enable + enable + page-fetcher-data + PageFetcher.Data + + + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + + + + diff --git a/Jobs/cv-search-job/Program.cs b/Jobs/cv-search-job/Program.cs index 2e593e6..d555d63 100644 --- a/Jobs/cv-search-job/Program.cs +++ b/Jobs/cv-search-job/Program.cs @@ -14,6 +14,7 @@ using JobScheduler.Tasks; using Microsoft.EntityFrameworkCore; using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Hosting; +using PageFetcher.Models; using Refit; using Serilog; using Common.Settings; @@ -81,7 +82,19 @@ try client.DefaultRequestHeaders.Add("X-Internal-Api-Key", key); }); - builder.Services.AddHttpClient(); + builder.Services.AddRefitClient() + .ConfigureHttpClient((sp, client) => + { + var config = sp.GetRequiredService(); + var baseUrl = config["PageFetcherApi:BaseUrl"] ?? 
string.Empty; + if (!string.IsNullOrWhiteSpace(baseUrl)) + client.BaseAddress = new Uri(baseUrl.TrimEnd('/') + "/"); + var key = config["PageFetcherApi:InternalApiKey"]; + if (!string.IsNullOrWhiteSpace(key)) + client.DefaultRequestHeaders.Add("X-Internal-Api-Key", key); + }); + + builder.Services.AddSingleton(); builder.Services.AddSingleton(); builder.Services.AddSingleton(); diff --git a/Jobs/cv-search-job/Services/HtmlJobSearcher.cs b/Jobs/cv-search-job/Services/HtmlJobSearcher.cs index a4e40f6..b4f7e01 100644 --- a/Jobs/cv-search-job/Services/HtmlJobSearcher.cs +++ b/Jobs/cv-search-job/Services/HtmlJobSearcher.cs @@ -1,36 +1,39 @@ using System.Text.RegularExpressions; using System.Web; using CvMatcher.Models.Settings; -using Microsoft.Playwright; +using PageFetcher.Models; using Microsoft.Extensions.Logging; namespace CvSearchJob.Services; /// -/// Config-driven HTML scraper that fetches a provider's job listing page and extracts matching job URLs. -/// Uses a two-stage anchor filter: href must contain the provider's link pattern, and anchor text must -/// contain at least one CV keyword. -/// Supports both plain HTTP GET (default) and headless Chromium rendering for JS-heavy SPAs. +/// A URL and its anchor text as scraped from a job listing search-results page. +/// +public sealed record JobCandidate(string Url, string Title); + +/// +/// Config-driven HTML scraper that fetches a provider's job listing page via page-fetcher-api +/// and extracts matching job URL candidates. +/// Uses a two-stage anchor filter: href must contain the provider's link pattern, and (optionally) +/// anchor text must contain at least one CV keyword. 
/// public sealed class HtmlJobSearcher { - private readonly HttpClient _http; + private readonly IPageFetcherApiClient _pageFetcher; private readonly ILogger _logger; - public HtmlJobSearcher(HttpClient http, ILogger logger) + public HtmlJobSearcher(IPageFetcherApiClient pageFetcher, ILogger logger) { - _http = http; + _pageFetcher = pageFetcher; _logger = logger; - _http.Timeout = TimeSpan.FromSeconds(20); - _http.DefaultRequestHeaders.UserAgent.ParseAdd("Mozilla/5.0 (compatible; MyAi.ro CV-Search/1.0)"); } /// - /// Fetches the provider's search result page for the combined initial + CV keywords, parses all anchor - /// tags, applies the two-stage filter, and returns up to absolute URLs. - /// Returns an empty list when the HTTP request fails rather than throwing. + /// Fetches the provider's search result page, parses all anchor tags, applies the two-stage filter, + /// and returns up to candidates (URL + title). + /// Returns an empty list when the page fetch fails rather than throwing. /// - public async Task> SearchJobUrlsAsync( + public async Task> SearchJobUrlsAsync( JobProviderConfig provider, IReadOnlyList cvKeywords, string? location, @@ -61,24 +64,29 @@ public sealed class HtmlJobSearcher .Replace("{location-slug}", locationSlug); _logger.LogInformation( - "Provider {Provider}: fetching {Url} [{Mode}] | CV keywords: [{Keywords}] | Location: {Location}", + "Provider {Provider}: fetching {Url} | CV keywords: [{Keywords}] | Location: {Location}", provider.Name, searchUrl, - provider.UseHeadlessBrowser ? "headless" : "http", string.Join(", ", cvKeywords), location ?? "(none)"); - string? html; - if (provider.UseHeadlessBrowser) - html = await FetchWithPlaywrightAsync(provider.Name, searchUrl, ct); - else - html = await FetchWithHttpAsync(provider.Name, searchUrl, ct); + var fetchResponse = await _pageFetcher.FetchAsync(new FetchPageRequest + { + Url = searchUrl, + WaitFor = provider.UseHeadlessBrowser ? 
"networkidle" : "domcontentloaded", + CallerService = "cv-search-job" + }, ct); - if (html is null) return []; + if (!fetchResponse.Success || string.IsNullOrWhiteSpace(fetchResponse.Html)) + { + _logger.LogWarning("Provider {Provider}: page fetch failed — {Error}", provider.Name, fetchResponse.Error); + return []; + } + var html = fetchResponse.Html; _logger.LogInformation("Provider {Provider}: received {Length} chars of HTML", provider.Name, html.Length); var baseUri = new Uri(searchUrl); - var results = new List(); + var results = new List(); var seen = new HashSet(StringComparer.OrdinalIgnoreCase); var anchorPattern = new Regex(@"]+href=[""']([^""']+)[""'][^>]*>(.*?)", @@ -123,7 +131,7 @@ public sealed class HtmlJobSearcher var url = absoluteUri.GetLeftPart(UriPartial.Path); if (seen.Add(url)) - results.Add(url); + results.Add(new JobCandidate(url, anchorText)); } _logger.LogInformation( @@ -132,61 +140,4 @@ public sealed class HtmlJobSearcher return results; } - - private async Task FetchWithHttpAsync(string providerName, string url, CancellationToken ct) - { - try - { - return await _http.GetStringAsync(url, ct); - } - catch (Exception ex) - { - _logger.LogError(ex, "Provider {Provider}: HTTP fetch failed for {Url}", providerName, url); - return null; - } - } - - private async Task FetchWithPlaywrightAsync(string providerName, string url, CancellationToken ct) - { - try - { - using var playwright = await Playwright.CreateAsync(); - await using var browser = await playwright.Chromium.LaunchAsync(new BrowserTypeLaunchOptions - { - Headless = true, - Args = ["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"] - }); - - var page = await browser.NewPageAsync(); - - IResponse? 
response; - try - { - response = await page.GotoAsync(url, new PageGotoOptions - { - WaitUntil = WaitUntilState.NetworkIdle, - Timeout = 30_000 - }); - } - catch (TimeoutException) - { - // NetworkIdle timed out — use whatever content rendered so far - _logger.LogWarning("Provider {Provider}: Playwright NetworkIdle timeout for {Url}, using partial content", providerName, url); - return await page.ContentAsync(); - } - - if (response is null || response.Status >= 400) - { - _logger.LogWarning("Provider {Provider}: Playwright got HTTP {Status} for {Url}", providerName, response?.Status, url); - return null; - } - - return await page.ContentAsync(); - } - catch (Exception ex) - { - _logger.LogError(ex, "Provider {Provider}: Playwright fetch failed for {Url}", providerName, url); - return null; - } - } } diff --git a/Jobs/cv-search-job/Tasks/CvSearchJobTask.cs b/Jobs/cv-search-job/Tasks/CvSearchJobTask.cs index f867d1c..9c87383 100644 --- a/Jobs/cv-search-job/Tasks/CvSearchJobTask.cs +++ b/Jobs/cv-search-job/Tasks/CvSearchJobTask.cs @@ -11,6 +11,7 @@ using Microsoft.Extensions.Configuration; using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; +using PageFetcher.Models; namespace CvSearchJob.Tasks; @@ -24,6 +25,7 @@ public sealed class CvSearchJobTask : IJobTask private readonly JobSearchSettings _settings; private readonly HtmlJobSearcher _searcher; private readonly ICvMatcherInternalApi _matcherApi; + private readonly IPageFetcherApiClient _pageFetcher; private readonly CvSearchEmailSender _emailSender; private readonly ILogger _logger; @@ -34,6 +36,7 @@ public sealed class CvSearchJobTask : IJobTask IOptions settings, HtmlJobSearcher searcher, ICvMatcherInternalApi matcherApi, + IPageFetcherApiClient pageFetcher, CvSearchEmailSender emailSender, ILogger logger) { @@ -41,6 +44,7 @@ public sealed class CvSearchJobTask : IJobTask _settings = settings.Value; _searcher = searcher; _matcherApi = 
matcherApi; + _pageFetcher = pageFetcher; _emailSender = emailSender; _logger = logger; } @@ -126,7 +130,8 @@ public sealed class CvSearchJobTask : IJobTask /// /// Runs the full search pipeline for a session: scrapes all providers, deduplicates URLs, - /// scores each candidate via the matcher API, and persists results that meet the minimum score threshold. + /// fetches each individual job page via page-fetcher-api, applies a keyword pre-filter, + /// scores passing candidates via the matcher API, and persists results that meet the minimum score threshold. /// private async Task> RunSearchAsync( JobSearchSessionEntity session, @@ -138,30 +143,59 @@ public sealed class CvSearchJobTask : IJobTask if (cvKeywords.Count == 0) _logger.LogWarning("Session {SessionId}: keyword list is empty — scraper will rely on provider InitialKeywords only", session.Id); - var jobUrls = new HashSet(StringComparer.OrdinalIgnoreCase); + var jobCandidates = new Dictionary(StringComparer.OrdinalIgnoreCase); // url → title foreach (var provider in providers) { - var urls = await _searcher.SearchJobUrlsAsync(provider, cvKeywords, session.Location, ct); - _logger.LogInformation("Session {SessionId}: provider {Provider} returned {Count} URLs", session.Id, provider.Name, urls.Count); - foreach (var url in urls) jobUrls.Add(url); + var candidates = await _searcher.SearchJobUrlsAsync(provider, cvKeywords, session.Location, ct); + _logger.LogInformation("Session {SessionId}: provider {Provider} returned {Count} candidates", session.Id, provider.Name, candidates.Count); + foreach (var c in candidates) + jobCandidates.TryAdd(c.Url, c.Title); } - var candidates = jobUrls.Take(_settings.MaxJobsToMatch).ToList(); + var deduped = jobCandidates.Take(_settings.MaxJobsToMatch).ToList(); _logger.LogInformation( - "Session {SessionId}: {Total} unique URLs across all providers, scoring {Scoring} (cap={Cap})", - session.Id, jobUrls.Count, candidates.Count, _settings.MaxJobsToMatch); + "Session {SessionId}: 
{Total} unique URLs across all providers, processing up to {Cap}", + session.Id, jobCandidates.Count, deduped.Count); var results = new List(); - foreach (var url in candidates) + foreach (var (url, title) in deduped) { try { + // Fetch individual job page text via page-fetcher-api + var fetchResponse = await _pageFetcher.FetchAsync(new FetchPageRequest + { + Url = url, + WaitFor = "domcontentloaded", + CallerService = "cv-search-job" + }, ct); + + if (!fetchResponse.Success || string.IsNullOrWhiteSpace(fetchResponse.Text)) + { + _logger.LogWarning("Session {SessionId}: fetch failed for {Url} — {Error}", session.Id, url, fetchResponse.Error); + continue; + } + + var jobText = fetchResponse.Text; + + // Keyword pre-filter: skip LLM call if no CV keyword appears in the job page text + if (cvKeywords.Count > 0 && + !cvKeywords.Any(k => jobText.Contains(k, StringComparison.OrdinalIgnoreCase))) + { + _logger.LogInformation( + "Session {SessionId}: pre-filter skip | {Url} | no CV keyword found in job text", + session.Id, url); + continue; + } + var matchRequest = new MatchJobRequest { CvDocumentId = session.CvDocumentId, JobUrl = url, + // Pre-fetched text passed directly so cv-matcher-api skips re-fetching the page + JobDescription = jobText, // User already gave GDPR consent when they clicked the one-time job search link GdprConsent = true }; @@ -182,7 +216,7 @@ public sealed class CvSearchJobTask : IJobTask SessionId = session.Id, ProviderName = GuessProvider(url, providers), JobUrl = url, - JobTitle = matchResult.Summary.Split('.').FirstOrDefault()?.Trim() ?? "Job", + JobTitle = matchResult.Summary.Split('.').FirstOrDefault()?.Trim() ?? 
title, JobText = string.Empty, Score = matchResult.Score, ResultJson = JsonSerializer.Serialize(matchResult, new JsonSerializerOptions(JsonSerializerDefaults.Web)), diff --git a/Jobs/cv-search-job/cv-search-job.csproj b/Jobs/cv-search-job/cv-search-job.csproj index 8a94a55..2cefbb7 100644 --- a/Jobs/cv-search-job/cv-search-job.csproj +++ b/Jobs/cv-search-job/cv-search-job.csproj @@ -13,7 +13,6 @@ - @@ -26,6 +25,7 @@ + diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index b0e40f9..fb81454 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -72,6 +72,9 @@ services: - RagApi__BaseUrl=${RagApi__BaseUrl:-http://rag-api:8080} - RagApi__InternalApiKey=${RagApi__InternalApiKey:-} + - PageFetcherApi__BaseUrl=${PageFetcherApi__BaseUrl:-http://myai-page-fetcher-api:8080} + - PageFetcherApi__InternalApiKey=${PageFetcherApi__InternalApiKey:-} + - Ai__Provider=${Ai__Provider:-OpenAI} - Ai__OpenAI__ApiKey=${Ai__OpenAI__ApiKey:-} - Ai__OpenAI__ChatModel=${Ai__OpenAI__ChatModel:-gpt-4o-mini} @@ -266,6 +269,9 @@ services: - EmailApi__BaseUrl=${EmailApi__BaseUrl:-http://email-api:8080} - EmailApi__InternalApiKey=${EmailApi__InternalApiKey:-} + - PageFetcherApi__BaseUrl=${PageFetcherApi__BaseUrl:-http://myai-page-fetcher-api:8080} + - PageFetcherApi__InternalApiKey=${PageFetcherApi__InternalApiKey:-} + - FileStorage__Path=${FileStorage__Path:-Files} - JobSearch__Enabled=${JobSearch__Enabled:-true} @@ -293,6 +299,38 @@ services: labels: - "com.centurylinklabs.watchtower.enable=true" + page-fetcher-api: + image: registry.easysoft.ro/apps/myai-page-fetcher-api:${IMAGE_TAG:-staging} + container_name: myai-page-fetcher-api + environment: + - ASPNETCORE_ENVIRONMENT=${ASPNETCORE_ENVIRONMENT:-Staging} + - ASPNETCORE_URLS=${ASPNETCORE_URLS:-http://+:8080} + - APP_ENVIRONMENT_NAME=${APP_ENVIRONMENT_NAME:-myai.staging} + + - Database__Host=${Database__Host:-sqlserver} + - Database__Port=${Database__Port:-1433} + - 
Database__Name=${Database__Name:-MyAiDb} + - Database__User=${Database__User:-sa} + - Database__Password=${Database__Password:-} + - Database__TrustServerCertificate=${Database__TrustServerCertificate:-true} + + - InternalApi__ApiKey=${PageFetcherApi__InternalApiKey:-} + - InternalApi__RequireApiKey=true + + - SerilogEmail__From=${SerilogEmail__From:-} + - SerilogEmail__To=${SerilogEmail__To:-} + - SerilogEmail__Host=${SerilogEmail__Host:-} + - SerilogEmail__Port=${SerilogEmail__Port:-587} + - SerilogEmail__UserName=${SerilogEmail__UserName:-} + - SerilogEmail__Password=${SerilogEmail__Password:-} + volumes: + - ${LOGS_PATH:-/opt/myai/logs}/page-fetcher-api:/app/logs + networks: + - myai-network + restart: unless-stopped + labels: + - "com.centurylinklabs.watchtower.enable=true" + web: image: registry.easysoft.ro/apps/myai-web:${IMAGE_TAG:-staging} container_name: myai-web diff --git a/myAi.sln b/myAi.sln index 8ee6a22..c79c65f 100644 --- a/myAi.sln +++ b/myAi.sln @@ -63,6 +63,12 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "email-api", "Apis\email-api EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "email-data", "Apis\email-data\email-data.csproj", "{C1D2E3F4-A5B6-4789-CDEF-012345678ABC}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "page-fetcher-api-models", "Apis\page-fetcher-api-models\page-fetcher-api-models.csproj", "{4F1A669E-C8AF-428F-87E7-3E0A213DD20B}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "page-fetcher-data", "Apis\page-fetcher-data\page-fetcher-data.csproj", "{06F803CD-329D-40C2-B62D-0F14E137D3C7}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "page-fetcher-api", "Apis\page-fetcher-api\page-fetcher-api.csproj", "{FC5A722A-7B12-459E-AB9F-0A724797783E}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -357,6 +363,42 @@ Global {C1D2E3F4-A5B6-4789-CDEF-012345678ABC}.Release|x64.Build.0 = Release|Any CPU 
{C1D2E3F4-A5B6-4789-CDEF-012345678ABC}.Release|x86.ActiveCfg = Release|Any CPU {C1D2E3F4-A5B6-4789-CDEF-012345678ABC}.Release|x86.Build.0 = Release|Any CPU + {4F1A669E-C8AF-428F-87E7-3E0A213DD20B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {4F1A669E-C8AF-428F-87E7-3E0A213DD20B}.Debug|Any CPU.Build.0 = Debug|Any CPU + {4F1A669E-C8AF-428F-87E7-3E0A213DD20B}.Debug|x64.ActiveCfg = Debug|Any CPU + {4F1A669E-C8AF-428F-87E7-3E0A213DD20B}.Debug|x64.Build.0 = Debug|Any CPU + {4F1A669E-C8AF-428F-87E7-3E0A213DD20B}.Debug|x86.ActiveCfg = Debug|Any CPU + {4F1A669E-C8AF-428F-87E7-3E0A213DD20B}.Debug|x86.Build.0 = Debug|Any CPU + {4F1A669E-C8AF-428F-87E7-3E0A213DD20B}.Release|Any CPU.ActiveCfg = Release|Any CPU + {4F1A669E-C8AF-428F-87E7-3E0A213DD20B}.Release|Any CPU.Build.0 = Release|Any CPU + {4F1A669E-C8AF-428F-87E7-3E0A213DD20B}.Release|x64.ActiveCfg = Release|Any CPU + {4F1A669E-C8AF-428F-87E7-3E0A213DD20B}.Release|x64.Build.0 = Release|Any CPU + {4F1A669E-C8AF-428F-87E7-3E0A213DD20B}.Release|x86.ActiveCfg = Release|Any CPU + {4F1A669E-C8AF-428F-87E7-3E0A213DD20B}.Release|x86.Build.0 = Release|Any CPU + {06F803CD-329D-40C2-B62D-0F14E137D3C7}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {06F803CD-329D-40C2-B62D-0F14E137D3C7}.Debug|Any CPU.Build.0 = Debug|Any CPU + {06F803CD-329D-40C2-B62D-0F14E137D3C7}.Debug|x64.ActiveCfg = Debug|Any CPU + {06F803CD-329D-40C2-B62D-0F14E137D3C7}.Debug|x64.Build.0 = Debug|Any CPU + {06F803CD-329D-40C2-B62D-0F14E137D3C7}.Debug|x86.ActiveCfg = Debug|Any CPU + {06F803CD-329D-40C2-B62D-0F14E137D3C7}.Debug|x86.Build.0 = Debug|Any CPU + {06F803CD-329D-40C2-B62D-0F14E137D3C7}.Release|Any CPU.ActiveCfg = Release|Any CPU + {06F803CD-329D-40C2-B62D-0F14E137D3C7}.Release|Any CPU.Build.0 = Release|Any CPU + {06F803CD-329D-40C2-B62D-0F14E137D3C7}.Release|x64.ActiveCfg = Release|Any CPU + {06F803CD-329D-40C2-B62D-0F14E137D3C7}.Release|x64.Build.0 = Release|Any CPU + {06F803CD-329D-40C2-B62D-0F14E137D3C7}.Release|x86.ActiveCfg = Release|Any CPU + 
{06F803CD-329D-40C2-B62D-0F14E137D3C7}.Release|x86.Build.0 = Release|Any CPU + {FC5A722A-7B12-459E-AB9F-0A724797783E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {FC5A722A-7B12-459E-AB9F-0A724797783E}.Debug|Any CPU.Build.0 = Debug|Any CPU + {FC5A722A-7B12-459E-AB9F-0A724797783E}.Debug|x64.ActiveCfg = Debug|Any CPU + {FC5A722A-7B12-459E-AB9F-0A724797783E}.Debug|x64.Build.0 = Debug|Any CPU + {FC5A722A-7B12-459E-AB9F-0A724797783E}.Debug|x86.ActiveCfg = Debug|Any CPU + {FC5A722A-7B12-459E-AB9F-0A724797783E}.Debug|x86.Build.0 = Debug|Any CPU + {FC5A722A-7B12-459E-AB9F-0A724797783E}.Release|Any CPU.ActiveCfg = Release|Any CPU + {FC5A722A-7B12-459E-AB9F-0A724797783E}.Release|Any CPU.Build.0 = Release|Any CPU + {FC5A722A-7B12-459E-AB9F-0A724797783E}.Release|x64.ActiveCfg = Release|Any CPU + {FC5A722A-7B12-459E-AB9F-0A724797783E}.Release|x64.Build.0 = Release|Any CPU + {FC5A722A-7B12-459E-AB9F-0A724797783E}.Release|x86.ActiveCfg = Release|Any CPU + {FC5A722A-7B12-459E-AB9F-0A724797783E}.Release|x86.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -385,6 +427,9 @@ Global {BE44B4EB-9AB9-4D81-A9BF-5CF2832BEEE5} = {A9B8C7D6-E5F4-4321-ABCD-FEDCBA987654} {434119EA-2FFC-4433-9B8E-1E6D94006413} = {0FE6558F-2157-47F2-A835-558416CE0E2B} {C1D2E3F4-A5B6-4789-CDEF-012345678ABC} = {D4E5F6A7-B8C9-4012-3456-789ABCDEF012} + {4F1A669E-C8AF-428F-87E7-3E0A213DD20B} = {0FE6558F-2157-47F2-A835-558416CE0E2B} + {06F803CD-329D-40C2-B62D-0F14E137D3C7} = {0FE6558F-2157-47F2-A835-558416CE0E2B} + {FC5A722A-7B12-459E-AB9F-0A724797783E} = {0FE6558F-2157-47F2-A835-558416CE0E2B} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {6246A67B-299E-4E64-8DBE-1A66771E7C67}