feat: add page-fetcher-api — centralised Playwright page fetcher

Introduces page-fetcher-api, a new internal ASP.NET Core service that
centralises all web-page fetching through a single Playwright (headless
Chromium) browser instance. All fetches are persisted to the pageFetcher
SQL schema for auditing.

New projects:
- Apis/page-fetcher-api-models: FetchPageRequest, FetchPageResponse, IPageFetcherApiClient
- Apis/page-fetcher-data: PageFetchDbContext, PageFetchEntity, InitialSchema migration (schema: pageFetcher)
- Apis/page-fetcher-api: PlaywrightBrowserService (singleton), PageFetcherService, PageController

Changes to existing services:
- cv-matcher-api: JobTextExtractor now calls IPageFetcherApiClient instead of HttpClient
- cv-search-job: HtmlJobSearcher uses IPageFetcherApiClient (removes inline Playwright);
  CvSearchJobTask fetches individual job pages and applies keyword pre-filter before
  LLM call; passes pre-fetched JobDescription to cv-matcher-api to skip re-fetch
- common: add PageFetcherApiSettings
- docker-compose.yml, build.yml: add new service + env vars for callers

Closes #43

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-06-08 17:43:56 +03:00
parent 1222a86eb7
commit 898dd09d50
31 changed files with 1121 additions and 110 deletions
@@ -0,0 +1,47 @@
using Microsoft.AspNetCore.Mvc;
using PageFetcher.Models;
using PageFetcherApi.Services;
using Swashbuckle.AspNetCore.Annotations;
namespace PageFetcherApi.Controllers;
/// <summary>
/// HTTP entry point for page fetching: validates the requested URL and hands the request to
/// <see cref="PageFetcherService"/>, which returns the rendered HTML and the extracted text.
/// </summary>
[ApiController]
[Route("api/page")]
public sealed class PageController : ControllerBase
{
    private readonly PageFetcherService _service;
    private readonly ILogger<PageController> _logger;

    public PageController(PageFetcherService service, ILogger<PageController> logger)
        => (_service, _logger) = (service, logger);

    /// <summary>
    /// Loads a web page in headless Chromium and returns its rendered HTML together with plain text.
    /// </summary>
    /// <param name="request">Fetch parameters; <c>Url</c> must be an absolute HTTP or HTTPS URL.</param>
    /// <param name="ct">Cancellation token for the HTTP request, forwarded to the service.</param>
    /// <returns>
    /// 200 with the <see cref="FetchPageResponse"/> produced by the service, or 400 when the URL is
    /// missing, not absolute, or uses a scheme other than HTTP/HTTPS.
    /// </returns>
    [HttpPost("fetch")]
    [SwaggerOperation(Summary = "Fetch a web page", Description = "Navigates to the given URL using Playwright, returns rendered HTML and stripped plain text.")]
    [SwaggerResponse(StatusCodes.Status200OK, "Page fetched successfully", typeof(FetchPageResponse))]
    [SwaggerResponse(StatusCodes.Status400BadRequest, "Invalid or non-HTTP(S) URL")]
    public async Task<ActionResult<FetchPageResponse>> Fetch([FromBody] FetchPageRequest request, CancellationToken ct)
    {
        // Guard 1: a URL must be supplied at all.
        if (string.IsNullOrWhiteSpace(request.Url))
        {
            return BadRequest(new { Error = "Url is required." });
        }

        // Guard 2: it must parse as an absolute URI and use a web scheme.
        if (!Uri.TryCreate(request.Url, UriKind.Absolute, out var target) || !IsWebScheme(target))
        {
            return BadRequest(new { Error = "Url must be an absolute HTTP or HTTPS URL." });
        }

        _logger.LogInformation(
            "Fetch request: {Url} | caller={Caller} | waitFor={WaitFor}",
            request.Url,
            request.CallerService,
            request.WaitFor);

        var fetched = await _service.FetchAsync(request, ct);
        return Ok(fetched);
    }

    /// <summary>True when <paramref name="candidate"/> uses the http or https scheme.</summary>
    private static bool IsWebScheme(Uri candidate)
        => candidate.Scheme == Uri.UriSchemeHttp || candidate.Scheme == Uri.UriSchemeHttps;
}
+50
View File
@@ -0,0 +1,50 @@
FROM mcr.microsoft.com/dotnet/sdk:10.0 AS build
WORKDIR /src

# Download the Playwright Chromium browser FIRST so this expensive layer (apt + npm + browser
# download) is cached independently of the application sources: it is only rebuilt when the
# base image or the pinned Playwright version changes, not on every code change.
# Node.js is only needed here to run npx — it is not copied to the final image.
# NOTE(review): the pinned playwright version presumably has to stay in sync with the
# Microsoft.Playwright NuGet version in Directory.Packages.props — confirm when bumping either.
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
RUN apt-get update && apt-get install -y --no-install-recommends nodejs npm \
    && npx --yes playwright@1.60.0 install chromium \
    && rm -rf /var/lib/apt/lists/*

# Copy only the project files first so `dotnet restore` is cached until a csproj or the
# central package versions change.
COPY Directory.Packages.props ./
COPY Apis/page-fetcher-api/page-fetcher-api.csproj Apis/page-fetcher-api/
COPY Apis/page-fetcher-data/page-fetcher-data.csproj Apis/page-fetcher-data/
COPY Apis/page-fetcher-api-models/page-fetcher-api-models.csproj Apis/page-fetcher-api-models/
COPY Apis/common/common.csproj Apis/common/
COPY Apis/shared-data/shared-data.csproj Apis/shared-data/
COPY Helpers/startup-helpers/startup-helpers.csproj Helpers/startup-helpers/
COPY Helpers/common-helpers/common-helpers.csproj Helpers/common-helpers/
RUN dotnet restore Apis/page-fetcher-api/page-fetcher-api.csproj

# Copy the sources and publish.
COPY Apis/page-fetcher-api/ Apis/page-fetcher-api/
COPY Apis/page-fetcher-data/ Apis/page-fetcher-data/
COPY Apis/page-fetcher-api-models/ Apis/page-fetcher-api-models/
COPY Apis/common/ Apis/common/
COPY Apis/shared-data/ Apis/shared-data/
COPY Helpers/startup-helpers/ Helpers/startup-helpers/
COPY Helpers/common-helpers/ Helpers/common-helpers/

# Declared just before its only use so changing it does not invalidate the layers above.
ARG BUILD_CONFIGURATION=Release
RUN dotnet publish Apis/page-fetcher-api/page-fetcher-api.csproj -c $BUILD_CONFIGURATION -o /app/publish /p:UseAppHost=false

FROM mcr.microsoft.com/dotnet/aspnet:10.0 AS final
WORKDIR /app

# System libraries required by headless Chromium.
# The package names below (e.g. libasound2t64) are the 64-bit time_t variants; they do not
# exist on Debian bookworm.
# NOTE(review): this assumes the aspnet:10.0 tag is Ubuntu 24.04-based — confirm if the base tag changes.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 \
    libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 \
    libgbm1 libasound2t64 libpango-1.0-0 libcairo2 libatspi2.0-0 \
    libwayland-client0 libx11-xcb1 libx11-6 libxcb1 libxext6 \
    && rm -rf /var/lib/apt/lists/*

# Copy the Playwright Chromium browser from the build stage; the env var tells Playwright
# where to find it at runtime.
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
COPY --from=build /ms-playwright /ms-playwright
COPY --from=build /app/publish .
ENTRYPOINT ["dotnet", "page-fetcher-api.dll"]
+74
View File
@@ -0,0 +1,74 @@
using System.Reflection;
using Microsoft.EntityFrameworkCore;
using PageFetcher.Data;
using PageFetcherApi.Services;
using Serilog;
using StartupHelpers;
// Entry point for page-fetcher-api: configures logging, configuration sources, EF Core,
// the shared Playwright browser and the HTTP pipeline, applies pending migrations, then runs.
StartupExtensions.LoadDotEnvFile();

const string ServiceName = "page-fetcher-api";
var appVersion = StartupExtensions.GetApplicationVersion(Assembly.GetExecutingAssembly());

try
{
    var builder = WebApplication.CreateBuilder(args);
    builder.ConfigureJsonSerilog(ServiceName, appVersion);
    Log.Information("Starting {Service} version {AppVersion}", ServiceName, appVersion);

    builder.AddAzureKeyVaultIfConfigured();
    builder.Services.Configure<PageFetcherSettings>(builder.Configuration.GetSection("PageFetcher"));

    builder.Services.AddDbContext<PageFetchDbContext>(options =>
    {
        var connectionString = builder.Services.GetConfiguredDbConnectionString(builder.Configuration);
        options.UseSqlServer(connectionString, sql =>
        {
            // Keep this service's migration history in its own schema/table.
            sql.MigrationsHistoryTable(PageFetchDbContext.MigrationTableName, PageFetchDbContext.SchemaName);
            sql.MigrationsAssembly("page-fetcher-data");
        });
    });

    // Playwright browser: singleton hosted service, shared across all requests.
    // Registered as a singleton first and then exposed as the hosted service so that the
    // instance started by the host is the same one injected into PageFetcherService.
    builder.Services.AddSingleton<PlaywrightBrowserService>();
    builder.Services.AddHostedService(sp => sp.GetRequiredService<PlaywrightBrowserService>());
    builder.Services.AddScoped<PageFetcherService>();

    builder.Services.AddControllers();
    builder.Services.AddSwaggerWithXmlComments(Assembly.GetExecutingAssembly(), "Page Fetcher API");

    var app = builder.Build();
    app.LogStartupDiagnostics(ServiceName);
    app.UseDefaultSerilogRequestLogging();
    app.UseJsonExceptionHandler(ServiceName);
    app.UseInternalApiKeyProtection();
    app.UseSwaggerInDevelopment("Page Fetcher API", "PageFetcherAPI");
    app.UseRouting();
    app.UseAuthorization();
    app.MapControllers();

    Log.Information("Running EF Core migrations if any");
    using (var scope = app.Services.CreateScope())
    {
        var db = scope.ServiceProvider.GetRequiredService<PageFetchDbContext>();
        db.Database.Migrate();
    }

    Log.Information("{Service} startup complete. Listening for requests...", ServiceName);
    app.Run();
}
// HostAbortedException is how EF Core design-time tooling stops the host after building it;
// it is not a crash, so it is left to propagate instead of being reported as fatal.
catch (Exception ex) when (ex is not Microsoft.Extensions.Hosting.HostAbortedException)
{
    Log.Fatal(ex, "{Service} terminated unexpectedly", ServiceName);

    // Surface the failure to the container runtime/orchestrator; without this the process
    // would exit with code 0 even though startup or the host crashed.
    Environment.ExitCode = 1;
}
finally
{
    Log.Information("Shutting down {Service}", ServiceName);
    Log.CloseAndFlush();
}
@@ -0,0 +1,12 @@
{
"profiles": {
"page-fetcher-api": {
"commandName": "Project",
"launchBrowser": true,
"environmentVariables": {
"ASPNETCORE_ENVIRONMENT": "Development"
},
"applicationUrl": "https://localhost:50268;http://localhost:50269"
}
}
}
@@ -0,0 +1,143 @@
using System.Diagnostics;
using System.Net;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Options;
using Microsoft.Playwright;
using PageFetcher.Data;
using PageFetcher.Data.Entities;
using PageFetcher.Models;
namespace PageFetcherApi.Services;
/// <summary>
/// Fetches a web page via Playwright, extracts plain text, persists a record of the fetch to the
/// database, and returns a <see cref="FetchPageResponse"/>.
/// </summary>
public sealed class PageFetcherService
{
    /// <summary>Lower bound applied to <see cref="PageFetcherSettings.MaxTextChars"/> when truncating extracted text.</summary>
    private const int MinTextChars = 4_000;

    // Hoisted so the patterns are parsed once instead of on every fetch.
    private static readonly Regex ScriptBlockRegex =
        new("<script[\\s\\S]*?</script>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private static readonly Regex StyleBlockRegex =
        new("<style[\\s\\S]*?</style>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private static readonly Regex TagRegex = new("<[^>]+>", RegexOptions.Compiled);

    private readonly PlaywrightBrowserService _browserService;
    private readonly PageFetchDbContext _db;
    private readonly PageFetcherSettings _settings;
    private readonly ILogger<PageFetcherService> _logger;

    public PageFetcherService(
        PlaywrightBrowserService browserService,
        PageFetchDbContext db,
        IOptions<PageFetcherSettings> settings,
        ILogger<PageFetcherService> logger)
    {
        _browserService = browserService;
        _db = db;
        _settings = settings.Value;
        _logger = logger;
    }

    /// <summary>
    /// Fetches the page addressed by <paramref name="request"/> using Playwright, saves the fetch record,
    /// and returns the HTML and extracted text.
    /// </summary>
    /// <param name="request">
    /// The fetch request. When its <c>WaitFor</c> is null or blank,
    /// <see cref="PageFetcherSettings.DefaultWaitFor"/> is used; unrecognised values fall back to network-idle.
    /// </param>
    /// <param name="ct">Cancellation token; applied to saving the fetch record.</param>
    /// <returns>
    /// The fetch outcome. Browser and navigation errors are reported as a response with
    /// <see cref="FetchPageResponse.Success"/> = false rather than thrown. A navigation timeout is treated as a
    /// success with whatever content has rendered so far and a status code of 0.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
    /// <remarks>Exceptions raised while saving the fetch record (including cancellation) are not caught here.</remarks>
    public async Task<FetchPageResponse> FetchAsync(FetchPageRequest request, CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(request);

        var sw = Stopwatch.StartNew();
        string html = string.Empty;
        string text = string.Empty;
        int? statusCode = null;
        bool success = false;
        string? errorMessage = null;
        string finalUrl = request.Url;

        try
        {
            var waitUntil = ResolveWaitUntil(request.WaitFor);
            var page = await _browserService.Browser.NewPageAsync();
            try
            {
                IResponse? response;
                try
                {
                    response = await page.GotoAsync(request.Url, new PageGotoOptions
                    {
                        WaitUntil = waitUntil,
                        Timeout = _settings.TimeoutSeconds * 1_000
                    });
                }
                catch (TimeoutException)
                {
                    // The wait condition was not reached in time; keep going and return what has rendered.
                    _logger.LogWarning("Playwright {WaitUntil} timeout for {Url}, using partial content",
                        waitUntil, request.Url);
                    response = null;
                }

                statusCode = response?.Status;
                finalUrl = page.Url;
                html = await page.ContentAsync();
                text = ExtractText(html);
                success = true;

                _logger.LogInformation("Fetched {Url} → HTTP {Status} | HTML {HtmlLen} chars | text {TextLen} chars | {DurationMs} ms",
                    request.Url, statusCode?.ToString() ?? "timeout", html.Length, text.Length, sw.ElapsedMilliseconds);
            }
            finally
            {
                // Always release the page, whether the fetch succeeded or failed.
                await ClosePageAsync(page, request.Url);
            }
        }
        catch (Exception ex)
        {
            errorMessage = ex.Message;
            _logger.LogError(ex, "Failed to fetch {Url}", request.Url);
        }
        finally
        {
            sw.Stop();
        }

        // Persist fetch record (written for failed fetches too).
        var entity = new PageFetchEntity
        {
            Id = Guid.NewGuid().ToString("N"),
            Url = request.Url,
            CallerService = request.CallerService ?? string.Empty,
            HttpStatusCode = statusCode,
            Html = html,
            Text = text,
            DurationMs = sw.ElapsedMilliseconds,
            Success = success,
            ErrorMessage = errorMessage
        };
        _db.PageFetches.Add(entity);
        await _db.SaveChangesAsync(ct);

        return new FetchPageResponse
        {
            Url = finalUrl,
            StatusCode = statusCode ?? 0,
            Html = html,
            Text = text,
            Success = success,
            Error = errorMessage
        };
    }

    /// <summary>
    /// Maps the requested wait condition to a Playwright <see cref="WaitUntilState"/>, using the configured
    /// default when the request does not specify one. Unrecognised names resolve to network-idle.
    /// </summary>
    private WaitUntilState ResolveWaitUntil(string? requested)
    {
        var effective = string.IsNullOrWhiteSpace(requested) ? _settings.DefaultWaitFor : requested;
        return effective?.ToLowerInvariant() switch
        {
            "load" => WaitUntilState.Load,
            "domcontentloaded" => WaitUntilState.DOMContentLoaded,
            _ => WaitUntilState.NetworkIdle
        };
    }

    /// <summary>
    /// Closes <paramref name="page"/>, logging instead of throwing so that a cleanup failure
    /// cannot overwrite the outcome of a fetch that has already completed.
    /// </summary>
    private async Task ClosePageAsync(IPage page, string url)
    {
        try
        {
            await page.CloseAsync();
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Failed to close Playwright page for {Url}", url);
        }
    }

    /// <summary>
    /// Strips script/style blocks and all HTML tags from raw HTML, decodes HTML entities, collapses
    /// whitespace runs to single spaces, and truncates the result to
    /// <see cref="PageFetcherSettings.MaxTextChars"/> characters (never fewer than 4,000).
    /// </summary>
    private string ExtractText(string html)
    {
        if (string.IsNullOrWhiteSpace(html)) return string.Empty;

        var text = ScriptBlockRegex.Replace(html, " ");
        text = StyleBlockRegex.Replace(text, " ");
        text = TagRegex.Replace(text, " ");
        text = WebUtility.HtmlDecode(text);
        text = string.Join(' ', text.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries)).Trim();

        var max = Math.Max(MinTextChars, _settings.MaxTextChars);
        if (text.Length <= max) return text;

        // Do not cut between the two halves of a surrogate pair: that would leave a lone
        // surrogate (invalid UTF-16) at the end of the stored/returned text.
        if (char.IsHighSurrogate(text[max - 1])) max--;
        return text[..max];
    }
}
@@ -0,0 +1,17 @@
namespace PageFetcherApi.Services;
/// <summary>
/// Tunable options for the page-fetcher service, populated from the
/// <c>PageFetcher</c> section of configuration.
/// </summary>
public sealed class PageFetcherSettings
{
    /// <summary>
    /// Upper bound, in characters, on the extracted text that is stored and returned.
    /// The text extractor treats values below 4,000 as 4,000.
    /// </summary>
    public int MaxTextChars { get; set; } = 60000;

    /// <summary>How long, in seconds, a page navigation may take before it times out.</summary>
    public int TimeoutSeconds { get; set; } = 30;

    /// <summary>
    /// Name of the default Playwright wait condition:
    /// <c>networkidle</c>, <c>load</c> or <c>domcontentloaded</c>.
    /// </summary>
    public string DefaultWaitFor { get; set; } = "networkidle";
}
@@ -0,0 +1,49 @@
using Microsoft.Playwright;
namespace PageFetcherApi.Services;
/// <summary>
/// Hosted singleton that launches one Playwright Chromium browser when the application starts and
/// holds it until shutdown, exposing it to consumers such as <see cref="PageFetcherService"/>.
/// </summary>
public sealed class PlaywrightBrowserService(ILogger<PlaywrightBrowserService> logger)
    : IHostedService, IAsyncDisposable
{
    private IPlaywright? _playwright;
    private IBrowser? _browser;

    /// <summary>
    /// The launched Chromium browser. Throws <see cref="InvalidOperationException"/> when read
    /// before <see cref="StartAsync"/> has finished launching it.
    /// </summary>
    public IBrowser Browser
    {
        get
        {
            if (_browser is null)
            {
                throw new InvalidOperationException("Browser has not been started yet.");
            }

            return _browser;
        }
    }

    /// <inheritdoc />
    public async Task StartAsync(CancellationToken cancellationToken)
    {
        logger.LogInformation("Launching Playwright Chromium browser...");

        var launchOptions = new BrowserTypeLaunchOptions
        {
            Headless = true,
            Args = ["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"]
        };

        _playwright = await Playwright.CreateAsync();
        _browser = await _playwright.Chromium.LaunchAsync(launchOptions);

        logger.LogInformation("Playwright Chromium browser launched successfully.");
    }

    /// <inheritdoc />
    public async Task StopAsync(CancellationToken cancellationToken)
    {
        logger.LogInformation("Closing Playwright Chromium browser...");

        if (_browser is { } running)
        {
            await running.CloseAsync();
        }
    }

    /// <inheritdoc />
    public async ValueTask DisposeAsync()
    {
        // Browser first, then the Playwright instance that created it.
        if (_browser is { } running)
        {
            await running.DisposeAsync();
        }

        if (_playwright is { } driver)
        {
            driver.Dispose();
        }
    }
}
+73
View File
@@ -0,0 +1,73 @@
{
"Serilog": {
"Using": [
"Serilog.Sinks.Console",
"Serilog.Sinks.File"
],
"MinimumLevel": {
"Default": "Information",
"Override": {
"Microsoft.AspNetCore": "Warning",
"Microsoft.AspNetCore.Hosting": "Information",
"Microsoft.AspNetCore.Routing": "Warning",
"System.Net.Http.HttpClient": "Warning",
"PageFetcherApi": "Information"
}
},
"WriteTo": [
{
"Name": "Console",
"Args": {
"outputTemplate": "[{Timestamp:HH:mm:ss} {Level:u3}] {SourceContext}: {Message:lj}{NewLine}{Exception}"
}
},
{
"Name": "File",
"Args": {
"path": "logs/page-fetcher-api-.log",
"rollingInterval": "Day",
"retainedFileCountLimit": 30,
"outputTemplate": "{Timestamp:yyyy-MM-dd HH:mm:ss.fff zzz} [{Level:u3}] {SourceContext}: {Message:lj}{NewLine}{Exception}"
}
}
],
"Enrich": [
"FromLogContext",
"WithMachineName",
"WithEnvironmentName"
]
},
"Logging": {
"LogLevel": {
"Default": "Information",
"Microsoft.AspNetCore": "Warning",
"Microsoft.AspNetCore.Hosting": "Information",
"Microsoft.AspNetCore.Routing": "Warning",
"System.Net.Http.HttpClient": "Warning",
"PageFetcherApi": "Information"
}
},
"LogEnvironmentOnStartup": true,
"AllowedHosts": "*",
"KeyVault": {
"VaultUri": "",
"Enabled": false
},
"Database": {
"Host": "localhost",
"Port": 1433,
"Name": "MyAiDb",
"User": "sa",
"Password": "",
"TrustServerCertificate": true
},
"InternalApi": {
"ApiKey": "",
"RequireApiKey": true
},
"PageFetcher": {
"DefaultWaitFor": "networkidle",
"TimeoutSeconds": 30,
"MaxTextChars": 60000
}
}
@@ -0,0 +1,34 @@
<!-- ASP.NET Core project for the page-fetcher-api service (Web SDK, net10.0). -->
<Project Sdk="Microsoft.NET.Sdk.Web">
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<DockerDefaultTargetOS>Linux</DockerDefaultTargetOS>
<RootNamespace>PageFetcherApi</RootNamespace>
<!-- Emit the XML documentation file. NOTE(review): presumably consumed by the
     AddSwaggerWithXmlComments call in Program.cs; confirm. -->
<GenerateDocumentationFile>true</GenerateDocumentationFile>
<!-- 1591 = "missing XML comment for publicly visible type or member"; suppressed so that
     enabling the documentation file does not warn on every undocumented public member. -->
<NoWarn>$(NoWarn);1591</NoWarn>
</PropertyGroup>
<!-- No Version attributes are given on the package references below.
     NOTE(review): versions are presumably supplied centrally by Directory.Packages.props
     (the file the Dockerfile copies before restore); confirm. -->
<ItemGroup>
<PackageReference Include="Microsoft.Playwright" />
<PackageReference Include="Microsoft.EntityFrameworkCore.SqlServer" />
<!-- PrivateAssets=all keeps this package from flowing to projects that reference this one. -->
<PackageReference Include="Microsoft.EntityFrameworkCore.Design">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
<PackageReference Include="Serilog.AspNetCore" />
<PackageReference Include="Serilog.Enrichers.Environment" />
<PackageReference Include="Serilog.Sinks.Console" />
<PackageReference Include="Serilog.Sinks.File" />
<!-- NOTE(review): the appsettings.json shipped with this service configures only the Console
     and File sinks; confirm the Email sink reference is still needed. -->
<PackageReference Include="Serilog.Sinks.Email" />
<PackageReference Include="Swashbuckle.AspNetCore" />
<PackageReference Include="Swashbuckle.AspNetCore.Annotations" />
<PackageReference Include="DotNetEnv" />
</ItemGroup>
<!-- Sibling projects: data layer, API models, and shared/common helpers. -->
<ItemGroup>
<ProjectReference Include="..\page-fetcher-data\page-fetcher-data.csproj" />
<ProjectReference Include="..\page-fetcher-api-models\page-fetcher-api-models.csproj" />
<ProjectReference Include="..\common\common.csproj" />
<ProjectReference Include="..\..\Helpers\startup-helpers\startup-helpers.csproj" />
<ProjectReference Include="..\..\Helpers\common-helpers\common-helpers.csproj" />
</ItemGroup>
</Project>