feat: add page-fetcher-api — centralised Playwright page fetcher

Introduces page-fetcher-api, a new internal ASP.NET Core service that
centralises all web-page fetching through a single Playwright (headless
Chromium) browser instance. All fetches are persisted to the pageFetcher
SQL schema for auditing.

New projects:
- Apis/page-fetcher-api-models: FetchPageRequest, FetchPageResponse, IPageFetcherApiClient
- Apis/page-fetcher-data: PageFetchDbContext, PageFetchEntity, InitialSchema migration (schema: pageFetcher)
- Apis/page-fetcher-api: PlaywrightBrowserService (singleton), PageFetcherService, PageController

Changes to existing services:
- cv-matcher-api: JobTextExtractor now calls IPageFetcherApiClient instead of HttpClient
- cv-search-job: HtmlJobSearcher uses IPageFetcherApiClient (removes inline Playwright);
  CvSearchJobTask fetches individual job pages and applies keyword pre-filter before
  LLM call; passes pre-fetched JobDescription to cv-matcher-api to skip re-fetch
- common: add PageFetcherApiSettings
- docker-compose.yml, build.yml: add new service + env vars for callers

Closes #43

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-06-08 17:43:56 +03:00
parent 1222a86eb7
commit 898dd09d50
31 changed files with 1121 additions and 110 deletions
@@ -0,0 +1,11 @@
namespace Common.Settings;
/// <summary>
/// Connection settings for the internal page-fetcher-api service.
/// Bound from the <c>PageFetcherApi</c> configuration section.
/// </summary>
public sealed class PageFetcherApiSettings
{
public string BaseUrl { get; set; } = string.Empty;
public string InternalApiKey { get; set; } = string.Empty;
}
+12 -1
View File
@@ -13,6 +13,7 @@ using Microsoft.EntityFrameworkCore;
using Refit;
using Serilog;
using Common.Settings;
using PageFetcher.Models;
using StartupHelpers;
using System.Reflection;
@@ -36,6 +37,16 @@ try
builder.Services.Configure<CvMatcher.Models.Settings.AiSettings>(builder.Configuration.GetSection("Ai"));
builder.Services.Configure<MatcherSettings>(builder.Configuration.GetSection("Matcher"));
builder.Services.Configure<JobSearchSettings>(builder.Configuration.GetSection("JobSearch"));
builder.Services.Configure<PageFetcherApiSettings>(builder.Configuration.GetSection("PageFetcherApi"));
builder.Services.AddRefitClient<IPageFetcherApiClient>()
.ConfigureHttpClient((sp, c) =>
{
var settings = sp.GetRequiredService<Microsoft.Extensions.Options.IOptions<PageFetcherApiSettings>>().Value;
c.BaseAddress = new Uri(settings.BaseUrl.TrimEnd('/') + "/");
if (!string.IsNullOrWhiteSpace(settings.InternalApiKey))
c.DefaultRequestHeaders.Add("X-Internal-Api-Key", settings.InternalApiKey);
});
builder.Services.AddRefitClient<IRefitRagApi>()
.ConfigureHttpClient((sp, c) =>
@@ -50,7 +61,7 @@ try
builder.Services.AddScoped<IRagApiClient, RagApiClient>();
builder.Services.AddHttpClient<IMatcherAiClient, MatcherAiClient>();
builder.Services.AddHttpClient<IJobTextExtractor, JobTextExtractor>();
builder.Services.AddScoped<IJobTextExtractor, JobTextExtractor>();
builder.Services.AddDbContext<CvMatcherDbContext>(options =>
{
@@ -1,26 +1,23 @@
using System.Net;
using System.Text.RegularExpressions;
using CvMatcher.Models.Settings;
using Api.Services.Contracts;
using Microsoft.Extensions.Options;
using PageFetcher.Models;
namespace Api.Services;
/// <summary>
/// Extracts normalised plain text from a job posting, either from a pasted description or by
/// fetching and stripping the HTML of the job page URL.
/// fetching the job page text via <c>page-fetcher-api</c> (headless Chromium rendering).
/// </summary>
public sealed class JobTextExtractor : IJobTextExtractor
{
private readonly HttpClient _http;
private readonly IPageFetcherApiClient _pageFetcher;
private readonly MatcherSettings _settings;
public JobTextExtractor(HttpClient http, IOptions<MatcherSettings> options)
public JobTextExtractor(IPageFetcherApiClient pageFetcher, IOptions<MatcherSettings> options)
{
_http = http;
_pageFetcher = pageFetcher;
_settings = options.Value;
_http.Timeout = TimeSpan.FromSeconds(25);
_http.DefaultRequestHeaders.UserAgent.ParseAdd("MyAi.ro CV Matcher/1.0");
}
/// <inheritdoc />
@@ -31,15 +28,18 @@ public sealed class JobTextExtractor : IJobTextExtractor
if (string.IsNullOrWhiteSpace(jobUrl)) return string.Empty;
if (!Uri.TryCreate(jobUrl, UriKind.Absolute, out var uri) || uri.Scheme is not ("http" or "https"))
{
throw new InvalidOperationException("Invalid job URL.");
}
var html = await _http.GetStringAsync(uri, ct);
html = Regex.Replace(html, "<script[\\s\\S]*?</script>", " ", RegexOptions.IgnoreCase);
html = Regex.Replace(html, "<style[\\s\\S]*?</style>", " ", RegexOptions.IgnoreCase);
html = Regex.Replace(html, "<[^>]+>", " ");
return Limit(Normalize(WebUtility.HtmlDecode(html)));
var response = await _pageFetcher.FetchAsync(new FetchPageRequest
{
Url = jobUrl,
CallerService = "cv-matcher-api"
}, ct);
if (!response.Success)
throw new InvalidOperationException($"Failed to fetch job page: {response.Error}");
return Limit(Normalize(response.Text));
}
/// <summary>Truncates text to the configured maximum character count.</summary>
@@ -82,6 +82,7 @@
<ProjectReference Include="..\cv-search-data\cv-search-data.csproj" />
<ProjectReference Include="..\cv-matcher-data\cv-matcher-data.csproj" />
<ProjectReference Include="..\common\common.csproj" />
<ProjectReference Include="..\page-fetcher-api-models\page-fetcher-api-models.csproj" />
<ProjectReference Include="..\..\Helpers\startup-helpers\startup-helpers.csproj" />
</ItemGroup>
</Project>
@@ -0,0 +1,20 @@
namespace PageFetcher.Models;
/// <summary>
/// Request to fetch a web page via the page-fetcher-api.
/// </summary>
public sealed class FetchPageRequest
{
/// <summary>Absolute HTTP or HTTPS URL to fetch.</summary>
public string Url { get; set; } = string.Empty;
/// <summary>
/// Playwright wait condition. Accepted values: <c>networkidle</c> (default), <c>domcontentloaded</c>, <c>load</c>.
/// </summary>
public string WaitFor { get; set; } = "networkidle";
/// <summary>
/// Identifies the calling service for audit purposes (e.g. <c>cv-matcher-api</c>, <c>cv-search-job</c>).
/// </summary>
public string CallerService { get; set; } = string.Empty;
}
@@ -0,0 +1,25 @@
namespace PageFetcher.Models;
/// <summary>
/// Result of a page fetch operation.
/// </summary>
public sealed class FetchPageResponse
{
/// <summary>Final URL after any redirects.</summary>
public string Url { get; set; } = string.Empty;
/// <summary>HTTP status code returned by the page. <c>0</c> on network failure.</summary>
public int StatusCode { get; set; }
/// <summary>Full rendered HTML as returned by Playwright.</summary>
public string Html { get; set; } = string.Empty;
/// <summary>Plain text extracted from the HTML (script/style stripped, whitespace normalised).</summary>
public string Text { get; set; } = string.Empty;
/// <summary>Whether the fetch succeeded. <c>false</c> on timeout or network error.</summary>
public bool Success { get; set; }
/// <summary>Exception message when <see cref="Success"/> is <c>false</c>.</summary>
public string? Error { get; set; }
}
@@ -0,0 +1,16 @@
using Refit;
namespace PageFetcher.Models;
/// <summary>
/// Refit client for the internal page-fetcher-api service.
/// All calls require the <c>X-Internal-Api-Key</c> header, configured at registration time.
/// </summary>
public interface IPageFetcherApiClient
{
/// <summary>
/// Fetches a web page via headless Chromium and returns the rendered HTML and extracted plain text.
/// </summary>
[Post("/api/page/fetch")]
Task<FetchPageResponse> FetchAsync([Body] FetchPageRequest request, CancellationToken ct = default);
}
@@ -0,0 +1,12 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<AssemblyName>page-fetcher-api-models</AssemblyName>
<RootNamespace>PageFetcher.Models</RootNamespace>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Refit.HttpClientFactory" />
</ItemGroup>
</Project>
@@ -0,0 +1,47 @@
using Microsoft.AspNetCore.Mvc;
using PageFetcher.Models;
using PageFetcherApi.Services;
using Swashbuckle.AspNetCore.Annotations;
namespace PageFetcherApi.Controllers;
/// <summary>
/// Handles page-fetch requests: navigates to the URL via Playwright and returns rendered HTML and extracted text.
/// </summary>
[ApiController]
[Route("api/page")]
public sealed class PageController : ControllerBase
{
private readonly PageFetcherService _service;
private readonly ILogger<PageController> _logger;
public PageController(PageFetcherService service, ILogger<PageController> logger)
{
_service = service;
_logger = logger;
}
/// <summary>
/// Fetches a web page via headless Chromium.
/// Returns rendered HTML and extracted plain text.
/// </summary>
[HttpPost("fetch")]
[SwaggerOperation(Summary = "Fetch a web page", Description = "Navigates to the given URL using Playwright, returns rendered HTML and stripped plain text.")]
[SwaggerResponse(StatusCodes.Status200OK, "Page fetched successfully", typeof(FetchPageResponse))]
[SwaggerResponse(StatusCodes.Status400BadRequest, "Invalid or non-HTTP(S) URL")]
public async Task<ActionResult<FetchPageResponse>> Fetch([FromBody] FetchPageRequest request, CancellationToken ct)
{
if (string.IsNullOrWhiteSpace(request.Url))
return BadRequest(new { Error = "Url is required." });
if (!Uri.TryCreate(request.Url, UriKind.Absolute, out var uri) ||
(uri.Scheme != Uri.UriSchemeHttp && uri.Scheme != Uri.UriSchemeHttps))
return BadRequest(new { Error = "Url must be an absolute HTTP or HTTPS URL." });
_logger.LogInformation("Fetch request: {Url} | caller={Caller} | waitFor={WaitFor}",
request.Url, request.CallerService, request.WaitFor);
var result = await _service.FetchAsync(request, ct);
return Ok(result);
}
}
+50
View File
@@ -0,0 +1,50 @@
FROM mcr.microsoft.com/dotnet/sdk:10.0 AS build
ARG BUILD_CONFIGURATION=Release
WORKDIR /src
COPY Directory.Packages.props ./
COPY Apis/page-fetcher-api/page-fetcher-api.csproj Apis/page-fetcher-api/
COPY Apis/page-fetcher-data/page-fetcher-data.csproj Apis/page-fetcher-data/
COPY Apis/page-fetcher-api-models/page-fetcher-api-models.csproj Apis/page-fetcher-api-models/
COPY Apis/common/common.csproj Apis/common/
COPY Apis/shared-data/shared-data.csproj Apis/shared-data/
COPY Helpers/startup-helpers/startup-helpers.csproj Helpers/startup-helpers/
COPY Helpers/common-helpers/common-helpers.csproj Helpers/common-helpers/
RUN dotnet restore Apis/page-fetcher-api/page-fetcher-api.csproj
COPY Apis/page-fetcher-api/ Apis/page-fetcher-api/
COPY Apis/page-fetcher-data/ Apis/page-fetcher-data/
COPY Apis/page-fetcher-api-models/ Apis/page-fetcher-api-models/
COPY Apis/common/ Apis/common/
COPY Apis/shared-data/ Apis/shared-data/
COPY Helpers/startup-helpers/ Helpers/startup-helpers/
COPY Helpers/common-helpers/ Helpers/common-helpers/
RUN dotnet publish Apis/page-fetcher-api/page-fetcher-api.csproj -c $BUILD_CONFIGURATION -o /app/publish /p:UseAppHost=false
# Download Playwright Chromium browser in the build stage.
# Node.js is only needed here to run npx — it is not copied to the final image.
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
RUN apt-get update && apt-get install -y --no-install-recommends nodejs npm \
&& npx --yes playwright@1.60.0 install chromium \
&& rm -rf /var/lib/apt/lists/*
FROM mcr.microsoft.com/dotnet/aspnet:10.0 AS final
WORKDIR /app
# System libraries required by Chromium on Debian bookworm
RUN apt-get update && apt-get install -y --no-install-recommends \
libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 \
libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 \
libgbm1 libasound2t64 libpango-1.0-0 libcairo2 libatspi2.0-0 \
libwayland-client0 libx11-xcb1 libx11-6 libxcb1 libxext6 \
&& rm -rf /var/lib/apt/lists/*
# Copy the Playwright Chromium browser from the build stage
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
COPY --from=build /ms-playwright /ms-playwright
COPY --from=build /app/publish .
ENTRYPOINT ["dotnet", "page-fetcher-api.dll"]
+74
View File
@@ -0,0 +1,74 @@
using System.Reflection;
using Microsoft.EntityFrameworkCore;
using PageFetcher.Data;
using PageFetcherApi.Services;
using Serilog;
using StartupHelpers;
StartupExtensions.LoadDotEnvFile();
const string ServiceName = "page-fetcher-api";
var appVersion = StartupExtensions.GetApplicationVersion(Assembly.GetExecutingAssembly());
try
{
var builder = WebApplication.CreateBuilder(args);
builder.ConfigureJsonSerilog(ServiceName, appVersion);
Log.Information("Starting {Service} version {AppVersion}", ServiceName, appVersion);
builder.AddAzureKeyVaultIfConfigured();
builder.Services.Configure<PageFetcherSettings>(builder.Configuration.GetSection("PageFetcher"));
builder.Services.AddDbContext<PageFetchDbContext>(options =>
{
var connectionString = builder.Services.GetConfiguredDbConnectionString(builder.Configuration);
options.UseSqlServer(connectionString, sql =>
{
sql.MigrationsHistoryTable(PageFetchDbContext.MigrationTableName, PageFetchDbContext.SchemaName);
sql.MigrationsAssembly("page-fetcher-data");
});
});
// Playwright browser: singleton hosted service, shared across all requests
builder.Services.AddSingleton<PlaywrightBrowserService>();
builder.Services.AddHostedService(sp => sp.GetRequiredService<PlaywrightBrowserService>());
builder.Services.AddScoped<PageFetcherService>();
builder.Services.AddControllers();
builder.Services.AddSwaggerWithXmlComments(Assembly.GetExecutingAssembly(), "Page Fetcher API");
var app = builder.Build();
app.LogStartupDiagnostics(ServiceName);
app.UseDefaultSerilogRequestLogging();
app.UseJsonExceptionHandler(ServiceName);
app.UseInternalApiKeyProtection();
app.UseSwaggerInDevelopment("Page Fetcher API", "PageFetcherAPI");
app.UseRouting();
app.UseAuthorization();
app.MapControllers();
Log.Information("Running EF Core migrations if any");
using (var scope = app.Services.CreateScope())
{
var db = scope.ServiceProvider.GetRequiredService<PageFetchDbContext>();
db.Database.Migrate();
}
Log.Information("{Service} startup complete. Listening for requests...", ServiceName);
app.Run();
}
catch (Exception ex)
{
Log.Fatal(ex, "{Service} terminated unexpectedly", ServiceName);
}
finally
{
Log.Information("Shutting down {Service}", ServiceName);
Log.CloseAndFlush();
}
@@ -0,0 +1,12 @@
{
"profiles": {
"page-fetcher-api": {
"commandName": "Project",
"launchBrowser": true,
"environmentVariables": {
"ASPNETCORE_ENVIRONMENT": "Development"
},
"applicationUrl": "https://localhost:50268;http://localhost:50269"
}
}
}
@@ -0,0 +1,143 @@
using System.Diagnostics;
using System.Net;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Options;
using Microsoft.Playwright;
using PageFetcher.Data;
using PageFetcher.Data.Entities;
using PageFetcher.Models;
namespace PageFetcherApi.Services;
/// <summary>
/// Fetches a web page via Playwright, extracts plain text, persists the result to the database,
/// and returns a <see cref="FetchPageResponse"/>.
/// </summary>
public sealed class PageFetcherService
{
private readonly PlaywrightBrowserService _browserService;
private readonly PageFetchDbContext _db;
private readonly PageFetcherSettings _settings;
private readonly ILogger<PageFetcherService> _logger;
public PageFetcherService(
PlaywrightBrowserService browserService,
PageFetchDbContext db,
IOptions<PageFetcherSettings> settings,
ILogger<PageFetcherService> logger)
{
_browserService = browserService;
_db = db;
_settings = settings.Value;
_logger = logger;
}
/// <summary>
/// Fetches the page at <paramref name="request.Url"/> using Playwright, saves the fetch record,
/// and returns the HTML and extracted text.
/// Returns a failed response (with <see cref="FetchPageResponse.Success"/> = false) rather than throwing
/// on network or navigation errors.
/// </summary>
public async Task<FetchPageResponse> FetchAsync(FetchPageRequest request, CancellationToken ct)
{
var sw = Stopwatch.StartNew();
string html = string.Empty;
string text = string.Empty;
int? statusCode = null;
bool success = false;
string? errorMessage = null;
string finalUrl = request.Url;
try
{
var page = await _browserService.Browser.NewPageAsync();
await using var _ = page.ConfigureAwait(false);
var waitUntil = request.WaitFor?.ToLowerInvariant() switch
{
"load" => WaitUntilState.Load,
"domcontentloaded" => WaitUntilState.DOMContentLoaded,
_ => WaitUntilState.NetworkIdle
};
IResponse? response;
try
{
response = await page.GotoAsync(request.Url, new PageGotoOptions
{
WaitUntil = waitUntil,
Timeout = _settings.TimeoutSeconds * 1_000
});
}
catch (TimeoutException)
{
_logger.LogWarning("Playwright NetworkIdle timeout for {Url}, using partial content", request.Url);
response = null;
}
statusCode = response?.Status;
finalUrl = page.Url;
html = await page.ContentAsync();
text = ExtractText(html);
success = true;
_logger.LogInformation("Fetched {Url} → HTTP {Status} | HTML {HtmlLen} chars | text {TextLen} chars | {DurationMs} ms",
request.Url, statusCode?.ToString() ?? "timeout", html.Length, text.Length, sw.ElapsedMilliseconds);
}
catch (Exception ex)
{
errorMessage = ex.Message;
_logger.LogError(ex, "Failed to fetch {Url}", request.Url);
}
finally
{
sw.Stop();
}
// Persist fetch record
var entity = new PageFetchEntity
{
Id = Guid.NewGuid().ToString("N"),
Url = request.Url,
CallerService = request.CallerService ?? string.Empty,
HttpStatusCode = statusCode,
Html = html,
Text = text,
DurationMs = sw.ElapsedMilliseconds,
Success = success,
ErrorMessage = errorMessage
};
_db.PageFetches.Add(entity);
await _db.SaveChangesAsync(ct);
return new FetchPageResponse
{
Url = finalUrl,
StatusCode = statusCode ?? 0,
Html = html,
Text = text,
Success = success,
Error = errorMessage
};
}
/// <summary>
/// Strips script/style blocks and all HTML tags from raw HTML, normalises whitespace,
/// and truncates to <see cref="PageFetcherSettings.MaxTextChars"/>.
/// </summary>
private string ExtractText(string html)
{
if (string.IsNullOrWhiteSpace(html)) return string.Empty;
var text = html;
text = Regex.Replace(text, "<script[\\s\\S]*?</script>", " ", RegexOptions.IgnoreCase);
text = Regex.Replace(text, "<style[\\s\\S]*?</style>", " ", RegexOptions.IgnoreCase);
text = Regex.Replace(text, "<[^>]+>", " ");
text = WebUtility.HtmlDecode(text);
text = string.Join(' ', text.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries)).Trim();
var max = Math.Max(4_000, _settings.MaxTextChars);
return text.Length <= max ? text : text[..max];
}
}
@@ -0,0 +1,17 @@
namespace PageFetcherApi.Services;
/// <summary>
/// Runtime settings for the page-fetcher service.
/// Bound from the <c>PageFetcher</c> configuration section.
/// </summary>
public sealed class PageFetcherSettings
{
/// <summary>Default Playwright wait condition (<c>networkidle</c>, <c>load</c>, <c>domcontentloaded</c>).</summary>
public string DefaultWaitFor { get; set; } = "networkidle";
/// <summary>Page navigation timeout in seconds.</summary>
public int TimeoutSeconds { get; set; } = 30;
/// <summary>Maximum characters stored/returned in the extracted text field.</summary>
public int MaxTextChars { get; set; } = 60_000;
}
@@ -0,0 +1,49 @@
using Microsoft.Playwright;
namespace PageFetcherApi.Services;
/// <summary>
/// Singleton hosted service that owns the Playwright Chromium browser process for the lifetime of the application.
/// Launches the browser once at startup and exposes it for injection into <see cref="PageFetcherService"/>.
/// </summary>
public sealed class PlaywrightBrowserService : IHostedService, IAsyncDisposable
{
private IPlaywright? _playwright;
private IBrowser? _browser;
private readonly ILogger<PlaywrightBrowserService> _logger;
public PlaywrightBrowserService(ILogger<PlaywrightBrowserService> logger)
{
_logger = logger;
}
/// <summary>The running Chromium browser instance. Available after <see cref="StartAsync"/> completes.</summary>
public IBrowser Browser => _browser ?? throw new InvalidOperationException("Browser has not been started yet.");
/// <inheritdoc />
public async Task StartAsync(CancellationToken cancellationToken)
{
_logger.LogInformation("Launching Playwright Chromium browser...");
_playwright = await Playwright.CreateAsync();
_browser = await _playwright.Chromium.LaunchAsync(new BrowserTypeLaunchOptions
{
Headless = true,
Args = ["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"]
});
_logger.LogInformation("Playwright Chromium browser launched successfully.");
}
/// <inheritdoc />
public async Task StopAsync(CancellationToken cancellationToken)
{
_logger.LogInformation("Closing Playwright Chromium browser...");
if (_browser is not null) await _browser.CloseAsync();
}
/// <inheritdoc />
public async ValueTask DisposeAsync()
{
if (_browser is not null) await _browser.DisposeAsync();
_playwright?.Dispose();
}
}
+73
View File
@@ -0,0 +1,73 @@
{
"Serilog": {
"Using": [
"Serilog.Sinks.Console",
"Serilog.Sinks.File"
],
"MinimumLevel": {
"Default": "Information",
"Override": {
"Microsoft.AspNetCore": "Warning",
"Microsoft.AspNetCore.Hosting": "Information",
"Microsoft.AspNetCore.Routing": "Warning",
"System.Net.Http.HttpClient": "Warning",
"PageFetcherApi": "Information"
}
},
"WriteTo": [
{
"Name": "Console",
"Args": {
"outputTemplate": "[{Timestamp:HH:mm:ss} {Level:u3}] {SourceContext}: {Message:lj}{NewLine}{Exception}"
}
},
{
"Name": "File",
"Args": {
"path": "logs/page-fetcher-api-.log",
"rollingInterval": "Day",
"retainedFileCountLimit": 30,
"outputTemplate": "{Timestamp:yyyy-MM-dd HH:mm:ss.fff zzz} [{Level:u3}] {SourceContext}: {Message:lj}{NewLine}{Exception}"
}
}
],
"Enrich": [
"FromLogContext",
"WithMachineName",
"WithEnvironmentName"
]
},
"Logging": {
"LogLevel": {
"Default": "Information",
"Microsoft.AspNetCore": "Warning",
"Microsoft.AspNetCore.Hosting": "Information",
"Microsoft.AspNetCore.Routing": "Warning",
"System.Net.Http.HttpClient": "Warning",
"PageFetcherApi": "Information"
}
},
"LogEnvironmentOnStartup": true,
"AllowedHosts": "*",
"KeyVault": {
"VaultUri": "",
"Enabled": false
},
"Database": {
"Host": "localhost",
"Port": 1433,
"Name": "MyAiDb",
"User": "sa",
"Password": "",
"TrustServerCertificate": true
},
"InternalApi": {
"ApiKey": "",
"RequireApiKey": true
},
"PageFetcher": {
"DefaultWaitFor": "networkidle",
"TimeoutSeconds": 30,
"MaxTextChars": 60000
}
}
@@ -0,0 +1,34 @@
<Project Sdk="Microsoft.NET.Sdk.Web">
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<DockerDefaultTargetOS>Linux</DockerDefaultTargetOS>
<RootNamespace>PageFetcherApi</RootNamespace>
<GenerateDocumentationFile>true</GenerateDocumentationFile>
<NoWarn>$(NoWarn);1591</NoWarn>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.Playwright" />
<PackageReference Include="Microsoft.EntityFrameworkCore.SqlServer" />
<PackageReference Include="Microsoft.EntityFrameworkCore.Design">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
<PackageReference Include="Serilog.AspNetCore" />
<PackageReference Include="Serilog.Enrichers.Environment" />
<PackageReference Include="Serilog.Sinks.Console" />
<PackageReference Include="Serilog.Sinks.File" />
<PackageReference Include="Serilog.Sinks.Email" />
<PackageReference Include="Swashbuckle.AspNetCore" />
<PackageReference Include="Swashbuckle.AspNetCore.Annotations" />
<PackageReference Include="DotNetEnv" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\page-fetcher-data\page-fetcher-data.csproj" />
<ProjectReference Include="..\page-fetcher-api-models\page-fetcher-api-models.csproj" />
<ProjectReference Include="..\common\common.csproj" />
<ProjectReference Include="..\..\Helpers\startup-helpers\startup-helpers.csproj" />
<ProjectReference Include="..\..\Helpers\common-helpers\common-helpers.csproj" />
</ItemGroup>
</Project>
@@ -0,0 +1,34 @@
using Shared.Data.Entities;
namespace PageFetcher.Data.Entities;
/// <summary>
/// Audit record of a single page-fetch operation performed by the page-fetcher-api.
/// Stores the full rendered HTML and extracted plain text for every URL fetched.
/// </summary>
public sealed class PageFetchEntity : BaseEntity
{
/// <summary>The URL that was requested.</summary>
public string Url { get; set; } = string.Empty;
/// <summary>Name of the service that requested the fetch (e.g. <c>cv-matcher-api</c>, <c>cv-search-job</c>).</summary>
public string CallerService { get; set; } = string.Empty;
/// <summary>HTTP status code returned by the remote server. <c>null</c> on network failure.</summary>
public int? HttpStatusCode { get; set; }
/// <summary>Full rendered HTML as returned by Playwright.</summary>
public string Html { get; set; } = string.Empty;
/// <summary>Plain text extracted from the HTML (script/style stripped, whitespace normalised).</summary>
public string Text { get; set; } = string.Empty;
/// <summary>Playwright round-trip time in milliseconds.</summary>
public long DurationMs { get; set; }
/// <summary><c>true</c> when the page was fetched successfully; <c>false</c> on timeout or network error.</summary>
public bool Success { get; set; }
/// <summary>Exception message when <see cref="Success"/> is <c>false</c>.</summary>
public string? ErrorMessage { get; set; }
}
@@ -0,0 +1,8 @@
namespace PageFetcher.Data;
/// <summary>Schema and migration-history table name constants for the pageFetcher EF schema.</summary>
public static class MigrationConstants
{
public const string SchemaName = "pageFetcher";
public const string MigrationTableName = "_Migrations";
}
@@ -0,0 +1,82 @@
// <auto-generated />
using System;
using Microsoft.EntityFrameworkCore;
using Microsoft.EntityFrameworkCore.Infrastructure;
using Microsoft.EntityFrameworkCore.Metadata;
using Microsoft.EntityFrameworkCore.Migrations;
using Microsoft.EntityFrameworkCore.Storage.ValueConversion;
using PageFetcher.Data;
#nullable disable
namespace PageFetcher.Data.Migrations
{
[DbContext(typeof(PageFetchDbContext))]
[Migration("20260608143523_InitialSchema")]
partial class InitialSchema
{
/// <inheritdoc />
protected override void BuildTargetModel(ModelBuilder modelBuilder)
{
#pragma warning disable 612, 618
modelBuilder
.HasDefaultSchema("pageFetcher")
.HasAnnotation("ProductVersion", "10.0.7")
.HasAnnotation("Relational:MaxIdentifierLength", 128);
SqlServerModelBuilderExtensions.UseIdentityColumns(modelBuilder);
modelBuilder.Entity("PageFetcher.Data.Entities.PageFetchEntity", b =>
{
b.Property<string>("Id")
.HasMaxLength(64)
.HasColumnType("nvarchar(64)");
b.Property<string>("CallerService")
.IsRequired()
.HasMaxLength(64)
.HasColumnType("nvarchar(64)");
b.Property<DateTime>("CreatedAt")
.ValueGeneratedOnAdd()
.HasColumnType("datetime2")
.HasDefaultValueSql("SYSUTCDATETIME()");
b.Property<long>("DurationMs")
.HasColumnType("bigint");
b.Property<string>("ErrorMessage")
.HasMaxLength(2000)
.HasColumnType("nvarchar(2000)");
b.Property<string>("Html")
.IsRequired()
.HasColumnType("nvarchar(max)");
b.Property<int?>("HttpStatusCode")
.HasColumnType("int");
b.Property<bool>("Success")
.HasColumnType("bit");
b.Property<string>("Text")
.IsRequired()
.HasColumnType("nvarchar(max)");
b.Property<string>("Url")
.IsRequired()
.HasMaxLength(2000)
.HasColumnType("nvarchar(2000)");
b.HasKey("Id");
b.HasIndex("CreatedAt");
b.HasIndex("Url");
b.ToTable("PageFetches", "pageFetcher");
});
#pragma warning restore 612, 618
}
}
}
@@ -0,0 +1,59 @@
using System;
using Microsoft.EntityFrameworkCore.Migrations;
#nullable disable
namespace PageFetcher.Data.Migrations
{
/// <inheritdoc />
public partial class InitialSchema : Migration
{
/// <inheritdoc />
protected override void Up(MigrationBuilder migrationBuilder)
{
migrationBuilder.EnsureSchema(
name: MigrationConstants.SchemaName);
migrationBuilder.CreateTable(
name: "PageFetches",
schema: MigrationConstants.SchemaName,
columns: table => new
{
Id = table.Column<string>(type: "nvarchar(64)", maxLength: 64, nullable: false),
Url = table.Column<string>(type: "nvarchar(2000)", maxLength: 2000, nullable: false),
CallerService = table.Column<string>(type: "nvarchar(64)", maxLength: 64, nullable: false),
HttpStatusCode = table.Column<int>(type: "int", nullable: true),
Html = table.Column<string>(type: "nvarchar(max)", nullable: false),
Text = table.Column<string>(type: "nvarchar(max)", nullable: false),
DurationMs = table.Column<long>(type: "bigint", nullable: false),
Success = table.Column<bool>(type: "bit", nullable: false),
ErrorMessage = table.Column<string>(type: "nvarchar(2000)", maxLength: 2000, nullable: true),
CreatedAt = table.Column<DateTime>(type: "datetime2", nullable: false, defaultValueSql: "SYSUTCDATETIME()")
},
constraints: table =>
{
table.PrimaryKey("PK_PageFetches", x => x.Id);
});
migrationBuilder.CreateIndex(
name: "IX_PageFetches_CreatedAt",
schema: MigrationConstants.SchemaName,
table: "PageFetches",
column: "CreatedAt");
migrationBuilder.CreateIndex(
name: "IX_PageFetches_Url",
schema: MigrationConstants.SchemaName,
table: "PageFetches",
column: "Url");
}
/// <inheritdoc />
protected override void Down(MigrationBuilder migrationBuilder)
{
migrationBuilder.DropTable(
name: "PageFetches",
schema: MigrationConstants.SchemaName);
}
}
}
@@ -0,0 +1,79 @@
// <auto-generated />
using System;
using Microsoft.EntityFrameworkCore;
using Microsoft.EntityFrameworkCore.Infrastructure;
using Microsoft.EntityFrameworkCore.Metadata;
using Microsoft.EntityFrameworkCore.Storage.ValueConversion;
using PageFetcher.Data;
#nullable disable
namespace PageFetcher.Data.Migrations
{
[DbContext(typeof(PageFetchDbContext))]
partial class PageFetchDbContextModelSnapshot : ModelSnapshot
{
protected override void BuildModel(ModelBuilder modelBuilder)
{
#pragma warning disable 612, 618
modelBuilder
.HasDefaultSchema("pageFetcher")
.HasAnnotation("ProductVersion", "10.0.7")
.HasAnnotation("Relational:MaxIdentifierLength", 128);
SqlServerModelBuilderExtensions.UseIdentityColumns(modelBuilder);
modelBuilder.Entity("PageFetcher.Data.Entities.PageFetchEntity", b =>
{
b.Property<string>("Id")
.HasMaxLength(64)
.HasColumnType("nvarchar(64)");
b.Property<string>("CallerService")
.IsRequired()
.HasMaxLength(64)
.HasColumnType("nvarchar(64)");
b.Property<DateTime>("CreatedAt")
.ValueGeneratedOnAdd()
.HasColumnType("datetime2")
.HasDefaultValueSql("SYSUTCDATETIME()");
b.Property<long>("DurationMs")
.HasColumnType("bigint");
b.Property<string>("ErrorMessage")
.HasMaxLength(2000)
.HasColumnType("nvarchar(2000)");
b.Property<string>("Html")
.IsRequired()
.HasColumnType("nvarchar(max)");
b.Property<int?>("HttpStatusCode")
.HasColumnType("int");
b.Property<bool>("Success")
.HasColumnType("bit");
b.Property<string>("Text")
.IsRequired()
.HasColumnType("nvarchar(max)");
b.Property<string>("Url")
.IsRequired()
.HasMaxLength(2000)
.HasColumnType("nvarchar(2000)");
b.HasKey("Id");
b.HasIndex("CreatedAt");
b.HasIndex("Url");
b.ToTable("PageFetches", "pageFetcher");
});
#pragma warning restore 612, 618
}
}
}
@@ -0,0 +1,45 @@
using Microsoft.EntityFrameworkCore;
using PageFetcher.Data.Entities;
namespace PageFetcher.Data;
/// <summary>
/// EF Core DbContext for the <c>pageFetcher</c> schema.
/// Owns the <c>PageFetches</c> audit table.
/// </summary>
public sealed class PageFetchDbContext : DbContext
{
public const string SchemaName = MigrationConstants.SchemaName;
public const string MigrationTableName = MigrationConstants.MigrationTableName;
public PageFetchDbContext(DbContextOptions<PageFetchDbContext> options) : base(options) { }
public DbSet<PageFetchEntity> PageFetches => Set<PageFetchEntity>();
protected override void OnConfiguring(DbContextOptionsBuilder optionsBuilder)
{
base.OnConfiguring(optionsBuilder);
optionsBuilder.UseSqlServer(x => x.MigrationsHistoryTable(MigrationTableName, SchemaName));
}
protected override void OnModelCreating(ModelBuilder modelBuilder)
{
modelBuilder.HasDefaultSchema(SchemaName);
modelBuilder.Entity<PageFetchEntity>(entity =>
{
entity.ToTable("PageFetches");
entity.HasKey(x => x.Id);
entity.Property(x => x.Id).HasMaxLength(64);
entity.Property(x => x.Url).HasMaxLength(2000).IsRequired();
entity.Property(x => x.CallerService).HasMaxLength(64).IsRequired();
entity.Property(x => x.Html).IsRequired();
entity.Property(x => x.Text).IsRequired();
entity.Property(x => x.ErrorMessage).HasMaxLength(2000);
entity.Property(x => x.CreatedAt).HasDefaultValueSql("SYSUTCDATETIME()");
entity.HasIndex(x => x.Url);
entity.HasIndex(x => x.CreatedAt);
});
}
}
@@ -0,0 +1,19 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<AssemblyName>page-fetcher-data</AssemblyName>
<RootNamespace>PageFetcher.Data</RootNamespace>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.EntityFrameworkCore.SqlServer" />
<PackageReference Include="Microsoft.EntityFrameworkCore.Design">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\shared-data\shared-data.csproj" />
</ItemGroup>
</Project>