diff --git a/.gitea/workflows/build.yml b/.gitea/workflows/build.yml
index 77e6f41..850a436 100644
--- a/.gitea/workflows/build.yml
+++ b/.gitea/workflows/build.yml
@@ -15,6 +15,7 @@ env:
WEB_IMAGE: apps/myai-web
CV_CLEANUP_JOB_IMAGE: apps/myai-cv-cleanup-job
CV_SEARCH_JOB_IMAGE: apps/myai-cv-search-job
+ PAGE_FETCHER_API_IMAGE: apps/myai-page-fetcher-api
IMAGE_TAG: staging
jobs:
@@ -62,6 +63,10 @@ jobs:
run: |
docker build -f Jobs/cv-search-job/Dockerfile -t "${REGISTRY_HOST}/${CV_SEARCH_JOB_IMAGE}:${IMAGE_TAG}" .
+ - name: Build Page Fetcher API image
+ run: |
+ docker build -f Apis/page-fetcher-api/Dockerfile -t "${REGISTRY_HOST}/${PAGE_FETCHER_API_IMAGE}:${IMAGE_TAG}" .
+
- name: Push API image
run: |
docker push "${REGISTRY_HOST}/${API_IMAGE}:${IMAGE_TAG}"
@@ -88,4 +93,8 @@ jobs:
- name: Push CV search job image
run: |
- docker push "${REGISTRY_HOST}/${CV_SEARCH_JOB_IMAGE}:${IMAGE_TAG}"
\ No newline at end of file
+ docker push "${REGISTRY_HOST}/${CV_SEARCH_JOB_IMAGE}:${IMAGE_TAG}"
+
+ - name: Push Page Fetcher API image
+ run: |
+ docker push "${REGISTRY_HOST}/${PAGE_FETCHER_API_IMAGE}:${IMAGE_TAG}"
\ No newline at end of file
diff --git a/Apis/common/Settings/PageFetcherApiSettings.cs b/Apis/common/Settings/PageFetcherApiSettings.cs
new file mode 100644
index 0000000..c6367ac
--- /dev/null
+++ b/Apis/common/Settings/PageFetcherApiSettings.cs
@@ -0,0 +1,11 @@
+namespace Common.Settings;
+
+/// <summary>
+/// Connection settings for the internal page-fetcher-api service.
+/// Bound from the PageFetcherApi configuration section.
+/// </summary>
+public sealed class PageFetcherApiSettings
+{
+ public string BaseUrl { get; set; } = string.Empty; // Root URL of page-fetcher-api; cv-matcher-api trims any trailing '/' and appends one before using it as the HTTP client's BaseAddress.
+ public string InternalApiKey { get; set; } = string.Empty; // Sent as the X-Internal-Api-Key request header by cv-matcher-api; when null/blank no header is added.
+}
diff --git a/Apis/cv-matcher-api/Program.cs b/Apis/cv-matcher-api/Program.cs
index f247251..0fd642c 100644
--- a/Apis/cv-matcher-api/Program.cs
+++ b/Apis/cv-matcher-api/Program.cs
@@ -13,6 +13,7 @@ using Microsoft.EntityFrameworkCore;
using Refit;
using Serilog;
using Common.Settings;
+using PageFetcher.Models;
using StartupHelpers;
using System.Reflection;
@@ -36,6 +37,16 @@ try
builder.Services.Configure(builder.Configuration.GetSection("Ai"));
builder.Services.Configure(builder.Configuration.GetSection("Matcher"));
builder.Services.Configure(builder.Configuration.GetSection("JobSearch"));
+ builder.Services.Configure(builder.Configuration.GetSection("PageFetcherApi"));
+
+ builder.Services.AddRefitClient()
+ .ConfigureHttpClient((sp, c) =>
+ {
+ var settings = sp.GetRequiredService>().Value;
+ c.BaseAddress = new Uri(settings.BaseUrl.TrimEnd('/') + "/");
+ if (!string.IsNullOrWhiteSpace(settings.InternalApiKey))
+ c.DefaultRequestHeaders.Add("X-Internal-Api-Key", settings.InternalApiKey);
+ });
builder.Services.AddRefitClient()
.ConfigureHttpClient((sp, c) =>
@@ -50,7 +61,7 @@ try
builder.Services.AddScoped();
builder.Services.AddHttpClient();
- builder.Services.AddHttpClient();
+ builder.Services.AddScoped();
builder.Services.AddDbContext(options =>
{
diff --git a/Apis/cv-matcher-api/Services/JobTextExtractor.cs b/Apis/cv-matcher-api/Services/JobTextExtractor.cs
index f8e806b..c16b201 100644
--- a/Apis/cv-matcher-api/Services/JobTextExtractor.cs
+++ b/Apis/cv-matcher-api/Services/JobTextExtractor.cs
@@ -1,26 +1,23 @@
-using System.Net;
-using System.Text.RegularExpressions;
using CvMatcher.Models.Settings;
using Api.Services.Contracts;
using Microsoft.Extensions.Options;
+using PageFetcher.Models;
namespace Api.Services;
///
/// Extracts normalised plain text from a job posting, either from a pasted description or by
-/// fetching and stripping the HTML of the job page URL.
+/// fetching the job page text via page-fetcher-api (headless Chromium rendering).
///
public sealed class JobTextExtractor : IJobTextExtractor
{
- private readonly HttpClient _http;
+ private readonly IPageFetcherApiClient _pageFetcher;
private readonly MatcherSettings _settings;
- public JobTextExtractor(HttpClient http, IOptions options)
+ public JobTextExtractor(IPageFetcherApiClient pageFetcher, IOptions options)
{
- _http = http;
+ _pageFetcher = pageFetcher;
_settings = options.Value;
- _http.Timeout = TimeSpan.FromSeconds(25);
- _http.DefaultRequestHeaders.UserAgent.ParseAdd("MyAi.ro CV Matcher/1.0");
}
///
@@ -31,15 +28,18 @@ public sealed class JobTextExtractor : IJobTextExtractor
if (string.IsNullOrWhiteSpace(jobUrl)) return string.Empty;
if (!Uri.TryCreate(jobUrl, UriKind.Absolute, out var uri) || uri.Scheme is not ("http" or "https"))
- {
throw new InvalidOperationException("Invalid job URL.");
- }
- var html = await _http.GetStringAsync(uri, ct);
- html = Regex.Replace(html, "", " ", RegexOptions.IgnoreCase);
+ text = Regex.Replace(text, "