using CvMatcher.Models.Settings;
using Api.Services.Contracts;
using Microsoft.Extensions.Options;
using PageFetcher.Models;

namespace Api.Services;

/// <summary>
/// Extracts normalised plain text from a job posting, either from a pasted description or by
/// fetching the job page text via page-fetcher-api (headless Chromium rendering).
/// </summary>
public sealed class JobTextExtractor : IJobTextExtractor
{
    /// <summary>Lower bound applied to the configured character limit.</summary>
    private const int MinJobTextChars = 4000;

    private readonly IPageFetcherApiClient _pageFetcher;
    private readonly MatcherSettings _settings;

    /// <summary>
    /// Creates the extractor.
    /// </summary>
    /// <param name="pageFetcher">Client used to fetch the job page when only a URL is supplied.</param>
    /// <param name="options">Matcher settings; <c>MaxJobTextChars</c> is read when truncating.</param>
    /// <exception cref="ArgumentNullException">Either argument is <c>null</c>.</exception>
    public JobTextExtractor(IPageFetcherApiClient pageFetcher, IOptions<MatcherSettings> options)
    {
        ArgumentNullException.ThrowIfNull(pageFetcher);
        ArgumentNullException.ThrowIfNull(options);

        _pageFetcher = pageFetcher;
        _settings = options.Value;
    }

    /// <summary>
    /// Returns the whitespace-normalised, length-limited text of a job posting.
    /// </summary>
    /// <param name="jobUrl">Absolute http/https URL of the job page; used only when no description is pasted.</param>
    /// <param name="jobDescription">Pasted job description; takes precedence over <paramref name="jobUrl"/> when non-blank.</param>
    /// <param name="ct">Token forwarded to the page fetch.</param>
    /// <returns>
    /// The normalised text, or an empty string when both <paramref name="jobDescription"/> and
    /// <paramref name="jobUrl"/> are null or blank.
    /// </returns>
    /// <exception cref="InvalidOperationException">
    /// The URL is not an absolute http/https URL, or the page fetch reported failure.
    /// </exception>
    public async Task<string> ExtractAsync(string? jobUrl, string? jobDescription, CancellationToken ct)
    {
        // A pasted description wins: no network call is made when it has content.
        var pasted = Normalize(jobDescription);
        if (pasted.Length > 0)
        {
            return Limit(pasted);
        }

        if (string.IsNullOrWhiteSpace(jobUrl))
        {
            return string.Empty;
        }

        // Uri.Scheme is already lower-cased, so an ordinal pattern match is sufficient.
        if (!Uri.TryCreate(jobUrl, UriKind.Absolute, out var uri) || uri.Scheme is not ("http" or "https"))
        {
            throw new InvalidOperationException("Invalid job URL.");
        }

        var response = await _pageFetcher.FetchAsync(
            new FetchPageRequest { Url = jobUrl, CallerService = "cv-matcher-api" },
            ct);

        if (!response.Success)
        {
            throw new InvalidOperationException($"Failed to fetch job page: {response.Error}");
        }

        return Limit(Normalize(response.Text));
    }

    /// <summary>
    /// Truncates text to the configured maximum character count. The effective limit is never
    /// lower than <see cref="MinJobTextChars"/>, whatever the configured value is.
    /// </summary>
    private string Limit(string value)
    {
        var max = Math.Max(MinJobTextChars, _settings.MaxJobTextChars);
        if (value.Length <= max)
        {
            return value;
        }

        // Do not cut a surrogate pair in half: if the last kept UTF-16 code unit is a high
        // surrogate, drop it too so the result does not end in an unpaired surrogate.
        var end = char.IsHighSurrogate(value[max - 1]) ? max - 1 : max;
        return value[..end];
    }

    /// <summary>
    /// Collapses all whitespace runs to single spaces and trims the result.
    /// Returns an empty string for null or blank input.
    /// </summary>
    private static string Normalize(string? value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return string.Empty;
        }

        // Splitting on a null separator array splits on every whitespace character; with empty
        // entries removed, the joined result has no leading, trailing or repeated whitespace.
        return string.Join(' ', value.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries));
    }
}