using System.Net;
using System.Text.RegularExpressions;
using Api.Settings;
using Microsoft.Extensions.Options;

namespace Api.Services.Rag;

/// <summary>
/// Produces the plain text of a job posting from either pasted text or a URL.
/// </summary>
public interface IJobTextExtractor
{
    /// <summary>
    /// Returns the job text to match against.
    /// </summary>
    /// <param name="jobUrl">Optional absolute http/https URL of the job posting.</param>
    /// <param name="jobDescription">Optional pasted job description; takes precedence over <paramref name="jobUrl"/>.</param>
    /// <param name="ct">Token used to cancel the HTTP download.</param>
    /// <returns>Whitespace-normalized, length-limited text, or an empty string when neither input is supplied.</returns>
    Task<string> ExtractAsync(string? jobUrl, string? jobDescription, CancellationToken ct);
}

/// <summary>
/// Default <see cref="IJobTextExtractor"/>: prefers the pasted description and
/// otherwise downloads the URL and reduces the HTML to plain text.
/// </summary>
public sealed class JobTextExtractor : IJobTextExtractor
{
    // Lower bound applied to the configured character limit.
    private const int MinJobTextChars = 4000;

    // Whole script/style elements are removed (content included) so that code and
    // CSS do not leak into the extracted text. Built once and reused across calls.
    private static readonly Regex ScriptBlockRegex = new(
        @"<script\b[^>]*>[\s\S]*?</script\s*>",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private static readonly Regex StyleBlockRegex = new(
        @"<style\b[^>]*>[\s\S]*?</style\s*>",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Any remaining tag is replaced by a space; its inner text is kept.
    private static readonly Regex TagRegex = new("<[^>]+>", RegexOptions.Compiled);

    private readonly HttpClient _httpClient;
    private readonly RagSettings _settings;

    /// <summary>
    /// Stores the dependencies and configures the supplied client with a
    /// 20 second timeout and this service's User-Agent header.
    /// </summary>
    /// <param name="httpClient">Client used to download job pages; its timeout and default headers are modified here.</param>
    /// <param name="options">Options wrapper providing <see cref="RagSettings"/>.</param>
    public JobTextExtractor(HttpClient httpClient, IOptions<RagSettings> options)
    {
        _httpClient = httpClient;
        _settings = options.Value;

        _httpClient.Timeout = TimeSpan.FromSeconds(20);
        _httpClient.DefaultRequestHeaders.UserAgent.ParseAdd("MyAi.ro CV Matcher/1.0");
    }

    /// <summary>
    /// Returns the job text, preferring the pasted description over the URL.
    /// </summary>
    /// <param name="jobUrl">Optional absolute http/https URL; only used when no description text is supplied.</param>
    /// <param name="jobDescription">Optional pasted job description.</param>
    /// <param name="ct">Token passed to the HTTP download.</param>
    /// <returns>
    /// The normalized and length-limited description if it is non-blank; otherwise the
    /// normalized, length-limited text of the downloaded page; or an empty string
    /// when both inputs are blank.
    /// </returns>
    /// <exception cref="InvalidOperationException">
    /// <paramref name="jobUrl"/> is not an absolute URL with the http or https scheme.
    /// </exception>
    /// <remarks>Failures of the underlying HTTP request are not caught here and propagate to the caller.</remarks>
    public async Task<string> ExtractAsync(string? jobUrl, string? jobDescription, CancellationToken ct)
    {
        var pasted = Normalize(jobDescription ?? string.Empty);
        if (!string.IsNullOrWhiteSpace(pasted))
        {
            return Limit(pasted);
        }

        if (string.IsNullOrWhiteSpace(jobUrl))
        {
            return string.Empty;
        }

        if (!Uri.TryCreate(jobUrl, UriKind.Absolute, out var uri)
            || (uri.Scheme != Uri.UriSchemeHttp && uri.Scheme != Uri.UriSchemeHttps))
        {
            throw new InvalidOperationException("Invalid job URL.");
        }

        // NOTE(review): the URL is fetched as given. If it comes from end users,
        // confirm that requests to internal/private addresses are blocked elsewhere.
        var html = await _httpClient.GetStringAsync(uri, ct);

        return Limit(Normalize(StripHtml(html)));
    }

    /// <summary>
    /// Removes script and style elements, replaces remaining tags with spaces and
    /// decodes HTML entities.
    /// </summary>
    private static string StripHtml(string html)
    {
        var withoutScripts = ScriptBlockRegex.Replace(html, " ");
        var withoutStyles = StyleBlockRegex.Replace(withoutScripts, " ");
        var withoutTags = TagRegex.Replace(withoutStyles, " ");

        return WebUtility.HtmlDecode(withoutTags);
    }

    /// <summary>
    /// Truncates <paramref name="value"/> to the configured maximum length, which is
    /// never treated as lower than <see cref="MinJobTextChars"/>.
    /// </summary>
    private string Limit(string value)
    {
        var max = Math.Max(MinJobTextChars, _settings.MaxJobTextChars);
        if (value.Length <= max)
        {
            return value;
        }

        // Do not cut between the two halves of a surrogate pair; that would leave
        // a lone high surrogate (invalid UTF-16) at the end of the result.
        if (char.IsHighSurrogate(value[max - 1]))
        {
            max--;
        }

        return value[..max];
    }

    /// <summary>
    /// Collapses every run of whitespace into a single space and trims the ends.
    /// Returns an empty string for blank input.
    /// </summary>
    private static string Normalize(string value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return string.Empty;
        }

        // A null separator array makes Split break on all Unicode whitespace.
        var parts = value.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries);
        return string.Join(' ', parts);
    }
}