Changes
Build and Push Docker Images / build (push) Successful in 42s

This commit is contained in:
2026-05-04 15:56:15 +03:00
parent 540720e771
commit 2dce2ab0ff
14 changed files with 656 additions and 5 deletions
+57
View File
@@ -0,0 +1,57 @@
using System.Net;
using System.Text.RegularExpressions;
using Api.Settings;
using Microsoft.Extensions.Options;
namespace Api.Services.Rag;
/// <summary>
/// Abstraction over obtaining the plain text of a job posting, given either
/// pasted description text or a URL pointing at the posting.
/// </summary>
public interface IJobTextExtractor
{
    /// <summary>
    /// Resolves the job text from the supplied inputs.
    /// </summary>
    /// <param name="jobUrl">Optional URL of the job posting.</param>
    /// <param name="jobDescription">Optional job description text.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>A task producing the extracted job text.</returns>
    Task<string> ExtractAsync(string? jobUrl, string? jobDescription, CancellationToken ct);
}
/// <summary>
/// <see cref="IJobTextExtractor"/> implementation that prefers pasted job
/// description text and otherwise downloads the job page over HTTP(S) and
/// reduces its HTML to whitespace-normalized plain text.
/// </summary>
public sealed class JobTextExtractor : IJobTextExtractor
{
    // CultureInvariant keeps the case-insensitive match of "script"/"style"
    // independent of the current culture (e.g. the Turkish dotted/dotless i).
    private const RegexOptions BlockOptions =
        RegexOptions.IgnoreCase | RegexOptions.CultureInvariant | RegexOptions.Compiled;

    private static readonly Regex ScriptBlockRegex = new("<script[\\s\\S]*?</script>", BlockOptions);
    private static readonly Regex StyleBlockRegex = new("<style[\\s\\S]*?</style>", BlockOptions);
    private static readonly Regex CommentRegex = new("<!--[\\s\\S]*?-->", RegexOptions.Compiled);
    private static readonly Regex TagRegex = new("<[^>]+>", RegexOptions.Compiled);

    private readonly HttpClient _httpClient;
    private readonly RagSettings _settings;

    /// <summary>
    /// Creates the extractor and configures the supplied client with a
    /// 20 second timeout and this service's User-Agent header.
    /// </summary>
    /// <param name="httpClient">Client used to download job pages.</param>
    /// <param name="options">RAG settings; <c>MaxJobTextChars</c> is read from them.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="httpClient"/> or <paramref name="options"/> is null.
    /// </exception>
    public JobTextExtractor(HttpClient httpClient, IOptions<RagSettings> options)
    {
        ArgumentNullException.ThrowIfNull(httpClient);
        ArgumentNullException.ThrowIfNull(options);

        _httpClient = httpClient;
        _settings = options.Value;

        // NOTE(review): this mutates the injected client; presumably each
        // extractor receives its own HttpClient instance - confirm against
        // the DI registration.
        _httpClient.Timeout = TimeSpan.FromSeconds(20);
        _httpClient.DefaultRequestHeaders.UserAgent.ParseAdd("MyAi.ro CV Matcher/1.0");
    }

    /// <summary>
    /// Returns the job text, taken from <paramref name="jobDescription"/> when
    /// it contains non-whitespace text, otherwise from the page at
    /// <paramref name="jobUrl"/>.
    /// </summary>
    /// <param name="jobUrl">Absolute http/https URL of the job posting; used only when no description is given.</param>
    /// <param name="jobDescription">Pasted job description; takes precedence over the URL.</param>
    /// <param name="ct">Token passed to the HTTP request.</param>
    /// <returns>
    /// Whitespace-normalized, length-limited text, or an empty string when
    /// both inputs are null or blank.
    /// </returns>
    /// <exception cref="InvalidOperationException">
    /// <paramref name="jobUrl"/> is not an absolute http or https URL.
    /// </exception>
    public async Task<string> ExtractAsync(string? jobUrl, string? jobDescription, CancellationToken ct)
    {
        var pasted = Normalize(jobDescription ?? string.Empty);
        if (!string.IsNullOrWhiteSpace(pasted))
        {
            return Limit(pasted);
        }

        if (string.IsNullOrWhiteSpace(jobUrl))
        {
            return string.Empty;
        }

        if (!Uri.TryCreate(jobUrl, UriKind.Absolute, out var uri)
            || (uri.Scheme != Uri.UriSchemeHttp && uri.Scheme != Uri.UriSchemeHttps))
        {
            throw new InvalidOperationException("Invalid job URL.");
        }

        // NOTE(review): if jobUrl is user-supplied, this request can reach
        // internal hosts (SSRF) and the response size is not bounded here -
        // confirm that callers or the HttpClient handler restrict both.
        var html = await _httpClient.GetStringAsync(uri, ct);

        return Limit(Normalize(StripHtml(html)));
    }

    /// <summary>
    /// Removes script and style blocks, comments and remaining tags from
    /// <paramref name="html"/>, then decodes HTML entities.
    /// </summary>
    private static string StripHtml(string html)
    {
        // Script and style blocks go first so that their contents are dropped
        // together with the surrounding tags.
        var text = ScriptBlockRegex.Replace(html, " ");
        text = StyleBlockRegex.Replace(text, " ");

        // Comments are removed as a whole before generic tag stripping;
        // otherwise a comment containing '>' would be cut at the first '>'
        // and the rest of the comment would leak into the output.
        text = CommentRegex.Replace(text, " ");
        text = TagRegex.Replace(text, " ");

        // Decode entities last so decoded '<' / '>' are kept as text.
        return WebUtility.HtmlDecode(text);
    }

    /// <summary>
    /// Truncates <paramref name="value"/> to the configured maximum length.
    /// The limit is <c>MaxJobTextChars</c> with a floor of 4000 characters.
    /// </summary>
    private string Limit(string value)
    {
        var max = Math.Max(4000, _settings.MaxJobTextChars);
        if (value.Length <= max)
        {
            return value;
        }

        // Do not cut a surrogate pair in half: a trailing lone high surrogate
        // is invalid UTF-16, so drop it as well.
        var cut = char.IsHighSurrogate(value[max - 1]) ? max - 1 : max;
        return value[..cut];
    }

    /// <summary>
    /// Collapses every run of whitespace in <paramref name="value"/> to a
    /// single space and trims the ends; blank input yields an empty string.
    /// </summary>
    private static string Normalize(string value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return string.Empty;
        }

        // A null separator array makes Split treat all whitespace characters
        // as delimiters.
        var parts = value.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries);
        return string.Join(' ', parts).Trim();
    }
}