@@ -0,0 +1,169 @@
|
||||
using Api.Models.Rag;
|
||||
using Api.Settings;
|
||||
using Microsoft.Extensions.Options;
|
||||
using System.Text.Json;
|
||||
|
||||
namespace Api.Services.Rag;
|
||||
|
||||
/// <summary>
/// Entry points of the CV matching workflow: indexing an uploaded CV and scoring it against a job.
/// </summary>
public interface ICvRagService
{
    /// <summary>Indexes an uploaded CV file.</summary>
    /// <param name="file">The uploaded CV file.</param>
    /// <param name="gdprConsent">Whether the user has given GDPR consent.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>A description of the ingested document.</returns>
    Task<CvIngestResponse> IngestCvAsync(IFormFile file, bool gdprConsent, CancellationToken ct);

    /// <summary>Compares a previously ingested CV with the job described by <paramref name="request"/>.</summary>
    /// <param name="request">The match request.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>The match assessment.</returns>
    Task<JobMatchResponse> MatchJobAsync(JobMatchRequest request, CancellationToken ct);
}
|
||||
|
||||
/// <summary>
/// Orchestrates the CV matching pipeline. Ingestion extracts text from an uploaded PDF, chunks it, embeds every
/// chunk and saves the result in the vector store. Matching embeds the job text, retrieves the closest CV chunks
/// and asks the chat model for a structured assessment.
/// </summary>
public sealed class CvRagService : ICvRagService
{
    /// <summary>Summary returned when a model-generated one is unavailable.</summary>
    private const string FallbackSummary = "CV indexed.";

    /// <summary>
    /// Shared serializer options for match responses. Reusing one instance avoids rebuilding serializer
    /// metadata on every call.
    /// </summary>
    private static readonly JsonSerializerOptions MatchResponseJsonOptions = new() { PropertyNameCaseInsensitive = true };

    private readonly IPdfTextExtractor _pdfTextExtractor;
    private readonly ITextChunker _textChunker;
    private readonly IOpenAiRagClient _openAi;
    private readonly ICvVectorStore _store;
    private readonly IJobTextExtractor _jobTextExtractor;
    private readonly RagSettings _settings;
    private readonly ILogger<CvRagService> _logger;

    /// <summary>Creates the service from its collaborators and the bound <see cref="RagSettings"/>.</summary>
    public CvRagService(
        IPdfTextExtractor pdfTextExtractor,
        ITextChunker textChunker,
        IOpenAiRagClient openAi,
        ICvVectorStore store,
        IJobTextExtractor jobTextExtractor,
        IOptions<RagSettings> options,
        ILogger<CvRagService> logger)
    {
        _pdfTextExtractor = pdfTextExtractor;
        _textChunker = textChunker;
        _openAi = openAi;
        _store = store;
        _jobTextExtractor = jobTextExtractor;
        _settings = options.Value;
        _logger = logger;
    }

    /// <summary>
    /// Validates the upload, extracts and chunks its text, embeds each chunk, saves the chunks under a new
    /// document id and returns that id together with a one-sentence summary.
    /// </summary>
    /// <param name="file">Uploaded CV; must be a non-empty <c>.pdf</c> within the configured size limit.</param>
    /// <param name="gdprConsent">Must be <see langword="true"/>; otherwise the request is rejected.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>The new document id, the stored chunk count, the extracted text length and the summary.</returns>
    /// <exception cref="InvalidOperationException">
    /// Consent is missing, the file is empty, too large or not a PDF, or too little text could be extracted.
    /// </exception>
    /// <exception cref="ArgumentNullException"><paramref name="file"/> is <see langword="null"/>.</exception>
    public async Task<CvIngestResponse> IngestCvAsync(IFormFile file, bool gdprConsent, CancellationToken ct)
    {
        if (!gdprConsent)
        {
            throw new InvalidOperationException("GDPR consent is required.");
        }

        // Checked after consent so a request lacking consent is still reported as such.
        ArgumentNullException.ThrowIfNull(file);

        if (file.Length == 0)
        {
            throw new InvalidOperationException("CV PDF is empty.");
        }

        if (file.Length > _settings.MaxPdfSizeMb * 1024L * 1024L)
        {
            throw new InvalidOperationException($"PDF is too large. Max size is {_settings.MaxPdfSizeMb} MB.");
        }

        if (!string.Equals(Path.GetExtension(file.FileName), ".pdf", StringComparison.OrdinalIgnoreCase))
        {
            throw new InvalidOperationException("Only PDF files are accepted.");
        }

        await using var stream = file.OpenReadStream();
        var text = _pdfTextExtractor.ExtractText(stream);
        if (text.Length < 80)
        {
            throw new InvalidOperationException("Could not extract enough text from this PDF.");
        }

        var documentId = $"cv_{Guid.NewGuid():N}";

        // The configured TTL is treated as a lower-bounded value: never less than ten minutes.
        var expiresAt = DateTimeOffset.UtcNow.AddMinutes(Math.Max(10, _settings.CvTtlMinutes));
        var chunks = _textChunker.Chunk(text, _settings.ChunkSize, _settings.ChunkOverlap);

        var stored = new List<StoredCvChunk>(chunks.Count);
        for (var i = 0; i < chunks.Count; i++)
        {
            ct.ThrowIfCancellationRequested();
            stored.Add(new StoredCvChunk
            {
                Id = Guid.NewGuid().ToString("N"),
                DocumentId = documentId,
                Text = chunks[i],
                Embedding = await _openAi.CreateEmbeddingAsync(chunks[i], ct),
                ChunkIndex = i,
                ExpiresAt = expiresAt
            });
        }

        _store.Save(documentId, stored);
        var summary = await SummarizeCvAsync(text, ct);
        return new CvIngestResponse(documentId, stored.Count, text.Length, summary);
    }

    /// <summary>
    /// Scores a previously ingested CV against a job. The job text comes from the request's pasted description
    /// or URL; the closest CV chunks are retrieved and sent with the job text to the chat model.
    /// </summary>
    /// <param name="request">Match request carrying consent, the CV document id and the job URL/description.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>
    /// The parsed assessment. When the model supplies no evidence, excerpts of the retrieved chunks are used instead.
    /// </returns>
    /// <exception cref="InvalidOperationException">
    /// Consent or the document id is missing, the CV is not in the store, or too little job text was obtained.
    /// </exception>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is <see langword="null"/>.</exception>
    public async Task<JobMatchResponse> MatchJobAsync(JobMatchRequest request, CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(request);

        if (!request.GdprConsent)
        {
            throw new InvalidOperationException("GDPR consent is required.");
        }

        if (string.IsNullOrWhiteSpace(request.CvDocumentId))
        {
            throw new InvalidOperationException("Missing CV document id.");
        }

        var cvChunks = _store.Get(request.CvDocumentId);
        if (cvChunks.Count == 0)
        {
            throw new InvalidOperationException("CV context was not found or has expired. Upload the CV again.");
        }

        var jobText = await _jobTextExtractor.ExtractAsync(request.JobUrl, request.JobDescription, ct);
        if (jobText.Length < 80)
        {
            throw new InvalidOperationException("Could not extract enough job text. Paste the job description manually.");
        }

        var jobEmbedding = await _openAi.CreateEmbeddingAsync(jobText, ct);
        var retrieved = _store.Search(request.CvDocumentId, jobEmbedding, _settings.TopK);
        var cvContext = string.Join("\n\n", retrieved.Select(x => $"CV chunk {x.ChunkIndex} | similarity {x.Score:0.000}:\n{x.Text}"));

        var systemPrompt = "You are a strict senior technical recruiter and AI CV matcher. Return only valid JSON. Do not invent candidate experience. Use only the supplied CV context and job text.";
        var userPrompt = $$"""
            Compare the candidate CV context with the job description.
            Return this JSON shape exactly:
            {
            "score": 0,
            "summary": "short direct assessment",
            "strengths": ["strength 1"],
            "gaps": ["gap 1"],
            "recommendations": ["action 1"],
            "evidence": ["short CV evidence quote or paraphrase"]
            }
            Score must be 0-100.

            CV CONTEXT:
            {{cvContext}}

            JOB DESCRIPTION:
            {{jobText}}
            """;

        var content = await _openAi.CreateChatCompletionAsync(systemPrompt, userPrompt, ct);
        var response = ParseMatchResponse(content);
        if (response.Evidence.Count == 0)
        {
            // Fall back to excerpts of the retrieved chunks so the caller always gets some evidence.
            response.Evidence = retrieved.Select(x => x.Text.Length > 280 ? x.Text[..280] + "..." : x.Text).ToList();
        }

        return response;
    }

    /// <summary>
    /// Asks the chat model for a one-sentence summary of the CV, using at most its first 8000 characters.
    /// This is best effort: any failure is logged and replaced by a fixed fallback summary.
    /// </summary>
    /// <param name="cvText">Full extracted CV text.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>The model's summary, or the fallback when it is missing or the call fails.</returns>
    /// <exception cref="OperationCanceledException">
    /// The caller cancelled <paramref name="ct"/> while the summary request was running.
    /// </exception>
    private async Task<string> SummarizeCvAsync(string cvText, CancellationToken ct)
    {
        try
        {
            var shortened = cvText.Length > 8000 ? cvText[..8000] : cvText;
            var content = await _openAi.CreateChatCompletionAsync(
                "Return only valid JSON.",
                $$"""
                Summarize this CV in one concise sentence. Return JSON: { "summary": "..." }

                CV:
                {{shortened}}
                """,
                ct);

            using var doc = JsonDocument.Parse(content);
            return doc.RootElement.TryGetProperty("summary", out var summary)
                ? summary.GetString() ?? FallbackSummary
                : FallbackSummary;
        }
        catch (Exception ex) when (!ct.IsCancellationRequested)
        {
            // The filter lets a caller-requested cancellation propagate instead of being reported as a
            // successful ingest; every other failure (including an HTTP timeout) stays best effort.
            _logger.LogWarning(ex, "CV summary failed");
            return FallbackSummary;
        }
    }

    /// <summary>
    /// Deserializes the model's JSON answer, clamps the score to 0-100 and replaces missing lists with empty ones.
    /// Never throws: an unparsable answer is logged and turned into a zero-score response describing the failure.
    /// </summary>
    /// <param name="content">Raw chat-completion content expected to be a JSON object.</param>
    /// <returns>The normalized response, or the failure placeholder.</returns>
    private JobMatchResponse ParseMatchResponse(string content)
    {
        try
        {
            var response = JsonSerializer.Deserialize<JobMatchResponse>(content, MatchResponseJsonOptions) ?? new JobMatchResponse();
            response.Score = Math.Clamp(response.Score, 0, 100);
            response.Strengths ??= [];
            response.Gaps ??= [];
            response.Recommendations ??= [];
            response.Evidence ??= [];
            return response;
        }
        catch (Exception ex)
        {
            // The placeholder tells the user to check the logs, so the failure has to be logged.
            // Only the length is recorded because the content may contain personal data from the CV.
            _logger.LogWarning(ex, "Job match response could not be parsed ({ContentLength} chars)", content?.Length ?? 0);
            return new JobMatchResponse
            {
                Score = 0,
                Summary = "The AI response could not be parsed. Check logs and prompt output.",
                Gaps = ["Invalid JSON returned by the model."],
                Evidence = []
            };
        }
    }
}
|
||||
@@ -0,0 +1,79 @@
|
||||
using Api.Models.Rag;
|
||||
|
||||
namespace Api.Services.Rag;
|
||||
|
||||
/// <summary>Storage and similarity lookup for embedded CV chunks, grouped by document id.</summary>
public interface ICvVectorStore
{
    /// <summary>Stores <paramref name="chunks"/> under <paramref name="documentId"/>.</summary>
    void Save(string documentId, IEnumerable<StoredCvChunk> chunks);

    /// <summary>Returns the chunks stored for <paramref name="documentId"/>.</summary>
    IReadOnlyList<StoredCvChunk> Get(string documentId);

    /// <summary>
    /// Returns chunks of <paramref name="documentId"/> scored against <paramref name="queryEmbedding"/>,
    /// limited by <paramref name="topK"/>.
    /// </summary>
    IReadOnlyList<RetrievedCvChunk> Search(string documentId, float[] queryEmbedding, int topK);
}
|
||||
|
||||
/// <summary>
/// Dictionary-backed <see cref="ICvVectorStore"/> keyed case-insensitively by document id. Dictionary access
/// happens under one lock, and documents whose chunks have all expired are evicted on every save and read.
/// </summary>
public sealed class InMemoryCvVectorStore : ICvVectorStore
{
    private readonly object _gate = new();
    private readonly Dictionary<string, List<StoredCvChunk>> _documents = new(StringComparer.OrdinalIgnoreCase);

    /// <summary>Evicts expired documents, then stores a copy of <paramref name="chunks"/>, replacing any previous entry.</summary>
    public void Save(string documentId, IEnumerable<StoredCvChunk> chunks)
    {
        lock (_gate)
        {
            EvictExpired();
            _documents[documentId] = chunks.ToList();
        }
    }

    /// <summary>Evicts expired documents, then returns a copy of the document's chunks (empty when unknown).</summary>
    public IReadOnlyList<StoredCvChunk> Get(string documentId)
    {
        lock (_gate)
        {
            EvictExpired();

            if (!_documents.TryGetValue(documentId, out var found))
            {
                return [];
            }

            return found.ToList();
        }
    }

    /// <summary>
    /// Scores every chunk of the document by cosine similarity to <paramref name="queryEmbedding"/> and returns
    /// the best ones, highest score first. <paramref name="topK"/> is clamped to the range 1-12.
    /// </summary>
    public IReadOnlyList<RetrievedCvChunk> Search(string documentId, float[] queryEmbedding, int topK)
    {
        var candidates = Get(documentId);
        if (candidates.Count == 0)
        {
            return [];
        }

        var limit = Math.Clamp(topK, 1, 12);
        var scored = candidates.Select(candidate => new RetrievedCvChunk
        {
            Text = candidate.Text,
            ChunkIndex = candidate.ChunkIndex,
            Score = Cosine(queryEmbedding, candidate.Embedding)
        });

        return scored
            .OrderByDescending(hit => hit.Score)
            .Take(limit)
            .ToList();
    }

    /// <summary>Removes every document whose chunks have all expired. The caller must hold the lock.</summary>
    private void EvictExpired()
    {
        var now = DateTimeOffset.UtcNow;

        // Collect first, remove afterwards: the dictionary must not change while it is being enumerated.
        var expiredKeys = new List<string>();
        foreach (var (key, chunks) in _documents)
        {
            if (chunks.All(c => c.ExpiresAt <= now))
            {
                expiredKeys.Add(key);
            }
        }

        foreach (var key in expiredKeys)
        {
            _documents.Remove(key);
        }
    }

    /// <summary>
    /// Cosine similarity of two vectors. Returns 0 when the lengths differ, the vectors are empty,
    /// or either one has zero magnitude.
    /// </summary>
    private static double Cosine(float[] a, float[] b)
    {
        if (a.Length == 0 || a.Length != b.Length)
        {
            return 0;
        }

        double dot = 0, sumSquaresA = 0, sumSquaresB = 0;
        for (var i = 0; i < a.Length; i++)
        {
            float x = a[i], y = b[i];
            dot += x * y;
            sumSquaresA += x * x;
            sumSquaresB += y * y;
        }

        if (sumSquaresA == 0 || sumSquaresB == 0)
        {
            return 0;
        }

        return dot / (Math.Sqrt(sumSquaresA) * Math.Sqrt(sumSquaresB));
    }
}
|
||||
@@ -0,0 +1,57 @@
|
||||
using System.Net;
|
||||
using System.Text.RegularExpressions;
|
||||
using Api.Settings;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace Api.Services.Rag;
|
||||
|
||||
/// <summary>Produces plain job text from a job URL and/or a pasted job description.</summary>
public interface IJobTextExtractor
{
    /// <summary>Returns the job text obtained from <paramref name="jobDescription"/> or <paramref name="jobUrl"/>.</summary>
    /// <param name="jobUrl">Optional address of the job posting.</param>
    /// <param name="jobDescription">Optional pasted job description.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    Task<string> ExtractAsync(string? jobUrl, string? jobDescription, CancellationToken ct);
}
|
||||
|
||||
/// <summary>
/// <see cref="IJobTextExtractor"/> that prefers a pasted description and otherwise downloads the job page
/// and reduces its HTML to whitespace-normalized plain text.
/// </summary>
public sealed class JobTextExtractor : IJobTextExtractor
{
    private readonly HttpClient _httpClient;
    private readonly RagSettings _settings;

    /// <summary>Stores the client and settings, then applies a 20-second timeout and a fixed User-Agent to the client.</summary>
    public JobTextExtractor(HttpClient httpClient, IOptions<RagSettings> options)
    {
        _httpClient = httpClient;
        _settings = options.Value;

        _httpClient.Timeout = TimeSpan.FromSeconds(20);
        _httpClient.DefaultRequestHeaders.UserAgent.ParseAdd("MyAi.ro CV Matcher/1.0");
    }

    /// <summary>
    /// Returns the pasted description when it has content. Otherwise fetches <paramref name="jobUrl"/> and strips
    /// script blocks, style blocks and tags from the page. The result is whitespace-normalized and length-limited.
    /// </summary>
    /// <returns>The job text, or an empty string when neither a description nor a URL was supplied.</returns>
    /// <exception cref="InvalidOperationException">The URL is not an absolute http/https URL.</exception>
    public async Task<string> ExtractAsync(string? jobUrl, string? jobDescription, CancellationToken ct)
    {
        // A normalized string is either empty or starts with a non-whitespace character.
        var pasted = CollapseWhitespace(jobDescription ?? string.Empty);
        if (pasted.Length > 0)
        {
            return Limit(pasted);
        }

        if (string.IsNullOrWhiteSpace(jobUrl))
        {
            return string.Empty;
        }

        var isWebUrl = Uri.TryCreate(jobUrl, UriKind.Absolute, out var target)
            && (target.Scheme == "http" || target.Scheme == "https");
        if (!isWebUrl)
        {
            throw new InvalidOperationException("Invalid job URL.");
        }

        // NOTE(review): only the scheme is validated here; the host is fetched as supplied by the caller.
        // Confirm that requests to internal addresses are blocked elsewhere, or restrict destinations here.
        var page = await _httpClient.GetStringAsync(target, ct);

        var withoutScripts = Regex.Replace(page, "<script[\\s\\S]*?</script>", " ", RegexOptions.IgnoreCase);
        var withoutStyles = Regex.Replace(withoutScripts, "<style[\\s\\S]*?</style>", " ", RegexOptions.IgnoreCase);
        var withoutTags = Regex.Replace(withoutStyles, "<[^>]+>", " ");

        var decoded = WebUtility.HtmlDecode(withoutTags);
        return Limit(CollapseWhitespace(decoded));
    }

    /// <summary>Truncates <paramref name="value"/> to the configured maximum; the maximum is never below 4000 characters.</summary>
    private string Limit(string value)
    {
        var max = Math.Max(4000, _settings.MaxJobTextChars);
        if (value.Length > max)
        {
            return value[..max];
        }

        return value;
    }

    /// <summary>Collapses every run of whitespace into one space and removes leading/trailing whitespace.</summary>
    private static string CollapseWhitespace(string value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return string.Empty;
        }

        // A null separator makes Split break on whitespace characters.
        var words = value.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries);
        var joined = string.Join(' ', words);
        return joined.Trim();
    }
}
|
||||
@@ -0,0 +1,104 @@
|
||||
using System.Net.Http.Headers;
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
using System.Text.Json.Serialization;
|
||||
using Api.Settings;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace Api.Services.Rag;
|
||||
|
||||
/// <summary>Embedding and chat-completion operations used by the CV matching workflow.</summary>
public interface IOpenAiRagClient
{
    /// <summary>Creates an embedding vector for <paramref name="input"/>.</summary>
    Task<float[]> CreateEmbeddingAsync(string input, CancellationToken ct);

    /// <summary>Runs a chat completion with the given system and user prompts and returns the reply content.</summary>
    Task<string> CreateChatCompletionAsync(string systemPrompt, string userPrompt, CancellationToken ct);
}
|
||||
|
||||
/// <summary>
/// <see cref="IOpenAiRagClient"/> that posts JSON to the OpenAI REST API through the supplied <see cref="HttpClient"/>.
/// </summary>
public sealed class OpenAiRagClient : IOpenAiRagClient
{
    private static readonly JsonSerializerOptions JsonOptions = new(JsonSerializerDefaults.Web)
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull
    };

    private readonly HttpClient _httpClient;
    private readonly OpenAiSettings _settings;

    /// <summary>
    /// Configures the client: bearer authorization when an API key is set, a timeout of at least 15 seconds,
    /// and the OpenAI v1 base address.
    /// </summary>
    public OpenAiRagClient(HttpClient httpClient, IOptions<OpenAiSettings> options)
    {
        _httpClient = httpClient;
        _settings = options.Value;

        var hasApiKey = !string.IsNullOrWhiteSpace(_settings.ApiKey);
        if (hasApiKey)
        {
            _httpClient.DefaultRequestHeaders.Authorization = new AuthenticationHeaderValue("Bearer", _settings.ApiKey);
        }

        var timeoutSeconds = Math.Max(15, _settings.TimeoutSeconds);
        _httpClient.Timeout = TimeSpan.FromSeconds(timeoutSeconds);
        _httpClient.BaseAddress = new Uri("https://api.openai.com/v1/");
    }

    /// <summary>Requests an embedding for <paramref name="input"/> with the configured embedding model.</summary>
    /// <returns>The first embedding of the response as a float array.</returns>
    /// <exception cref="InvalidOperationException">The API key is missing or the API returned a non-success status.</exception>
    public async Task<float[]> CreateEmbeddingAsync(string input, CancellationToken ct)
    {
        EnsureConfigured();

        var body = await PostAsync(
            "embeddings",
            new { model = _settings.EmbeddingModel, input },
            "embeddings",
            ct);

        using var parsed = JsonDocument.Parse(body);
        var vector = parsed.RootElement.GetProperty("data")[0].GetProperty("embedding");

        var values = new float[vector.GetArrayLength()];
        var cursor = vector.EnumerateArray();
        for (var position = 0; cursor.MoveNext(); position++)
        {
            values[position] = cursor.Current.GetSingle();
        }

        return values;
    }

    /// <summary>
    /// Runs a JSON-mode chat completion (temperature 0.2) with the configured chat model and returns the first
    /// choice's message content, or <c>"{}"</c> when that content is null.
    /// </summary>
    /// <exception cref="InvalidOperationException">The API key is missing or the API returned a non-success status.</exception>
    public async Task<string> CreateChatCompletionAsync(string systemPrompt, string userPrompt, CancellationToken ct)
    {
        EnsureConfigured();

        var request = new
        {
            model = _settings.ChatModel,
            temperature = 0.2,
            response_format = new { type = "json_object" },
            messages = new[]
            {
                new { role = "system", content = systemPrompt },
                new { role = "user", content = userPrompt }
            }
        };

        var body = await PostAsync("chat/completions", request, "chat", ct);

        using var parsed = JsonDocument.Parse(body);
        var firstChoice = parsed.RootElement.GetProperty("choices")[0];
        var reply = firstChoice.GetProperty("message").GetProperty("content").GetString();
        return reply ?? "{}";
    }

    /// <summary>
    /// Serializes <paramref name="payload"/> as UTF-8 JSON, posts it to <paramref name="path"/> and returns the
    /// response body. A non-success status becomes an exception carrying the status code and the body.
    /// </summary>
    /// <param name="operation">Operation name inserted into the failure message.</param>
    private async Task<string> PostAsync<TPayload>(string path, TPayload payload, string operation, CancellationToken ct)
    {
        var content = new StringContent(
            JsonSerializer.Serialize(payload, JsonOptions),
            Encoding.UTF8,
            "application/json");

        using var response = await _httpClient.PostAsync(path, content, ct);
        var json = await response.Content.ReadAsStringAsync(ct);

        if (response.IsSuccessStatusCode)
        {
            return json;
        }

        throw new InvalidOperationException($"OpenAI {operation} request failed: {(int)response.StatusCode} {json}");
    }

    /// <summary>Throws when no API key has been configured.</summary>
    private void EnsureConfigured()
    {
        if (!string.IsNullOrWhiteSpace(_settings.ApiKey))
        {
            return;
        }

        throw new InvalidOperationException("OpenAI API key is not configured. Set OpenAI__ApiKey.");
    }
}
|
||||
@@ -0,0 +1,33 @@
|
||||
using System.Text;
|
||||
using UglyToad.PdfPig;
|
||||
|
||||
namespace Api.Services.Rag;
|
||||
|
||||
/// <summary>Extracts text from a PDF supplied as a stream.</summary>
public interface IPdfTextExtractor
{
    /// <summary>Returns the text extracted from <paramref name="pdfStream"/>.</summary>
    string ExtractText(Stream pdfStream);
}
|
||||
|
||||
/// <summary>
/// <see cref="IPdfTextExtractor"/> built on PdfPig: concatenates the text of every page and collapses all
/// whitespace into single spaces.
/// </summary>
public sealed class PdfTextExtractor : IPdfTextExtractor
{
    /// <summary>Opens the PDF, gathers each page's text and returns it whitespace-normalized.</summary>
    /// <param name="pdfStream">Stream containing the PDF document.</param>
    /// <returns>The normalized text, or an empty string when the document holds only whitespace.</returns>
    public string ExtractText(Stream pdfStream)
    {
        using var pdf = PdfDocument.Open(pdfStream);

        var raw = new StringBuilder();
        foreach (var page in pdf.GetPages())
        {
            // The page text is followed by a blank line; normalization below collapses it to one space.
            raw.Append(page.Text).AppendLine().AppendLine();
        }

        return CollapseWhitespace(raw.ToString());
    }

    /// <summary>Collapses every run of whitespace into one space and removes leading/trailing whitespace.</summary>
    private static string CollapseWhitespace(string value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return string.Empty;
        }

        // A null separator makes Split break on whitespace characters.
        var words = value.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries);
        var joined = string.Join(' ', words);
        return joined.Trim();
    }
}
|
||||
@@ -0,0 +1,27 @@
|
||||
namespace Api.Services.Rag;
|
||||
|
||||
/// <summary>Splits text into fixed-size character windows that may overlap.</summary>
public interface ITextChunker
{
    /// <summary>Splits <paramref name="text"/> into chunks.</summary>
    /// <param name="text">Text to split.</param>
    /// <param name="chunkSize">Requested window size in characters.</param>
    /// <param name="overlap">Requested number of characters shared by consecutive windows.</param>
    /// <returns>The chunks in document order.</returns>
    IReadOnlyList<string> Chunk(string text, int chunkSize, int overlap);
}

/// <summary>Sliding-window <see cref="ITextChunker"/> working on character offsets.</summary>
public sealed class TextChunker : ITextChunker
{
    /// <summary>
    /// Cuts <paramref name="text"/> into windows of <paramref name="chunkSize"/> characters, each starting
    /// <c>chunkSize - overlap</c> characters after the previous one. Chunks are trimmed and blank ones dropped.
    /// </summary>
    /// <param name="text">Text to split; null, empty or whitespace-only input yields no chunks.</param>
    /// <param name="chunkSize">Window size; clamped to 300-3000.</param>
    /// <param name="overlap">Overlap between windows; clamped to 0 through half the window size.</param>
    /// <returns>The chunks in document order; the final window ends exactly at the end of the text.</returns>
    public IReadOnlyList<string> Chunk(string text, int chunkSize, int overlap)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return [];
        }

        chunkSize = Math.Clamp(chunkSize, 300, 3000);
        overlap = Math.Clamp(overlap, 0, chunkSize / 2);

        // The overlap is at most half the window, so the step is always positive and the loop terminates.
        var step = chunkSize - overlap;

        var chunks = new List<string>();
        for (var start = 0; start < text.Length; start += step)
        {
            var length = Math.Min(chunkSize, text.Length - start);
            var chunk = text.Substring(start, length).Trim();
            if (chunk.Length > 0)
            {
                chunks.Add(chunk);
            }

            // Stop once a window has reached the end of the text. Advancing further would only emit a
            // fragment that is already fully contained in this window's tail (a redundant chunk).
            if (start + length >= text.Length)
            {
                break;
            }
        }

        return chunks;
    }
}
|
||||
Reference in New Issue
Block a user