diff --git a/RAG_SPLIT_README.md b/RAG_SPLIT_README.md new file mode 100644 index 0000000..6a39dcc --- /dev/null +++ b/RAG_SPLIT_README.md @@ -0,0 +1,116 @@ +# MyAi RAG split cleanup + +## Public `api` + +The existing `api` project is now only the public gateway for the existing frontend. + +It keeps: + +- contact API +- file download API +- Google/config APIs +- health API +- `api/rag/*` proxy endpoints + +It no longer contains local RAG processing code. The removed responsibilities are: + +- PDF extraction +- chunking +- embeddings +- vector storage +- OpenAI/Ollama calls +- job text extraction +- CV matching business logic + +`api/Controllers/RagController.cs` is intentionally kept. It proxies the current frontend calls: + +- `POST /api/rag/cv` -> `cv-matcher-api /api/cv/upload` +- `POST /api/rag/match-job` -> `cv-matcher-api /api/cv/match-job` + +Required public API config: + +```json +"CvMatcherApi": { + "BaseUrl": "http://cv-matcher-api:8080", + "InternalApiKey": "change-this-internal-key" +} +``` + +## `cv-matcher-api` + +Business API for CV/job workflows. + +Main endpoints: + +- `POST /api/cv/upload` +- `POST /api/cv/match-job` +- `POST /api/cv/find-jobs` +- `GET /health` +- Swagger: `/swagger` + +Responsibilities: + +- CV matcher business logic +- job URL/text extraction +- final LLM scoring +- result persistence +- email sending +- calls `rag-api` for generic semantic indexing/search + +## `rag-api` + +Generic semantic search API. 
+ +Main endpoints: + +- `POST /api/rag/documents` +- `POST /api/rag/documents/json` +- `POST /api/rag/search` +- `GET /api/rag/documents/{id}` +- `GET /health` +- Swagger: `/swagger` + +Responsibilities: + +- generic document indexing +- automatic document type classification when type is missing +- PDF/text extraction +- chunking +- embedding creation +- embedding and chat completion cache +- semantic search over generic documents + +## Logging and Swagger + +All three APIs now have: + +- Serilog startup logging +- Serilog request logging +- structured JSON console logs +- health endpoint +- Swagger/OpenAPI support + +Swagger is enabled by default and can be disabled per service with: + +```json +"Swagger": { + "Enabled": false +} +``` + +## Internal API security + +Both internal APIs support API-key protection: + +```json +"InternalApi": { + "RequireApiKey": true, + "ApiKey": "change-this-internal-key" +} +``` + +Requests must include: + +```http +X-Internal-Api-Key: change-this-internal-key +``` diff --git a/api/Controllers/ContactController.cs b/api/Controllers/ContactController.cs index cc2e067..d00017d 100644 --- a/api/Controllers/ContactController.cs +++ b/api/Controllers/ContactController.cs @@ -1,4 +1,5 @@ -using Api.Models; +using Api.Services.Contracts.Models; +using Api.Requests; using Api.Services.Contracts; using Api.Settings; using Microsoft.AspNetCore.Cors; @@ -118,7 +119,7 @@ namespace Api.Controllers /// Client-provided reCAPTCHA token. /// Cancellation token. /// Tuple containing the verification verdict and user IP. - private async Task<(CaptchaVerdict Verdict, string? UserIp)> ValidateCaptcha(string token, CancellationToken ct) + private async Task<(CaptchaVerdictModel Verdict, string? 
UserIp)> ValidateCaptcha(string token, CancellationToken ct) { var userIp = HttpContext.Connection.RemoteIpAddress?.ToString(); var verdict = await _captcha.VerifyAsync(token, userIp, ct); diff --git a/api/Controllers/RagController.cs b/api/Controllers/RagController.cs index d9d8554..94fb7d5 100644 --- a/api/Controllers/RagController.cs +++ b/api/Controllers/RagController.cs @@ -1,8 +1,9 @@ -using api.Services.Contracts.Rag; -using Api.Models.Rag; -using Api.Services.Rag; +using Api.Requests; using Microsoft.AspNetCore.Mvc; using Microsoft.AspNetCore.RateLimiting; +using System.Net.Http.Headers; +using System.Text; +using System.Text.Json; namespace Api.Controllers; @@ -11,52 +12,135 @@ namespace Api.Controllers; [EnableRateLimiting("rag")] public sealed class RagController : ControllerBase { - private readonly ICvRagService _cvRagService; + private readonly IHttpClientFactory _httpClientFactory; + private readonly IConfiguration _configuration; private readonly ILogger _logger; - public RagController(ICvRagService cvRagService, ILogger logger) + public RagController( + IHttpClientFactory httpClientFactory, + IConfiguration configuration, + ILogger logger) { - _cvRagService = cvRagService; + _httpClientFactory = httpClientFactory; + _configuration = configuration; _logger = logger; } [HttpPost("cv")] [RequestSizeLimit(8 * 1024 * 1024)] - public async Task UploadCv([FromForm(Name = "cv")] IFormFile? cv, [FromForm] bool gdprConsent, CancellationToken ct) + [ProducesResponseType(StatusCodes.Status200OK)] + [ProducesResponseType(StatusCodes.Status400BadRequest)] + [ProducesResponseType(StatusCodes.Status502BadGateway)] + public async Task UploadCv( + [FromForm(Name = "cv")] IFormFile? cv, + [FromForm] bool gdprConsent, + CancellationToken ct) { + if (cv is null) + { + return BadRequest(new { error = "Missing CV PDF." }); + } + + var baseUrl = GetCvMatcherBaseUrl(); + if (string.IsNullOrWhiteSpace(baseUrl)) + { + _logger.LogError("CvMatcherApi:BaseUrl is not configured. 
The public API cannot proxy CV upload requests."); + return StatusCode(StatusCodes.Status502BadGateway, new { error = "CV matcher API is not configured." }); + } + try { - if (cv is null) return BadRequest(new { error = "Missing CV PDF." }); - var result = await _cvRagService.IngestCvAsync(cv, gdprConsent, ct); - return Ok(result); + _logger.LogInformation("Proxying CV upload to cv-matcher-api. FileName={FileName}, Size={SizeBytes}, GdprConsent={GdprConsent}", + cv.FileName, cv.Length, gdprConsent); + + using var client = CreateCvMatcherClient(baseUrl); + using var form = new MultipartFormDataContent(); + await using var stream = cv.OpenReadStream(); + using var fileContent = new StreamContent(stream); + fileContent.Headers.ContentType = new MediaTypeHeaderValue("application/pdf"); + form.Add(fileContent, "cv", cv.FileName); + form.Add(new StringContent(gdprConsent.ToString().ToLowerInvariant()), "gdprConsent"); + + using var response = await client.PostAsync("api/cv/upload", form, ct); + return await ProxyResponseAsync(response, ct); } - catch (InvalidOperationException ex) + catch (OperationCanceledException) when (ct.IsCancellationRequested) { - return BadRequest(new { error = ex.Message }); + _logger.LogWarning("CV upload proxy request was cancelled by the client."); + return StatusCode(499, new { error = "Request cancelled." }); } catch (Exception ex) { - _logger.LogError(ex, "CV ingestion failed"); - return StatusCode(500, new { error = "CV ingestion failed." }); + _logger.LogError(ex, "CV upload proxy request failed."); + return StatusCode(StatusCodes.Status502BadGateway, new { error = "CV matcher API request failed." 
}); } } [HttpPost("match-job")] + [ProducesResponseType(StatusCodes.Status200OK)] + [ProducesResponseType(StatusCodes.Status400BadRequest)] + [ProducesResponseType(StatusCodes.Status502BadGateway)] public async Task MatchJob([FromBody] JobMatchRequest request, CancellationToken ct) { + var baseUrl = GetCvMatcherBaseUrl(); + if (string.IsNullOrWhiteSpace(baseUrl)) + { + _logger.LogError("CvMatcherApi:BaseUrl is not configured. The public API cannot proxy job matching requests."); + return StatusCode(StatusCodes.Status502BadGateway, new { error = "CV matcher API is not configured." }); + } + try { - var result = await _cvRagService.MatchJobAsync(request, ct); - return Ok(result); + _logger.LogInformation("Proxying job match request to cv-matcher-api. CvDocumentId={CvDocumentId}, HasJobUrl={HasJobUrl}, HasJobDescription={HasJobDescription}", + request.CvDocumentId, + !string.IsNullOrWhiteSpace(request.JobUrl), + !string.IsNullOrWhiteSpace(request.JobDescription)); + + using var client = CreateCvMatcherClient(baseUrl); + var json = JsonSerializer.Serialize(request, new JsonSerializerOptions(JsonSerializerDefaults.Web)); + using var response = await client.PostAsync( + "api/cv/match-job", + new StringContent(json, Encoding.UTF8, "application/json"), + ct); + + return await ProxyResponseAsync(response, ct); } - catch (InvalidOperationException ex) + catch (OperationCanceledException) when (ct.IsCancellationRequested) { - return BadRequest(new { error = ex.Message }); + _logger.LogWarning("Job match proxy request was cancelled by the client."); + return StatusCode(499, new { error = "Request cancelled." }); } catch (Exception ex) { - _logger.LogError(ex, "Job matching failed"); - return StatusCode(500, new { error = "Job matching failed." }); + _logger.LogError(ex, "Job match proxy request failed."); + return StatusCode(StatusCodes.Status502BadGateway, new { error = "CV matcher API request failed." 
}); } } + + private string GetCvMatcherBaseUrl() => _configuration["CvMatcherApi:BaseUrl"] ?? string.Empty; + + private HttpClient CreateCvMatcherClient(string baseUrl) + { + var client = _httpClientFactory.CreateClient("CvMatcherApi"); + client.BaseAddress = new Uri(baseUrl.TrimEnd('/') + "/"); + + var key = _configuration["CvMatcherApi:InternalApiKey"]; + if (!string.IsNullOrWhiteSpace(key) && !client.DefaultRequestHeaders.Contains("X-Internal-Api-Key")) + { + client.DefaultRequestHeaders.Add("X-Internal-Api-Key", key); + } + + return client; + } + + private static async Task ProxyResponseAsync(HttpResponseMessage response, CancellationToken ct) + { + var body = await response.Content.ReadAsStringAsync(ct); + return new ContentResult + { + StatusCode = (int)response.StatusCode, + Content = body, + ContentType = response.Content.Headers.ContentType?.ToString() ?? "application/json" + }; + } } diff --git a/api/Models/Rag/RagModels.cs b/api/Models/Rag/RagModels.cs deleted file mode 100644 index 9e4191e..0000000 --- a/api/Models/Rag/RagModels.cs +++ /dev/null @@ -1,43 +0,0 @@ -namespace Api.Models.Rag; - -public sealed record CvIngestResponse( - string DocumentId, - int Chunks, - int CharactersExtracted, - string Summary -); - -public sealed class JobMatchRequest -{ - public string? CvDocumentId { get; set; } - public string? JobUrl { get; set; } - public string? 
JobDescription { get; set; } - public bool GdprConsent { get; set; } -} - -public sealed class JobMatchResponse -{ - public int Score { get; set; } - public string Summary { get; set; } = string.Empty; - public List Strengths { get; set; } = []; - public List Gaps { get; set; } = []; - public List Recommendations { get; set; } = []; - public List Evidence { get; set; } = []; -} - -public sealed class StoredCvChunk -{ - public required string Id { get; init; } - public required string DocumentId { get; init; } - public required string Text { get; init; } - public required float[] Embedding { get; init; } - public required int ChunkIndex { get; init; } - public DateTimeOffset ExpiresAt { get; init; } -} - -public sealed class RetrievedCvChunk -{ - public required string Text { get; init; } - public required int ChunkIndex { get; init; } - public double Score { get; init; } -} diff --git a/api/Program.cs b/api/Program.cs index f902e32..3ad17ec 100644 --- a/api/Program.cs +++ b/api/Program.cs @@ -1,8 +1,5 @@ -using api.Services.Contracts.Rag; using Api.Services; using Api.Services.Contracts; -using Api.Services.Contracts.Rag; -using Api.Services.Rag; using Api.Settings; using Azure.Identity; using Microsoft.AspNetCore.HttpOverrides; @@ -78,19 +75,12 @@ try builder.Services.Configure(builder.Configuration.GetSection("Smtp")); builder.Services.Configure(builder.Configuration.GetSection("Captcha")); builder.Services.Configure(builder.Configuration.GetSection("FileStorage")); - builder.Services.Configure(builder.Configuration.GetSection("Rag")); - builder.Services.Configure(builder.Configuration.GetSection("OpenAI")); // Services builder.Services.AddHttpClient(); builder.Services.AddSingleton(); builder.Services.AddSingleton(); - builder.Services.AddSingleton(); - builder.Services.AddSingleton(); - builder.Services.AddSingleton(); - builder.Services.AddScoped(); - builder.Services.AddHttpClient(); - builder.Services.AddHttpClient(); + 
builder.Services.AddHttpClient("CvMatcherApi"); // Swagger builder.Services.AddEndpointsApiExplorer(); diff --git a/api/Models/ContactRequest.cs b/api/Requests/ContactRequest.cs similarity index 96% rename from api/Models/ContactRequest.cs rename to api/Requests/ContactRequest.cs index 900650e..ab58d53 100644 --- a/api/Models/ContactRequest.cs +++ b/api/Requests/ContactRequest.cs @@ -1,6 +1,6 @@ using System.ComponentModel.DataAnnotations; -namespace Api.Models +namespace Api.Requests { public sealed class ContactRequest { diff --git a/api/Requests/JobMatchRequest.cs b/api/Requests/JobMatchRequest.cs new file mode 100644 index 0000000..f45040a --- /dev/null +++ b/api/Requests/JobMatchRequest.cs @@ -0,0 +1,9 @@ +namespace Api.Requests; + +public sealed class JobMatchRequest +{ + public string? CvDocumentId { get; set; } + public string? JobUrl { get; set; } + public string? JobDescription { get; set; } + public bool GdprConsent { get; set; } +} diff --git a/api/Models/SubscribeRequest.cs b/api/Requests/SubscribeRequest.cs similarity index 93% rename from api/Models/SubscribeRequest.cs rename to api/Requests/SubscribeRequest.cs index eb1a1f8..09070a3 100644 --- a/api/Models/SubscribeRequest.cs +++ b/api/Requests/SubscribeRequest.cs @@ -1,6 +1,6 @@ using System.ComponentModel.DataAnnotations; -namespace Api.Models +namespace Api.Requests { public sealed class SubscribeRequest { diff --git a/api/Services/Contracts/ICaptchaVerifier.cs b/api/Services/Contracts/ICaptchaVerifier.cs index 20cc4fd..55e7649 100644 --- a/api/Services/Contracts/ICaptchaVerifier.cs +++ b/api/Services/Contracts/ICaptchaVerifier.cs @@ -1,9 +1,9 @@ -namespace Api.Services.Contracts -{ - public sealed record CaptchaVerdict(bool Success, string? Error, double? Score); +using Api.Services.Contracts.Models; +namespace Api.Services.Contracts +{ public interface ICaptchaVerifier { - Task VerifyAsync(string token, string? userIp, CancellationToken ct); + Task VerifyAsync(string token, string? 
userIp, CancellationToken ct); } } diff --git a/api/Services/Contracts/IEmailSender.cs b/api/Services/Contracts/IEmailSender.cs index a23e63b..3f774fc 100644 --- a/api/Services/Contracts/IEmailSender.cs +++ b/api/Services/Contracts/IEmailSender.cs @@ -1,4 +1,4 @@ -using Api.Models; +using Api.Requests; namespace Api.Services.Contracts { diff --git a/api/Services/Contracts/Models/CaptchaVerdictModel.cs b/api/Services/Contracts/Models/CaptchaVerdictModel.cs new file mode 100644 index 0000000..5dfc458 --- /dev/null +++ b/api/Services/Contracts/Models/CaptchaVerdictModel.cs @@ -0,0 +1,4 @@ +namespace Api.Services.Contracts.Models +{ + public sealed record CaptchaVerdictModel(bool Success, string? Error, double? Score); +} diff --git a/api/Services/Contracts/Rag/ICvRagService.cs b/api/Services/Contracts/Rag/ICvRagService.cs deleted file mode 100644 index 11ae11d..0000000 --- a/api/Services/Contracts/Rag/ICvRagService.cs +++ /dev/null @@ -1,9 +0,0 @@ -using Api.Models.Rag; - -namespace api.Services.Contracts.Rag; - -public interface ICvRagService -{ - Task IngestCvAsync(IFormFile file, bool gdprConsent, CancellationToken ct); - Task MatchJobAsync(JobMatchRequest request, CancellationToken ct); -} diff --git a/api/Services/Contracts/Rag/IPdfTextExtractor.cs b/api/Services/Contracts/Rag/IPdfTextExtractor.cs deleted file mode 100644 index d1435fd..0000000 --- a/api/Services/Contracts/Rag/IPdfTextExtractor.cs +++ /dev/null @@ -1,6 +0,0 @@ -namespace Api.Services.Contracts.Rag; - -public interface IPdfTextExtractor -{ - string ExtractText(Stream pdfStream); -} diff --git a/api/Services/Rag/CvRagService.cs b/api/Services/Rag/CvRagService.cs deleted file mode 100644 index 952a20b..0000000 --- a/api/Services/Rag/CvRagService.cs +++ /dev/null @@ -1,165 +0,0 @@ -using api.Services.Contracts.Rag; -using Api.Models.Rag; -using Api.Services.Contracts.Rag; -using Api.Settings; -using Microsoft.Extensions.Options; -using System.Text.Json; - -namespace Api.Services.Rag; - -public sealed 
class CvRagService : ICvRagService -{ - private readonly IPdfTextExtractor _pdfTextExtractor; - private readonly ITextChunker _textChunker; - private readonly IAiRagClient _openAi; - private readonly ICvVectorStore _store; - private readonly IJobTextExtractor _jobTextExtractor; - private readonly RagSettings _settings; - private readonly ILogger _logger; - - public CvRagService( - IPdfTextExtractor pdfTextExtractor, - ITextChunker textChunker, - IAiRagClient openAi, - ICvVectorStore store, - IJobTextExtractor jobTextExtractor, - IOptions options, - ILogger logger) - { - _pdfTextExtractor = pdfTextExtractor; - _textChunker = textChunker; - _openAi = openAi; - _store = store; - _jobTextExtractor = jobTextExtractor; - _settings = options.Value; - _logger = logger; - } - - public async Task IngestCvAsync(IFormFile file, bool gdprConsent, CancellationToken ct) - { - if (!gdprConsent) throw new InvalidOperationException("GDPR consent is required."); - if (file.Length == 0) throw new InvalidOperationException("CV PDF is empty."); - if (file.Length > _settings.MaxPdfSizeMb * 1024L * 1024L) throw new InvalidOperationException($"PDF is too large. 
Max size is {_settings.MaxPdfSizeMb} MB."); - if (!string.Equals(Path.GetExtension(file.FileName), ".pdf", StringComparison.OrdinalIgnoreCase)) throw new InvalidOperationException("Only PDF files are accepted."); - - await using var stream = file.OpenReadStream(); - var text = _pdfTextExtractor.ExtractText(stream); - if (text.Length < 80) throw new InvalidOperationException("Could not extract enough text from this PDF."); - - var documentId = $"cv_{Guid.NewGuid():N}"; - var expiresAt = DateTimeOffset.UtcNow.AddMinutes(Math.Max(10, _settings.CvTtlMinutes)); - var chunks = _textChunker.Chunk(text, _settings.ChunkSize, _settings.ChunkOverlap); - - var stored = new List(); - for (var i = 0; i < chunks.Count; i++) - { - ct.ThrowIfCancellationRequested(); - stored.Add(new StoredCvChunk - { - Id = Guid.NewGuid().ToString("N"), - DocumentId = documentId, - Text = chunks[i], - Embedding = await _openAi.CreateEmbeddingAsync(chunks[i], ct), - ChunkIndex = i, - ExpiresAt = expiresAt - }); - } - - _store.Save(documentId, stored); - var summary = await SummarizeCvAsync(text, ct); - return new CvIngestResponse(documentId, stored.Count, text.Length, summary); - } - - public async Task MatchJobAsync(JobMatchRequest request, CancellationToken ct) - { - if (!request.GdprConsent) throw new InvalidOperationException("GDPR consent is required."); - if (string.IsNullOrWhiteSpace(request.CvDocumentId)) throw new InvalidOperationException("Missing CV document id."); - - var cvChunks = _store.Get(request.CvDocumentId); - if (cvChunks.Count == 0) throw new InvalidOperationException("CV context was not found or has expired. Upload the CV again."); - - var jobText = await _jobTextExtractor.ExtractAsync(request.JobUrl, request.JobDescription, ct); - if (jobText.Length < 80) throw new InvalidOperationException("Could not extract enough job text. 
Paste the job description manually."); - - var jobEmbedding = await _openAi.CreateEmbeddingAsync(jobText, ct); - var retrieved = _store.Search(request.CvDocumentId, jobEmbedding, _settings.TopK); - var cvContext = string.Join("\n\n", retrieved.Select(x => $"CV chunk {x.ChunkIndex} | similarity {x.Score:0.000}:\n{x.Text}")); - - var systemPrompt = "You are a strict senior technical recruiter and AI CV matcher. Return only valid JSON. Do not invent candidate experience. Use only the supplied CV context and job text."; - var userPrompt = $$""" -Compare the candidate CV context with the job description. -Return this JSON shape exactly: -{ - "score": 0, - "summary": "short direct assessment", - "strengths": ["strength 1"], - "gaps": ["gap 1"], - "recommendations": ["action 1"], - "evidence": ["short CV evidence quote or paraphrase"] -} -Score must be 0-100. - -CV CONTEXT: -{{cvContext}} - -JOB DESCRIPTION: -{{jobText}} -"""; - - var content = await _openAi.CreateChatCompletionAsync(systemPrompt, userPrompt, ct); - var response = ParseMatchResponse(content); - if (response.Evidence.Count == 0) - { - response.Evidence = retrieved.Select(x => x.Text.Length > 280 ? x.Text[..280] + "..." : x.Text).ToList(); - } - return response; - } - - private async Task SummarizeCvAsync(string cvText, CancellationToken ct) - { - try - { - var shortened = cvText.Length > 8000 ? cvText[..8000] : cvText; - var content = await _openAi.CreateChatCompletionAsync( - "Return only valid JSON.", - $$""" -Summarize this CV in one concise sentence. Return JSON: { "summary": "..." } - -CV: -{{shortened}} -""", - ct); - using var doc = JsonDocument.Parse(content); - return doc.RootElement.TryGetProperty("summary", out var summary) ? summary.GetString() ?? "CV indexed." 
: "CV indexed."; - } - catch (Exception ex) - { - _logger.LogWarning(ex, "CV summary failed"); - return "CV indexed."; - } - } - - private static JobMatchResponse ParseMatchResponse(string content) - { - try - { - var response = JsonSerializer.Deserialize(content, new JsonSerializerOptions { PropertyNameCaseInsensitive = true }) ?? new JobMatchResponse(); - response.Score = Math.Clamp(response.Score, 0, 100); - response.Strengths ??= []; - response.Gaps ??= []; - response.Recommendations ??= []; - response.Evidence ??= []; - return response; - } - catch - { - return new JobMatchResponse - { - Score = 0, - Summary = "The AI response could not be parsed. Check logs and prompt output.", - Gaps = ["Invalid JSON returned by the model."], - Evidence = [] - }; - } - } -} diff --git a/api/Services/Rag/InMemoryCvVectorStore.cs b/api/Services/Rag/InMemoryCvVectorStore.cs deleted file mode 100644 index 4cc3277..0000000 --- a/api/Services/Rag/InMemoryCvVectorStore.cs +++ /dev/null @@ -1,79 +0,0 @@ -using Api.Models.Rag; - -namespace Api.Services.Rag; - -public interface ICvVectorStore -{ - void Save(string documentId, IEnumerable chunks); - IReadOnlyList Get(string documentId); - IReadOnlyList Search(string documentId, float[] queryEmbedding, int topK); -} - -public sealed class InMemoryCvVectorStore : ICvVectorStore -{ - private readonly object _lock = new(); - private readonly Dictionary> _store = new(StringComparer.OrdinalIgnoreCase); - - public void Save(string documentId, IEnumerable chunks) - { - lock (_lock) - { - CleanupExpiredUnsafe(); - _store[documentId] = chunks.ToList(); - } - } - - public IReadOnlyList Get(string documentId) - { - lock (_lock) - { - CleanupExpiredUnsafe(); - return _store.TryGetValue(documentId, out var chunks) ? 
chunks.ToList() : []; - } - } - - public IReadOnlyList Search(string documentId, float[] queryEmbedding, int topK) - { - var chunks = Get(documentId); - if (chunks.Count == 0) return []; - - return chunks - .Select(chunk => new RetrievedCvChunk - { - Text = chunk.Text, - ChunkIndex = chunk.ChunkIndex, - Score = CosineSimilarity(queryEmbedding, chunk.Embedding) - }) - .OrderByDescending(x => x.Score) - .Take(Math.Clamp(topK, 1, 12)) - .ToList(); - } - - private void CleanupExpiredUnsafe() - { - var now = DateTimeOffset.UtcNow; - foreach (var key in _store.Where(x => x.Value.All(c => c.ExpiresAt <= now)).Select(x => x.Key).ToList()) - { - _store.Remove(key); - } - } - - private static double CosineSimilarity(float[] a, float[] b) - { - if (a.Length != b.Length || a.Length == 0) return 0; - - double dot = 0; - double magA = 0; - double magB = 0; - - for (var i = 0; i < a.Length; i++) - { - dot += a[i] * b[i]; - magA += a[i] * a[i]; - magB += b[i] * b[i]; - } - - if (magA == 0 || magB == 0) return 0; - return dot / (Math.Sqrt(magA) * Math.Sqrt(magB)); - } -} diff --git a/api/Services/Rag/OpenAiRagClient.cs b/api/Services/Rag/OpenAiRagClient.cs deleted file mode 100644 index d914a40..0000000 --- a/api/Services/Rag/OpenAiRagClient.cs +++ /dev/null @@ -1,99 +0,0 @@ -using System.Net.Http.Headers; -using System.Text; -using System.Text.Json; -using System.Text.Json.Serialization; -using Api.Services.Contracts.Rag; -using Api.Settings; -using Microsoft.Extensions.Options; - -namespace Api.Services.Rag; - -public sealed class OpenAiRagClient : IAiRagClient -{ - private readonly HttpClient _httpClient; - private readonly OpenAiSettings _settings; - private static readonly JsonSerializerOptions JsonOptions = new(JsonSerializerDefaults.Web) - { - DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull - }; - - public OpenAiRagClient(HttpClient httpClient, IOptions options) - { - _httpClient = httpClient; - _settings = options.Value; - - if 
(!string.IsNullOrWhiteSpace(_settings.ApiKey)) - { - _httpClient.DefaultRequestHeaders.Authorization = new AuthenticationHeaderValue("Bearer", _settings.ApiKey); - } - - _httpClient.Timeout = TimeSpan.FromSeconds(Math.Max(15, _settings.TimeoutSeconds)); - _httpClient.BaseAddress = new Uri("https://api.openai.com/v1/"); - } - - public async Task CreateEmbeddingAsync(string input, CancellationToken ct) - { - EnsureConfigured(); - var payload = new { model = _settings.EmbeddingModel, input }; - using var response = await _httpClient.PostAsync("embeddings", ToJson(payload), ct); - var json = await response.Content.ReadAsStringAsync(ct); - if (!response.IsSuccessStatusCode) - { - throw new InvalidOperationException($"OpenAI embeddings request failed: {(int)response.StatusCode} {json}"); - } - - using var document = JsonDocument.Parse(json); - var embedding = document.RootElement.GetProperty("data")[0].GetProperty("embedding"); - var result = new float[embedding.GetArrayLength()]; - var i = 0; - foreach (var value in embedding.EnumerateArray()) - { - result[i++] = value.GetSingle(); - } - return result; - } - - public async Task CreateChatCompletionAsync(string systemPrompt, string userPrompt, CancellationToken ct) - { - EnsureConfigured(); - var payload = new - { - model = _settings.ChatModel, - temperature = 0.2, - response_format = new { type = "json_object" }, - messages = new[] - { - new { role = "system", content = systemPrompt }, - new { role = "user", content = userPrompt } - } - }; - - using var response = await _httpClient.PostAsync("chat/completions", ToJson(payload), ct); - var json = await response.Content.ReadAsStringAsync(ct); - if (!response.IsSuccessStatusCode) - { - throw new InvalidOperationException($"OpenAI chat request failed: {(int)response.StatusCode} {json}"); - } - - using var document = JsonDocument.Parse(json); - return document.RootElement - .GetProperty("choices")[0] - .GetProperty("message") - .GetProperty("content") - .GetString() ?? 
"{}"; - } - - private void EnsureConfigured() - { - if (string.IsNullOrWhiteSpace(_settings.ApiKey)) - { - throw new InvalidOperationException("OpenAI API key is not configured. Set OpenAI__ApiKey."); - } - } - - private static StringContent ToJson(T payload) => new( - JsonSerializer.Serialize(payload, JsonOptions), - Encoding.UTF8, - "application/json" - ); -} diff --git a/api/Services/Rag/PdfTextExtractor.cs b/api/Services/Rag/PdfTextExtractor.cs deleted file mode 100644 index dbf8133..0000000 --- a/api/Services/Rag/PdfTextExtractor.cs +++ /dev/null @@ -1,29 +0,0 @@ -using Api.Services.Contracts.Rag; -using System.Text; -using UglyToad.PdfPig; - -namespace Api.Services.Rag; - -public sealed class PdfTextExtractor : IPdfTextExtractor -{ - public string ExtractText(Stream pdfStream) - { - using var document = PdfDocument.Open(pdfStream); - var builder = new StringBuilder(); - - foreach (var page in document.GetPages()) - { - builder.AppendLine(page.Text); - builder.AppendLine(); - } - - return NormalizeWhitespace(builder.ToString()); - } - - private static string NormalizeWhitespace(string value) - { - if (string.IsNullOrWhiteSpace(value)) return string.Empty; - var parts = value.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries); - return string.Join(' ', parts).Trim(); - } -} diff --git a/api/Services/RecaptchaVerifier.cs b/api/Services/RecaptchaVerifier.cs index 973c071..7a2f417 100644 --- a/api/Services/RecaptchaVerifier.cs +++ b/api/Services/RecaptchaVerifier.cs @@ -1,3 +1,4 @@ +using Api.Services.Contracts.Models; using Api.Services.Contracts; using Api.Settings; using Microsoft.Extensions.Options; @@ -17,14 +18,14 @@ namespace Api.Services _log = log; } - public async Task VerifyAsync(string token, string? userIp, CancellationToken ct) + public async Task VerifyAsync(string token, string? userIp, CancellationToken ct) { _log.LogDebug("Verifying captcha token for IP {Ip}", userIp ?? 
"unknown"); if (string.IsNullOrWhiteSpace(_opt.SecretKey)) { _log.LogWarning("Captcha verification attempted but SecretKey is not configured"); - return new CaptchaVerdict(false, "Captcha not configured", null); + return new CaptchaVerdictModel(false, "Captcha not configured", null); } var form = new Dictionary @@ -45,21 +46,21 @@ namespace Api.Services { _log.LogWarning("Captcha HTTP request failed with status {StatusCode} for IP {Ip}", (int)resp.StatusCode, userIp ?? "unknown"); - return new CaptchaVerdict(false, $"Captcha HTTP {(int)resp.StatusCode}", null); + return new CaptchaVerdictModel(false, $"Captcha HTTP {(int)resp.StatusCode}", null); } var data = await resp.Content.ReadFromJsonAsync(cancellationToken: ct); if (data is null) { _log.LogError("Failed to parse captcha response for IP {Ip}", userIp ?? "unknown"); - return new CaptchaVerdict(false, "Captcha parse error", null); + return new CaptchaVerdictModel(false, "Captcha parse error", null); } if (!data.success) { _log.LogWarning("Captcha verification failed for IP {Ip}. Score={Score}", userIp ?? "unknown", data.score); - return new CaptchaVerdict(false, "Captcha failed", data.score); + return new CaptchaVerdictModel(false, "Captcha failed", data.score); } // v3 score check (score is typically null for v2) @@ -67,7 +68,7 @@ namespace Api.Services { _log.LogWarning("Captcha score {Score} below minimum {MinScore} for IP {Ip}", score, _opt.MinimumScore, userIp ?? "unknown"); - return new CaptchaVerdict(false, "Captcha score too low", score); + return new CaptchaVerdictModel(false, "Captcha score too low", score); } // Optional strictness (usually v3): action/hostname checks @@ -76,7 +77,7 @@ namespace Api.Services { _log.LogWarning("Captcha action mismatch. Expected={Expected}, Actual={Actual}, IP={Ip}", _opt.ExpectedAction, data.action, userIp ?? 
"unknown"); - return new CaptchaVerdict(false, "Captcha action mismatch", data.score); + return new CaptchaVerdictModel(false, "Captcha action mismatch", data.score); } if (!string.IsNullOrWhiteSpace(_opt.ExpectedHostname) && @@ -84,12 +85,12 @@ namespace Api.Services { _log.LogWarning("Captcha hostname mismatch. Expected={Expected}, Actual={Actual}, IP={Ip}", _opt.ExpectedHostname, data.hostname, userIp ?? "unknown"); - return new CaptchaVerdict(false, "Captcha hostname mismatch", data.score); + return new CaptchaVerdictModel(false, "Captcha hostname mismatch", data.score); } _log.LogInformation("Captcha verified successfully for IP {Ip}. Score={Score}", userIp ?? "unknown", data.score); - return new CaptchaVerdict(true, null, data.score); + return new CaptchaVerdictModel(true, null, data.score); } private sealed class RecaptchaResponse diff --git a/api/Services/SmtpEmailSender.cs b/api/Services/SmtpEmailSender.cs index cc61d3c..7bbfc5f 100644 --- a/api/Services/SmtpEmailSender.cs +++ b/api/Services/SmtpEmailSender.cs @@ -1,5 +1,5 @@ using Api.Services.Contracts; -using Api.Models; +using Api.Requests; using Microsoft.Extensions.Options; using MailKit.Net.Smtp; using MailKit.Security; diff --git a/api/api.csproj b/api/api.csproj index 4af1b72..3e988b6 100644 --- a/api/api.csproj +++ b/api/api.csproj @@ -1,4 +1,4 @@ - + net10.0 @@ -10,6 +10,7 @@ false Linux true + Api @@ -18,7 +19,6 @@ - diff --git a/api/appsettings.json b/api/appsettings.json index a4c7304..aa57e13 100644 --- a/api/appsettings.json +++ b/api/appsettings.json @@ -106,18 +106,8 @@ "FromEmail": "", "SubjectPrefix": "[File Download]" }, - "OpenAI": { - "ApiKey": "", - "ChatModel": "gpt-4o-mini", - "EmbeddingModel": "text-embedding-3-small", - "TimeoutSeconds": 60 - }, - "Rag": { - "MaxPdfSizeMb": 5, - "ChunkSize": 900, - "ChunkOverlap": 150, - "CvTtlMinutes": 60, - "MaxJobTextChars": 20000, - "TopK": 6 + "CvMatcherApi": { + "BaseUrl": "", + "InternalApiKey": "" } -} +} \ No newline at end of file diff 
--git a/cv-matcher-api/Controllers/CvController.cs b/cv-matcher-api/Controllers/CvController.cs new file mode 100644 index 0000000..3f0a382 --- /dev/null +++ b/cv-matcher-api/Controllers/CvController.cs @@ -0,0 +1,73 @@ +using Api.Requests; +using Api.Services.Contracts; +using Microsoft.AspNetCore.Mvc; + +namespace Api.Controllers; + +[ApiController] +[Route("api/cv")] +public sealed class CvController : ControllerBase +{ + private readonly ICvMatcherService _service; + private readonly ILogger _logger; + + public CvController(ICvMatcherService service, ILogger logger) + { + _service = service; + _logger = logger; + } + + [HttpPost("upload")] + [RequestSizeLimit(10 * 1024 * 1024)] + public async Task Upload([FromForm(Name = "cv")] IFormFile? cv, [FromForm] bool gdprConsent, CancellationToken ct) + { + try + { + if (cv is null) return BadRequest(new { error = "Missing CV PDF." }); + _logger.LogInformation("CV upload received. FileName={FileName}, Size={SizeBytes}, GdprConsent={GdprConsent}", cv.FileName, cv.Length, gdprConsent); + var result = await _service.UploadCvAsync(cv, gdprConsent, ct); + _logger.LogInformation("CV upload processed. CvDocumentId={CvDocumentId}, Cached={Cached}", result.DocumentId, result.Cached); + return Ok(result); + } + catch (InvalidOperationException ex) + { + _logger.LogWarning(ex, "Invalid CV upload request."); + return BadRequest(new { error = ex.Message }); + } + } + + [HttpPost("find-jobs")] + public async Task FindJobs([FromBody] FindJobsRequest request, CancellationToken ct) + { + try + { + _logger.LogInformation("Find jobs request received. CvDocumentId={CvDocumentId}, TopK={TopK}", request.CvDocumentId, request.TopK); + var result = await _service.FindJobsAsync(request, ct); + _logger.LogInformation("Find jobs completed. 
CvDocumentId={CvDocumentId}, ResultCount={ResultCount}", request.CvDocumentId, result.Jobs.Count); + return Ok(result); + } + catch (InvalidOperationException ex) + { + _logger.LogWarning(ex, "Invalid find jobs request."); + return BadRequest(new { error = ex.Message }); + } + } + + [HttpPost("match-job")] + public async Task MatchJob([FromBody] MatchJobRequest request, CancellationToken ct) + { + try + { + _logger.LogInformation("Match job request received. CvDocumentId={CvDocumentId}, HasJobUrl={HasJobUrl}, HasJobDescription={HasJobDescription}, EmailRequested={EmailRequested}", + request.CvDocumentId, !string.IsNullOrWhiteSpace(request.JobUrl), !string.IsNullOrWhiteSpace(request.JobDescription), !string.IsNullOrWhiteSpace(request.Email)); + var result = await _service.MatchJobAsync(request, ct); + _logger.LogInformation("Match job completed. CvDocumentId={CvDocumentId}, Score={Score}, Cached={Cached}", request.CvDocumentId, result.Score, result.Cached); + return Ok(result); + } + catch (InvalidOperationException ex) + { + _logger.LogWarning(ex, "Invalid match job request."); + return BadRequest(new { error = ex.Message }); + } + } +} diff --git a/cv-matcher-api/Database/schema.sql b/cv-matcher-api/Database/schema.sql new file mode 100644 index 0000000..05c96a4 --- /dev/null +++ b/cv-matcher-api/Database/schema.sql @@ -0,0 +1,25 @@ +IF OBJECT_ID('dbo.CvMatchResults', 'U') IS NULL +BEGIN + CREATE TABLE dbo.CvMatchResults ( + Id NVARCHAR(64) NOT NULL CONSTRAINT PK_CvMatchResults PRIMARY KEY, + CvDocumentId NVARCHAR(64) NOT NULL, + JobDocumentId NVARCHAR(64) NOT NULL, + ResultJson NVARCHAR(MAX) NOT NULL, + Score INT NOT NULL, + CreatedAt DATETIME2 NOT NULL CONSTRAINT DF_CvMatchResults_CreatedAt DEFAULT SYSUTCDATETIME() + ); + CREATE UNIQUE INDEX UX_CvMatchResults_CvJob ON dbo.CvMatchResults(CvDocumentId, JobDocumentId); +END +GO + +IF OBJECT_ID('dbo.CvMatcherChatCache', 'U') IS NULL +BEGIN + CREATE TABLE dbo.CvMatcherChatCache ( + CacheKey NVARCHAR(64) NOT NULL 
CONSTRAINT PK_CvMatcherChatCache PRIMARY KEY, + Model NVARCHAR(120) NOT NULL, + Temperature DECIMAL(4,2) NOT NULL, + ResponseText NVARCHAR(MAX) NOT NULL, + CreatedAt DATETIME2 NOT NULL CONSTRAINT DF_CvMatcherChatCache_CreatedAt DEFAULT SYSUTCDATETIME() + ); +END +GO diff --git a/cv-matcher-api/Dockerfile b/cv-matcher-api/Dockerfile new file mode 100644 index 0000000..13cb6d0 --- /dev/null +++ b/cv-matcher-api/Dockerfile @@ -0,0 +1,15 @@ +FROM mcr.microsoft.com/dotnet/aspnet:10.0 AS base +WORKDIR /app +EXPOSE 8080 + +FROM mcr.microsoft.com/dotnet/sdk:10.0 AS build +WORKDIR /src +COPY ["cv-matcher-api.csproj", "./"] +RUN dotnet restore "cv-matcher-api.csproj" +COPY . . +RUN dotnet publish "cv-matcher-api.csproj" -c Release -o /app/publish /p:UseAppHost=false + +FROM base AS final +WORKDIR /app +COPY --from=build /app/publish . +ENTRYPOINT ["dotnet", "cv-matcher-api.dll"] diff --git a/cv-matcher-api/Program.cs b/cv-matcher-api/Program.cs new file mode 100644 index 0000000..ef641ec --- /dev/null +++ b/cv-matcher-api/Program.cs @@ -0,0 +1,283 @@ +using Azure.Identity; +using Api.Services; +using Api.Services.Contracts; +using Api.Settings; +using Microsoft.AspNetCore.Diagnostics; +using Serilog; +using System.Reflection; + +DotNetEnv.Env.Load(); + +try +{ + var builder = WebApplication.CreateBuilder(args); + var appVersion = Assembly.GetExecutingAssembly() + .GetCustomAttribute()? + .InformationalVersion + ?? Assembly.GetExecutingAssembly().GetName().Version?.ToString() + ?? 
"unknown"; + + builder.Host.UseSerilog((context, services, configuration) => + { + configuration + .ReadFrom.Configuration(context.Configuration) + .ReadFrom.Services(services) + .Enrich.FromLogContext() + .Enrich.WithMachineName() + .Enrich.WithEnvironmentName() + .Enrich.WithProperty("Service", "cv-matcher-api") + .Enrich.WithProperty("AppVersion", appVersion) + .WriteTo.Console(new Serilog.Formatting.Json.JsonFormatter()); + }); + + Log.Information("Starting {Service} version {AppVersion}", "cv-matcher-api", appVersion); + + // -------------------- + // Azure Key Vault Configuration + // -------------------- + var keyVaultUri = builder.Configuration["KeyVault:VaultUri"]; + var keyVaultEnabled = builder.Configuration.GetValue("KeyVault:Enabled"); + + if (keyVaultEnabled && !string.IsNullOrWhiteSpace(keyVaultUri)) + { + Log.Information("Loading configuration from Azure Key Vault: {VaultUri}", keyVaultUri); + + try + { + builder.Configuration.AddAzureKeyVault( + new Uri(keyVaultUri), + new DefaultAzureCredential()); + + Log.Information("Azure Key Vault configuration loaded successfully"); + } + catch (Exception ex) + { + Log.Warning(ex, "Failed to load Azure Key Vault configuration. 
Continuing with other configuration sources."); + } + } + else + { + Log.Information("Azure Key Vault is disabled or not configured"); + } + + builder.Services.Configure(builder.Configuration.GetSection("RagApi")); + builder.Services.Configure(builder.Configuration.GetSection("InternalApi")); + builder.Services.Configure(builder.Configuration.GetSection("Ai")); + builder.Services.Configure(builder.Configuration.GetSection("Matcher")); + builder.Services.Configure(builder.Configuration.GetSection("Smtp")); + + builder.Services.AddHttpClient(); + builder.Services.AddHttpClient(); + builder.Services.AddHttpClient(); + builder.Services.AddSingleton(); + builder.Services.AddScoped(); + builder.Services.AddSingleton(); + + builder.Services.AddControllers(); + builder.Services.AddEndpointsApiExplorer(); + builder.Services.AddSwaggerGen(); + + var app = builder.Build(); + + var logger = app.Services.GetRequiredService>(); + logger.LogInformation("API starting up..."); + logger.LogInformation("Environment: {Environment}", app.Environment.EnvironmentName); + + // Log all environment variables and configuration settings at startup + // Can be controlled via appsettings: "Logging:LogEnvironmentOnStartup": true + var logEnvironmentOnStartup = app.Configuration.GetValue("Logging:LogEnvironmentOnStartup", defaultValue: true); + if (logEnvironmentOnStartup) + { + LogEnvironmentSettings(logger, app.Configuration, app.Environment); + } + + using (var scope = app.Services.CreateScope()) + { + var repository = scope.ServiceProvider.GetRequiredService(); + await repository.InitializeAsync(CancellationToken.None); + } + + app.UseSerilogRequestLogging(options => + { + options.MessageTemplate = "HTTP {RequestMethod} {RequestPath} responded {StatusCode} in {Elapsed:0.0000} ms"; + options.EnrichDiagnosticContext = (diagnosticContext, httpContext) => + { + diagnosticContext.Set("RequestHost", httpContext.Request.Host.Value); + diagnosticContext.Set("RequestScheme", 
httpContext.Request.Scheme);
+            diagnosticContext.Set("RemoteIP", httpContext.Connection.RemoteIpAddress?.ToString());
+            diagnosticContext.Set("UserAgent", httpContext.Request.Headers.UserAgent.ToString());
+        };
+    });
+
+    app.UseExceptionHandler(errorApp =>
+    {
+        errorApp.Run(async context =>
+        {
+            var feature = context.Features.Get();
+            var logger = context.RequestServices.GetRequiredService>();
+            if (feature?.Error is not null)
+            {
+                logger.LogError(feature.Error, "Unhandled exception in {Service}", "cv-matcher-api");
+            }
+
+            context.Response.StatusCode = StatusCodes.Status500InternalServerError;
+            context.Response.ContentType = "application/json";
+            await context.Response.WriteAsJsonAsync(new { error = "Unexpected server error." });
+        });
+    });
+
+    // Internal API-key gate. When InternalApi:RequireApiKey is true, every request must carry an
+    // X-Internal-Api-Key header equal to the configured key; anything else is answered with 401
+    // and never reaches the endpoints mapped below.
+    app.Use(async (context, next) =>
+    {
+        // NOTE(review): the generic arguments of the two GetRequiredService calls in this lambda
+        // were lost when this diff was extracted. IOptions<InternalApiSettings> is presumed from
+        // the "InternalApi" configuration section and ILogger<Program> from convention - confirm
+        // both against the original commit.
+        var settings = context.RequestServices
+            .GetRequiredService<Microsoft.Extensions.Options.IOptions<InternalApiSettings>>().Value;
+        if (settings.RequireApiKey)
+        {
+            var header = context.Request.Headers["X-Internal-Api-Key"].ToString();
+
+            // Fail closed when no key is configured. The comparison is fixed-time so the response
+            // latency does not reveal how many leading characters of a guessed key were correct.
+            var authorized = !string.IsNullOrWhiteSpace(settings.ApiKey)
+                && System.Security.Cryptography.CryptographicOperations.FixedTimeEquals(
+                    System.Text.Encoding.UTF8.GetBytes(header),
+                    System.Text.Encoding.UTF8.GetBytes(settings.ApiKey));
+
+            if (!authorized)
+            {
+                var logger = context.RequestServices.GetRequiredService<ILogger<Program>>();
+                logger.LogWarning("Rejected unauthorized internal API call. Path={Path}, RemoteIP={RemoteIP}", context.Request.Path, context.Connection.RemoteIpAddress?.ToString());
+                context.Response.StatusCode = StatusCodes.Status401Unauthorized;
+                await context.Response.WriteAsJsonAsync(new { error = "Unauthorized internal API call." });
+                return;
+            }
+        }
+
+        await next();
+    });
+
+    // Swagger: an explicit "Swagger:Enabled" setting wins (the switch documented in the README);
+    // when the setting is absent the previous behaviour - Development only - is kept.
+    var swaggerEnabled = app.Configuration.GetValue<bool?>("Swagger:Enabled") ?? app.Environment.IsDevelopment();
+    if (swaggerEnabled)
+    {
+        app.UseSwagger();
+        app.UseSwaggerUI(options =>
+        {
+            options.DocumentTitle = "cv-matcher-api";
+            options.SwaggerEndpoint("/swagger/v1/swagger.json", "cv-matcher-api v1");
+            options.RoutePrefix = "swagger";
+        });
+    }
+
+    app.MapControllers();
+    app.MapGet("/health", () => Results.Ok(new { status = "ok", service = "cv-matcher-api", version = appVersion, timeUtc = DateTimeOffset.UtcNow }));
+
+    Log.Information("{Service} startup complete", "cv-matcher-api");
+    app.Run();
+}
+catch (Exception ex)
+{
+    Log.Fatal(ex, "cv-matcher-api terminated unexpectedly");
+}
+finally
+{
+    Log.Information("Shutting down cv-matcher-api");
+    Log.CloseAndFlush();
+}
+
+///
+/// Logs all environment variables and configuration settings at startup for diagnostics.
+///
+static void LogEnvironmentSettings(Microsoft.Extensions.Logging.ILogger logger, IConfiguration configuration, IWebHostEnvironment environment)
+{
+    logger.LogInformation("==================== ENVIRONMENT SETTINGS ====================");
+
+    // Environment Information
+    logger.LogInformation("Application Name: {ApplicationName}", environment.ApplicationName);
+    logger.LogInformation("Environment Name: {EnvironmentName}", environment.EnvironmentName);
+    logger.LogInformation("Content Root Path: {ContentRootPath}", environment.ContentRootPath);
+    logger.LogInformation("Web Root Path: {WebRootPath}", environment.WebRootPath);
+
+    // Environment Variables
+    logger.LogInformation("-------------- Environment Variables --------------");
+    var envVars = Environment.GetEnvironmentVariables();
+    var sortedEnvVars = new SortedDictionary();
+
+    foreach (System.Collections.DictionaryEntry entry in envVars)
+    {
+        var key = entry.Key?.ToString() ?? string.Empty;
+        var value = entry.Value?.ToString() ??
string.Empty; + + // Mask sensitive values (passwords, secrets, tokens, keys) but show last 4 characters + if (IsSensitiveKey(key)) + { + value = MaskValueWithLastChars(value); + } + + sortedEnvVars[key] = value; + } + + foreach (var kvp in sortedEnvVars) + { + logger.LogInformation(" {Key} = {Value}", kvp.Key, kvp.Value); + } + + // Configuration Settings + logger.LogInformation("-------------- Configuration Settings --------------"); + LogConfigurationRecursive(logger, configuration.GetChildren(), ""); + + logger.LogInformation("==========================================================="); +} + +/// +/// Recursively logs configuration settings with hierarchy. +/// +static void LogConfigurationRecursive(Microsoft.Extensions.Logging.ILogger logger, IEnumerable sections, string prefix) +{ + foreach (var section in sections) + { + var key = string.IsNullOrEmpty(prefix) ? section.Key : $"{prefix}:{section.Key}"; + + if (section.Value != null) + { + var value = section.Value; + + // Mask sensitive configuration values but show last 4 characters + if (IsSensitiveKey(key)) + { + value = MaskValueWithLastChars(value); + } + + logger.LogInformation(" {Key} = {Value}", key, value); + } + + // Recurse into child sections + if (section.GetChildren().Any()) + { + LogConfigurationRecursive(logger, section.GetChildren(), key); + } + } +} + +/// +/// Checks if a configuration key contains sensitive information. +/// +static bool IsSensitiveKey(string key) +{ + return key.Contains("Password", StringComparison.OrdinalIgnoreCase) || + key.Contains("Secret", StringComparison.OrdinalIgnoreCase) || + key.Contains("Token", StringComparison.OrdinalIgnoreCase) || + key.Contains("Key", StringComparison.OrdinalIgnoreCase) || + key.Contains("ConnectionString", StringComparison.OrdinalIgnoreCase); +} + +/// +/// Masks a sensitive value but shows the last 4 characters for verification. +/// +/// The value to mask. 
+/// Masked value showing last 4 characters (e.g., "***MASKED***...abcd") +static string MaskValueWithLastChars(string value) +{ + if (string.IsNullOrEmpty(value)) + { + return "***NOT SET***"; + } + + // If value is too short, just mask it completely + if (value.Length <= 4) + { + return "***MASKED***"; + } + + // Show last 4 characters + var lastChars = value.Substring(value.Length - 4); + return $"***MASKED***...{lastChars}"; +} \ No newline at end of file diff --git a/cv-matcher-api/Properties/launchSettings.json b/cv-matcher-api/Properties/launchSettings.json new file mode 100644 index 0000000..03a22f4 --- /dev/null +++ b/cv-matcher-api/Properties/launchSettings.json @@ -0,0 +1,12 @@ +{ + "profiles": { + "cv-matcher-api": { + "commandName": "Project", + "launchBrowser": true, + "environmentVariables": { + "ASPNETCORE_ENVIRONMENT": "Development" + }, + "applicationUrl": "https://localhost:58423;http://localhost:58425" + } + } +} \ No newline at end of file diff --git a/cv-matcher-api/Requests/FindJobsRequest.cs b/cv-matcher-api/Requests/FindJobsRequest.cs new file mode 100644 index 0000000..ef6c742 --- /dev/null +++ b/cv-matcher-api/Requests/FindJobsRequest.cs @@ -0,0 +1,9 @@ +namespace Api.Requests +{ + public sealed class FindJobsRequest + { + public required string CvDocumentId { get; init; } + public int? TopK { get; init; } + public string? Email { get; init; } + } +} diff --git a/cv-matcher-api/Requests/MatchJobRequest.cs b/cv-matcher-api/Requests/MatchJobRequest.cs new file mode 100644 index 0000000..429ae16 --- /dev/null +++ b/cv-matcher-api/Requests/MatchJobRequest.cs @@ -0,0 +1,11 @@ +namespace Api.Requests +{ + public sealed class MatchJobRequest + { + public string? CvDocumentId { get; set; } + public string? JobUrl { get; set; } + public string? JobDescription { get; set; } + public bool GdprConsent { get; set; } + public string? 
Email { get; set; } + } +} diff --git a/cv-matcher-api/Requests/RagSearchRequest.cs b/cv-matcher-api/Requests/RagSearchRequest.cs new file mode 100644 index 0000000..04a1089 --- /dev/null +++ b/cv-matcher-api/Requests/RagSearchRequest.cs @@ -0,0 +1,9 @@ +namespace Api.Requests +{ + public sealed class RagSearchRequest + { + public required string QueryText { get; init; } + public IReadOnlyList? TargetDocumentTypes { get; init; } + public int? TopK { get; init; } + } +} diff --git a/cv-matcher-api/Responses/CvUploadResponse.cs b/cv-matcher-api/Responses/CvUploadResponse.cs new file mode 100644 index 0000000..726a9e0 --- /dev/null +++ b/cv-matcher-api/Responses/CvUploadResponse.cs @@ -0,0 +1,14 @@ +namespace Api.Responses +{ + public sealed class CvUploadResponse + { + public required string DocumentId { get; init; } + public required string TextHash { get; init; } + public required string DocumentType { get; init; } + public required string Title { get; init; } + public int Chunks { get; init; } + public int Characters { get; init; } + public bool Cached { get; init; } + public string Summary { get; init; } = "CV indexed successfully."; + } +} diff --git a/cv-matcher-api/Responses/FindJobsResponse.cs b/cv-matcher-api/Responses/FindJobsResponse.cs new file mode 100644 index 0000000..0adc942 --- /dev/null +++ b/cv-matcher-api/Responses/FindJobsResponse.cs @@ -0,0 +1,8 @@ +namespace Api.Responses +{ + public sealed class FindJobsResponse + { + public required string CvDocumentId { get; init; } + public IReadOnlyList Jobs { get; init; } = []; + } +} diff --git a/cv-matcher-api/Responses/JobMatchResponse.cs b/cv-matcher-api/Responses/JobMatchResponse.cs new file mode 100644 index 0000000..2f54ac3 --- /dev/null +++ b/cv-matcher-api/Responses/JobMatchResponse.cs @@ -0,0 +1,15 @@ +namespace Api.Responses +{ + public sealed class JobMatchResponse + { + public int Score { get; set; } + public string Summary { get; set; } = string.Empty; + public List Strengths { get; set; } = 
[]; + public List Gaps { get; set; } = []; + public List Recommendations { get; set; } = []; + public List Evidence { get; set; } = []; + public bool Cached { get; set; } + public string? JobDocumentId { get; set; } + public string? JobUrl { get; set; } + } +} diff --git a/cv-matcher-api/Responses/RagIndexResponse.cs b/cv-matcher-api/Responses/RagIndexResponse.cs new file mode 100644 index 0000000..a62eb69 --- /dev/null +++ b/cv-matcher-api/Responses/RagIndexResponse.cs @@ -0,0 +1,14 @@ +namespace Api.Responses +{ + public sealed class RagIndexResponse + { + public required string DocumentId { get; init; } + public required string TextHash { get; init; } + public required string DocumentType { get; init; } + public double DocumentTypeConfidence { get; init; } + public required string Title { get; init; } + public int Chunks { get; init; } + public int Characters { get; init; } + public bool Cached { get; init; } + } +} diff --git a/cv-matcher-api/Responses/RagSearchResponse.cs b/cv-matcher-api/Responses/RagSearchResponse.cs new file mode 100644 index 0000000..9a72539 --- /dev/null +++ b/cv-matcher-api/Responses/RagSearchResponse.cs @@ -0,0 +1,34 @@ +namespace Api.Responses +{ + public sealed class RagSearchResponse + { + public IReadOnlyList Results { get; init; } = []; + } + + public sealed class RagDocumentDetails + { + public required string Id { get; init; } + public required string DocumentType { get; init; } + public required string Title { get; init; } + public string? SourceUrl { get; init; } + public required string Text { get; init; } + public required string TextHash { get; init; } + } + public sealed class RagSearchDocumentResult + { + public required string DocumentId { get; init; } + public required string DocumentType { get; init; } + public required string Title { get; init; } + public string? 
SourceUrl { get; init; } + public double Score { get; init; } + public IReadOnlyList MatchedChunks { get; init; } = []; + } + + public sealed class RagSearchChunkResult + { + public required string ChunkId { get; init; } + public int ChunkIndex { get; init; } + public required string Text { get; init; } + public double Score { get; init; } + } +} diff --git a/cv-matcher-api/Services/Contracts/ICvMatcherService.cs b/cv-matcher-api/Services/Contracts/ICvMatcherService.cs new file mode 100644 index 0000000..b1c4f17 --- /dev/null +++ b/cv-matcher-api/Services/Contracts/ICvMatcherService.cs @@ -0,0 +1,11 @@ +using Api.Requests; +using Api.Responses; + +namespace Api.Services.Contracts; + +public interface ICvMatcherService +{ + Task UploadCvAsync(IFormFile file, bool gdprConsent, CancellationToken ct); + Task MatchJobAsync(MatchJobRequest request, CancellationToken ct); + Task FindJobsAsync(FindJobsRequest request, CancellationToken ct); +} diff --git a/cv-matcher-api/Services/Contracts/IEmailService.cs b/cv-matcher-api/Services/Contracts/IEmailService.cs new file mode 100644 index 0000000..7eecca5 --- /dev/null +++ b/cv-matcher-api/Services/Contracts/IEmailService.cs @@ -0,0 +1,6 @@ +namespace Api.Services.Contracts; + +public interface IEmailService +{ + Task SendMatchAsync(string? 
explicitTo, string subject, string body, CancellationToken ct); +} diff --git a/api/Services/Contracts/Rag/IJobTextExtractor.cs b/cv-matcher-api/Services/Contracts/IJobTextExtractor.cs similarity index 77% rename from api/Services/Contracts/Rag/IJobTextExtractor.cs rename to cv-matcher-api/Services/Contracts/IJobTextExtractor.cs index 1c70ddc..850521c 100644 --- a/api/Services/Contracts/Rag/IJobTextExtractor.cs +++ b/cv-matcher-api/Services/Contracts/IJobTextExtractor.cs @@ -1,4 +1,4 @@ -namespace Api.Services.Contracts.Rag; +namespace Api.Services.Contracts; public interface IJobTextExtractor { diff --git a/cv-matcher-api/Services/Contracts/IMatcherAiClient.cs b/cv-matcher-api/Services/Contracts/IMatcherAiClient.cs new file mode 100644 index 0000000..2a640b0 --- /dev/null +++ b/cv-matcher-api/Services/Contracts/IMatcherAiClient.cs @@ -0,0 +1,6 @@ +namespace Api.Services.Contracts; + +public interface IMatcherAiClient +{ + Task CreateChatCompletionAsync(string systemPrompt, string userPrompt, decimal temperature, CancellationToken ct); +} diff --git a/cv-matcher-api/Services/Contracts/IMatcherRepository.cs b/cv-matcher-api/Services/Contracts/IMatcherRepository.cs new file mode 100644 index 0000000..6fa427e --- /dev/null +++ b/cv-matcher-api/Services/Contracts/IMatcherRepository.cs @@ -0,0 +1,12 @@ +using Api.Responses; + +namespace Api.Services.Contracts; + +public interface IMatcherRepository +{ + Task InitializeAsync(CancellationToken ct); + Task GetMatchAsync(string cvDocumentId, string jobDocumentId, CancellationToken ct); + Task SaveMatchAsync(string cvDocumentId, string jobDocumentId, JobMatchResponse response, CancellationToken ct); + Task GetChatCompletionAsync(string cacheKey, CancellationToken ct); + Task SaveChatCompletionAsync(string cacheKey, string model, decimal temperature, string responseText, CancellationToken ct); +} diff --git a/cv-matcher-api/Services/Contracts/IRagApiClient.cs b/cv-matcher-api/Services/Contracts/IRagApiClient.cs new file mode 
100644 index 0000000..2d823c6 --- /dev/null +++ b/cv-matcher-api/Services/Contracts/IRagApiClient.cs @@ -0,0 +1,12 @@ +using Api.Requests; +using Api.Responses; + +namespace Api.Services.Contracts; + +public interface IRagApiClient +{ + Task IndexCvPdfAsync(IFormFile file, CancellationToken ct); + Task IndexJobTextAsync(string text, string? url, string? title, CancellationToken ct); + Task GetDocumentAsync(string documentId, CancellationToken ct); + Task SearchAsync(RagSearchRequest request, CancellationToken ct); +} diff --git a/cv-matcher-api/Services/CvMatcherService.cs b/cv-matcher-api/Services/CvMatcherService.cs new file mode 100644 index 0000000..440670e --- /dev/null +++ b/cv-matcher-api/Services/CvMatcherService.cs @@ -0,0 +1,201 @@ +using System.Text.Json; +using Api.Requests; +using Api.Responses; +using Api.Services.Contracts; +using Api.Settings; +using Microsoft.Extensions.Options; + +namespace Api.Services; + +public sealed class CvMatcherService : ICvMatcherService +{ + private readonly IRagApiClient _rag; + private readonly IJobTextExtractor _jobTextExtractor; + private readonly IMatcherAiClient _ai; + private readonly IMatcherRepository _repository; + private readonly IEmailService _email; + private readonly MatcherSettings _settings; + + public CvMatcherService( + IRagApiClient rag, + IJobTextExtractor jobTextExtractor, + IMatcherAiClient ai, + IMatcherRepository repository, + IEmailService email, + IOptions options) + { + _rag = rag; + _jobTextExtractor = jobTextExtractor; + _ai = ai; + _repository = repository; + _email = email; + _settings = options.Value; + } + + public async Task UploadCvAsync(IFormFile file, bool gdprConsent, CancellationToken ct) + { + if (!gdprConsent) throw new InvalidOperationException("GDPR consent is required."); + var response = await _rag.IndexCvPdfAsync(file, ct); + return new CvUploadResponse + { + DocumentId = response.DocumentId, + TextHash = response.TextHash, + DocumentType = response.DocumentType, + Title = 
response.Title, + Chunks = response.Chunks, + Characters = response.Characters, + Cached = response.Cached, + Summary = response.Cached ? "CV already indexed. Cached data reused." : "CV indexed successfully." + }; + } + + public async Task FindJobsAsync(FindJobsRequest request, CancellationToken ct) + { + var cv = await _rag.GetDocumentAsync(request.CvDocumentId, ct) ?? throw new InvalidOperationException("CV document not found."); + if (!string.Equals(cv.DocumentType, "cv", StringComparison.OrdinalIgnoreCase)) + { + throw new InvalidOperationException("The provided document is not a CV."); + } + + var search = await _rag.SearchAsync(new RagSearchRequest + { + QueryText = BuildCvSearchProfile(cv.Text), + TargetDocumentTypes = ["job"], + TopK = request.TopK ?? _settings.TopK + }, ct); + + var deepScoreLimit = Math.Clamp(_settings.DeepScoreTopN, 1, 10); + var jobs = new List(); + foreach (var result in search.Results.Take(deepScoreLimit)) + { + var job = await _rag.GetDocumentAsync(result.DocumentId, ct); + if (job is null) continue; + jobs.Add(await ScorePairAsync(cv, job, result.MatchedChunks.Select(x => x.Text).ToArray(), request.Email, ct)); + } + + return new FindJobsResponse { CvDocumentId = request.CvDocumentId, Jobs = jobs }; + } + + public async Task MatchJobAsync(MatchJobRequest request, CancellationToken ct) + { + if (!request.GdprConsent) throw new InvalidOperationException("GDPR consent is required."); + if (string.IsNullOrWhiteSpace(request.CvDocumentId)) throw new InvalidOperationException("Missing CV document id."); + + var cv = await _rag.GetDocumentAsync(request.CvDocumentId, ct) ?? throw new InvalidOperationException("CV document not found."); + var jobText = await _jobTextExtractor.ExtractAsync(request.JobUrl, request.JobDescription, ct); + if (jobText.Length < 80) throw new InvalidOperationException("Could not extract enough job text. 
Paste the job description manually.");
+
+        var job = await _rag.IndexJobTextAsync(jobText, request.JobUrl, ExtractJobTitle(jobText), ct);
+        var jobDocument = await _rag.GetDocumentAsync(job.DocumentId, ct) ?? throw new InvalidOperationException("Indexed job document not found.");
+
+        var search = await _rag.SearchAsync(new RagSearchRequest
+        {
+            QueryText = BuildCvSearchProfile(cv.Text),
+            TargetDocumentTypes = ["job"],
+            TopK = Math.Max(5, _settings.TopK)
+        }, ct);
+
+        var matchedChunks = search.Results
+            .FirstOrDefault(x => x.DocumentId == job.DocumentId)?
+            .MatchedChunks.Select(x => x.Text).ToArray() ?? [];
+
+        return await ScorePairAsync(cv, jobDocument, matchedChunks, request.Email, ct);
+    }
+
+    /// <summary>
+    /// Scores one CV/job pair with the LLM, reusing the stored result when the pair was scored before.
+    /// </summary>
+    /// <param name="cv">CV document as returned by the RAG API.</param>
+    /// <param name="job">Job document as returned by the RAG API.</param>
+    /// <param name="evidenceChunks">Matched job chunks from semantic search; when empty, the first 4000 characters of the job text are used as evidence instead.</param>
+    /// <param name="email">Optional recipient handed to the email service.</param>
+    /// <param name="ct">Cancellation token.</param>
+    /// <returns>
+    /// The stored or freshly computed match. When the model reply cannot be parsed, a zero-score
+    /// placeholder is returned that is neither stored nor emailed, so the pair is scored again on
+    /// the next request instead of being cached as a permanent 0.
+    /// </returns>
+    private async Task<JobMatchResponse> ScorePairAsync(RagDocumentDetails cv, RagDocumentDetails job, IReadOnlyList<string> evidenceChunks, string? email, CancellationToken ct)
+    {
+        var cached = await _repository.GetMatchAsync(cv.Id, job.Id, ct);
+        if (cached is not null)
+        {
+            // Results are saved with Cached = false (see below), so flag the hit here in case the
+            // repository hands the response back exactly as it was stored.
+            cached.Cached = true;
+            return cached;
+        }
+
+        var cvText = Limit(cv.Text, 18000);
+        var jobText = Limit(job.Text, 14000);
+        var evidence = evidenceChunks.Count > 0 ? string.Join("\n\n", evidenceChunks.Take(4)) : Limit(job.Text, 4000);
+
+        const string systemPrompt = """
+            You are a strict CV-to-job matching engine. Return JSON only. Score realistically from 0 to 100.
+            Penalize missing required skills. Do not invent experience. Use concise business language.
+            JSON shape: {"score":number,"summary":"...","strengths":["..."],"gaps":["..."],"recommendations":["..."],"evidence":["..."]}
+            """;
+
+        var userPrompt = $"""
+            CV:
+            {cvText}
+
+            JOB:
+            {jobText}
+
+            SEMANTICALLY MATCHED JOB EVIDENCE:
+            {evidence}
+            """;
+
+        var json = await _ai.CreateChatCompletionAsync(systemPrompt, userPrompt, 0.2m, ct);
+        var result = ParseResult(json);
+        if (result is null)
+        {
+            // Do not persist or email the placeholder: saving it would pin this CV/job pair to a
+            // score of 0 for every later request.
+            return new JobMatchResponse
+            {
+                Score = 0,
+                Summary = "The AI response could not be parsed as structured JSON.",
+                Recommendations = ["Inspect the raw model output and tune the scoring prompt."],
+                JobDocumentId = job.Id,
+                JobUrl = job.SourceUrl,
+                Cached = false
+            };
+        }
+
+        result.JobDocumentId = job.Id;
+        result.JobUrl = job.SourceUrl;
+        result.Cached = false;
+        await _repository.SaveMatchAsync(cv.Id, job.Id, result, ct);
+
+        await _email.SendMatchAsync(
+            email,
+            $"MyAi.ro CV Match: {result.Score}% - {job.Title}",
+            BuildEmailBody(cv, job, result),
+            ct);
+
+        return result;
+    }
+
+    // Shared serializer options (web defaults); reused instead of allocating a new instance per call.
+    private static readonly JsonSerializerOptions ResultJsonOptions = new(JsonSerializerDefaults.Web);
+
+    /// <summary>
+    /// Parses the model reply into a <see cref="JobMatchResponse"/>.
+    /// </summary>
+    /// <param name="json">Raw model reply; may be null, empty, or JSON surrounded by other text.</param>
+    /// <returns>The parsed result with the score clamped to 0-100 and null collections replaced by empty ones, or <c>null</c> when no JSON object can be read.</returns>
+    private static JobMatchResponse? ParseResult(string? json)
+    {
+        if (string.IsNullOrWhiteSpace(json)) return null;
+
+        // The reply may be wrapped in Markdown fences or prose even though the prompt asks for JSON
+        // only, so parse just the outermost {...} span.
+        var start = json.IndexOf('{');
+        var end = json.LastIndexOf('}');
+        if (start < 0 || end < start) return null;
+
+        try
+        {
+            var parsed = JsonSerializer.Deserialize<JobMatchResponse>(json[start..(end + 1)], ResultJsonOptions);
+            if (parsed is null) return null;
+
+            // The model output is untrusted: keep the score in range and make sure the lists that
+            // BuildEmailBody joins are never null (an explicit JSON null would otherwise throw there).
+            parsed.Score = Math.Clamp(parsed.Score, 0, 100);
+            parsed.Summary ??= string.Empty;
+            parsed.Strengths ??= [];
+            parsed.Gaps ??= [];
+            parsed.Recommendations ??= [];
+            parsed.Evidence ??= [];
+            return parsed;
+        }
+        catch (JsonException)
+        {
+            return null;
+        }
+    }
+
+    private static string BuildCvSearchProfile(string cvText)
+    {
+        var text = Limit(cvText, 10000);
+        return $"Candidate profile, skills, technologies, seniority, industry experience, project experience: {text}";
+    }
+
+    private static string ExtractJobTitle(string jobText)
+    {
+        var first = jobText.Split('.', '\n', '\r').Select(x => x.Trim()).FirstOrDefault(x => x.Length is > 8 and < 140);
+        return first ?? "Job description";
+    }
+
+    private static string Limit(string value, int max) => value.Length <= max ? value : value[..max];
+
+    private static string BuildEmailBody(RagDocumentDetails cv, RagDocumentDetails job, JobMatchResponse result) => $"""
+        CV Matcher result
+
+        CV: {cv.Title}
+        Job: {job.Title}
+        Job URL: {job.SourceUrl ??
"N/A"} + Score: {result.Score}% + + Summary: + {result.Summary} + + Strengths: + - {string.Join("\n- ", result.Strengths)} + + Gaps: + - {string.Join("\n- ", result.Gaps)} + + Recommendations: + - {string.Join("\n- ", result.Recommendations)} + """; +} diff --git a/cv-matcher-api/Services/EmailService.cs b/cv-matcher-api/Services/EmailService.cs new file mode 100644 index 0000000..cc5eb51 --- /dev/null +++ b/cv-matcher-api/Services/EmailService.cs @@ -0,0 +1,46 @@ +using Api.Services.Contracts; +using Api.Settings; +using MailKit.Net.Smtp; +using MailKit.Security; +using Microsoft.Extensions.Options; +using MimeKit; + +namespace Api.Services; + +public sealed class EmailService : IEmailService +{ + private readonly SmtpSettings _settings; + private readonly ILogger _logger; + + public EmailService(IOptions options, ILogger logger) + { + _settings = options.Value; + _logger = logger; + } + + public async Task SendMatchAsync(string? explicitTo, string subject, string body, CancellationToken ct) + { + var to = !string.IsNullOrWhiteSpace(explicitTo) ? explicitTo : _settings.ToEmail; + if (string.IsNullOrWhiteSpace(_settings.Host) || string.IsNullOrWhiteSpace(to)) + { + _logger.LogInformation("SMTP is not configured. Skipping CV matcher email."); + return; + } + + var message = new MimeMessage(); + message.From.Add(MailboxAddress.Parse(_settings.FromEmail)); + message.To.Add(MailboxAddress.Parse(to)); + message.Subject = subject; + message.Body = new TextPart("plain") { Text = body }; + + using var client = new SmtpClient(); + var secureSocket = _settings.UseStartTls ? 
SecureSocketOptions.StartTls : SecureSocketOptions.Auto;
+        await client.ConnectAsync(_settings.Host, _settings.Port, secureSocket, ct);
+        if (!string.IsNullOrWhiteSpace(_settings.Username))
+        {
+            await client.AuthenticateAsync(_settings.Username, _settings.Password, ct);
+        }
+        await client.SendAsync(message, ct);
+        await client.DisconnectAsync(true, ct);
+    }
+}
diff --git a/cv-matcher-api/Services/HashHelper.cs b/cv-matcher-api/Services/HashHelper.cs
new file mode 100644
index 0000000..4081528
--- /dev/null
+++ b/cv-matcher-api/Services/HashHelper.cs
@@ -0,0 +1,18 @@
+using System.Security.Cryptography;
+using System.Text;
+
+namespace Api.Services;
+
+/// <summary>
+/// SHA-256 hashing helper.
+/// </summary>
+public static class HashHelper
+{
+    /// <summary>
+    /// Computes the SHA-256 digest of the UTF-8 encoding of <paramref name="value"/>.
+    /// </summary>
+    /// <param name="value">Text to hash; <c>null</c> is hashed as the empty string.</param>
+    /// <returns>The digest as 64 uppercase hexadecimal characters.</returns>
+    public static string Compute(string? value) =>
+        Convert.ToHexString(SHA256.HashData(Encoding.UTF8.GetBytes(value ?? string.Empty)));
+}
diff --git a/api/Services/Rag/JobTextExtractor.cs b/cv-matcher-api/Services/JobTextExtractor.cs
similarity index 62%
rename from api/Services/Rag/JobTextExtractor.cs
rename to cv-matcher-api/Services/JobTextExtractor.cs
index 719c06d..6d63616 100644
--- a/api/Services/Rag/JobTextExtractor.cs
+++ b/cv-matcher-api/Services/JobTextExtractor.cs
@@ -1,21 +1,22 @@
 using System.Net;
 using System.Text.RegularExpressions;
-using Api.Services.Contracts.Rag;
+using Api.Services.Contracts;
 using Api.Settings;
 using Microsoft.Extensions.Options;
 
-namespace Api.Services.Rag;
+namespace Api.Services;
+
 public sealed class JobTextExtractor : IJobTextExtractor
 {
-    private readonly HttpClient _httpClient;
-    private readonly RagSettings _settings;
+    private readonly HttpClient _http;
+    private readonly MatcherSettings _settings;
 
-    public JobTextExtractor(HttpClient httpClient, IOptions options)
+    public JobTextExtractor(HttpClient http, IOptions options)
     {
-        _httpClient = httpClient;
+        _http = http;
         _settings = options.Value;
-        _httpClient.Timeout = TimeSpan.FromSeconds(20);
-
/// <summary>
/// Chat-completion client for the CV matcher. Sends a system/user prompt pair to OpenAI or Ollama
/// (chosen by <c>AiSettings.Provider</c>) and caches every answer through
/// <see cref="IMatcherRepository"/> under a hash of provider, model, temperature and both prompts.
/// </summary>
public sealed class MatcherAiClient : IMatcherAiClient
{
    private const string OpenAiChatUrl = "https://api.openai.com/v1/chat/completions";

    /// <summary>Value returned when the provider's message content is JSON <c>null</c>.</summary>
    private const string EmptyJsonObject = "{}";

    private static readonly JsonSerializerOptions JsonOptions = new(JsonSerializerDefaults.Web)
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull
    };

    private readonly HttpClient _http;
    private readonly IMatcherRepository _repository;
    private readonly AiSettings _settings;

    /// <summary>Creates the client from an injected <see cref="HttpClient"/>, cache repository and AI settings.</summary>
    public MatcherAiClient(HttpClient http, IMatcherRepository repository, IOptions<AiSettings> options)
    {
        _http = http;
        _repository = repository;
        _settings = options.Value;
    }

    /// <summary>
    /// Returns the model's answer for the given prompts, reading it from the repository cache when an
    /// identical request (same provider, model, temperature and prompts) was answered before.
    /// </summary>
    /// <param name="systemPrompt">Content of the <c>system</c> message.</param>
    /// <param name="userPrompt">Content of the <c>user</c> message.</param>
    /// <param name="temperature">Sampling temperature forwarded to the provider.</param>
    /// <param name="ct">Cancels the cache lookup, the provider call and the cache write.</param>
    /// <returns>The message content returned by the provider, or <c>"{}"</c> when that content is null.</returns>
    /// <exception cref="InvalidOperationException">
    /// The OpenAI key is missing, the provider answered with a non-success status code, or the response
    /// body does not have the expected shape.
    /// </exception>
    public async Task<string> CreateChatCompletionAsync(string systemPrompt, string userPrompt, decimal temperature, CancellationToken ct)
    {
        var model = GetModel();

        // Invariant culture keeps the cache key independent of the host's regional settings; a
        // comma-decimal culture would otherwise hash "0,20" instead of "0.20" for the same request.
        var temperatureText = temperature.ToString("0.00", System.Globalization.CultureInfo.InvariantCulture);
        var cacheKey = HashHelper.Compute($"chat:{_settings.Provider}:{model}:{temperatureText}:{systemPrompt}:{userPrompt}");

        var cached = await _repository.GetChatCompletionAsync(cacheKey, ct);
        if (cached is not null)
        {
            return cached;
        }

        var response = IsOllama()
            ? await CreateOllamaChatCompletionAsync(systemPrompt, userPrompt, temperature, ct)
            : await CreateOpenAiChatCompletionAsync(systemPrompt, userPrompt, temperature, ct);

        await _repository.SaveChatCompletionAsync(cacheKey, model, temperature, response, ct);
        return response;
    }

    private bool IsOllama() => string.Equals(_settings.Provider, "Ollama", StringComparison.OrdinalIgnoreCase);

    private string GetModel() => IsOllama() ? _settings.Ollama.ChatModel : _settings.OpenAI.ChatModel;

    /// <summary>Calls the OpenAI chat completions endpoint and returns the first choice's message content.</summary>
    private async Task<string> CreateOpenAiChatCompletionAsync(string systemPrompt, string userPrompt, decimal temperature, CancellationToken ct)
    {
        if (string.IsNullOrWhiteSpace(_settings.OpenAI.ApiKey))
        {
            throw new InvalidOperationException("OpenAI API key is missing.");
        }

        using var request = new HttpRequestMessage(HttpMethod.Post, OpenAiChatUrl);
        request.Headers.Authorization = new AuthenticationHeaderValue("Bearer", _settings.OpenAI.ApiKey);
        request.Content = ToJson(new
        {
            model = _settings.OpenAI.ChatModel,
            temperature,
            response_format = new { type = "json_object" },
            messages = BuildMessages(systemPrompt, userPrompt)
        });

        var json = await SendAsync(request, "OpenAI", Math.Max(15, _settings.OpenAI.TimeoutSeconds), ct);
        return ReadMessageContent(json, "OpenAI", fromFirstChoice: true);
    }

    /// <summary>Calls the Ollama <c>/api/chat</c> endpoint (non-streaming, JSON format) and returns the message content.</summary>
    private async Task<string> CreateOllamaChatCompletionAsync(string systemPrompt, string userPrompt, decimal temperature, CancellationToken ct)
    {
        var baseUrl = _settings.Ollama.BaseUrl.TrimEnd('/');

        // Owning the request message means its JSON content is disposed together with it.
        using var request = new HttpRequestMessage(HttpMethod.Post, $"{baseUrl}/api/chat")
        {
            Content = ToJson(new
            {
                model = _settings.Ollama.ChatModel,
                stream = false,
                format = "json",
                messages = BuildMessages(systemPrompt, userPrompt),
                options = new { temperature = (float)temperature }
            })
        };

        var json = await SendAsync(request, "Ollama", Math.Max(30, _settings.Ollama.TimeoutSeconds), ct);
        return ReadMessageContent(json, "Ollama", fromFirstChoice: false);
    }

    /// <summary>Builds the two-message conversation (system, then user) both providers expect.</summary>
    private static ChatMessage[] BuildMessages(string systemPrompt, string userPrompt) =>
    [
        new ChatMessage("system", systemPrompt),
        new ChatMessage("user", userPrompt)
    ];

    /// <summary>
    /// Sends <paramref name="request"/> with a per-call timeout linked to <paramref name="ct"/> and
    /// returns the response body, throwing when the status code is not a success code.
    /// </summary>
    private async Task<string> SendAsync(HttpRequestMessage request, string providerName, int timeoutSeconds, CancellationToken ct)
    {
        // NOTE(review): HttpClient.Timeout (100 seconds unless the client registration changes it) also
        // bounds this call; confirm it is raised above the configured provider timeout.
        using var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
        cts.CancelAfter(TimeSpan.FromSeconds(timeoutSeconds));

        using var response = await _http.SendAsync(request, cts.Token);
        var json = await response.Content.ReadAsStringAsync(cts.Token);
        if (!response.IsSuccessStatusCode)
        {
            throw new InvalidOperationException($"{providerName} chat failed: {(int)response.StatusCode} {json}");
        }

        return json;
    }

    /// <summary>
    /// Extracts <c>message.content</c> from a provider response. OpenAI nests it under
    /// <c>choices[0]</c>; Ollama exposes it on the root object.
    /// </summary>
    /// <exception cref="InvalidOperationException">The body is not JSON or lacks the expected properties.</exception>
    private static string ReadMessageContent(string json, string providerName, bool fromFirstChoice)
    {
        try
        {
            using var doc = JsonDocument.Parse(json);
            var container = doc.RootElement;

            if (fromFirstChoice)
            {
                if (!container.TryGetProperty("choices", out var choices) || choices.GetArrayLength() == 0)
                {
                    throw new InvalidOperationException($"{providerName} chat returned no choices.");
                }

                container = choices[0];
            }

            if (!container.TryGetProperty("message", out var message) || !message.TryGetProperty("content", out var content))
            {
                throw new InvalidOperationException($"{providerName} chat returned no message content.");
            }

            return content.GetString() ?? EmptyJsonObject;
        }
        catch (JsonException ex)
        {
            // Keep one failure type for every upstream problem, matching the status-code path above.
            throw new InvalidOperationException($"{providerName} chat returned invalid JSON.", ex);
        }
    }

    private static StringContent ToJson<T>(T payload) =>
        new(JsonSerializer.Serialize(payload, JsonOptions), Encoding.UTF8, "application/json");

    /// <summary>One chat message; serialized as <c>{"role": ..., "content": ...}</c> by the web JSON defaults.</summary>
    private sealed record ChatMessage(string Role, string Content);
}
title, CancellationToken ct) + { + using var content = new MultipartFormDataContent + { + { new StringContent(text), "text" }, + { new StringContent("job"), "documentType" }, + { new StringContent(title ?? "Job description"), "title" } + }; + if (!string.IsNullOrWhiteSpace(url)) content.Add(new StringContent(url), "sourceUrl"); + using var response = await _http.PostAsync("api/rag/documents", content, ct); + return await ReadJsonAsync(response, ct); + } + + public async Task GetDocumentAsync(string documentId, CancellationToken ct) + { + using var response = await _http.GetAsync($"api/rag/documents/{Uri.EscapeDataString(documentId)}", ct); + if (response.StatusCode == System.Net.HttpStatusCode.NotFound) return null; + return await ReadJsonAsync(response, ct); + } + + public async Task SearchAsync(RagSearchRequest request, CancellationToken ct) + { + using var response = await _http.PostAsync( + "api/rag/search", + new StringContent(JsonSerializer.Serialize(request, JsonOptions), Encoding.UTF8, "application/json"), + ct); + return await ReadJsonAsync(response, ct); + } + + private static async Task ReadJsonAsync(HttpResponseMessage response, CancellationToken ct) + { + var json = await response.Content.ReadAsStringAsync(ct); + if (!response.IsSuccessStatusCode) + { + throw new InvalidOperationException($"RAG API failed: {(int)response.StatusCode} {json}"); + } + return JsonSerializer.Deserialize(json, JsonOptions) ?? 
/// <summary>
/// SQL Server persistence for the CV matcher: creates the database and schema on startup, stores
/// CV/job match results, and caches chat-completion responses.
/// </summary>
public sealed class SqlMatcherRepository : IMatcherRepository
{
    // Shared instance: building JsonSerializerOptions per call discards its internal metadata cache.
    private static readonly JsonSerializerOptions JsonOptions = new(JsonSerializerDefaults.Web);

    // A batch separator is a line that contains only "GO". Splitting on the bare substring "GO"
    // would also cut statements in half wherever those two letters appear inside the script.
    private static readonly System.Text.RegularExpressions.Regex BatchSeparator = new(
        @"^[ \t]*GO[ \t]*\r?$",
        System.Text.RegularExpressions.RegexOptions.Multiline
        | System.Text.RegularExpressions.RegexOptions.IgnoreCase
        | System.Text.RegularExpressions.RegexOptions.CultureInvariant);

    private readonly string _connectionString;

    /// <summary>Reads the <c>CvMatcherDb</c> connection string from configuration.</summary>
    /// <exception cref="InvalidOperationException">The connection string is not configured.</exception>
    public SqlMatcherRepository(IConfiguration configuration)
    {
        _connectionString = configuration.GetConnectionString("CvMatcherDb")
            ?? throw new InvalidOperationException("Connection string 'CvMatcherDb' is missing.");
    }

    /// <summary>
    /// Ensures the target database exists, then runs <c>Database/schema.sql</c> (located next to the
    /// application binaries) one GO-delimited batch at a time.
    /// </summary>
    public async Task InitializeAsync(CancellationToken ct)
    {
        await EnsureDatabaseExistsAsync(ct);

        var script = await File.ReadAllTextAsync(Path.Combine(AppContext.BaseDirectory, "Database", "schema.sql"), ct);
        await using var connection = await OpenConnectionAsync(_connectionString, ct);

        foreach (var batch in BatchSeparator.Split(script))
        {
            var commandText = batch.Trim();
            if (commandText.Length == 0)
            {
                continue;
            }

            await using var command = new SqlCommand(commandText, connection);
            await command.ExecuteNonQueryAsync(ct);
        }
    }

    /// <summary>
    /// Loads the stored match result for a CV/job document pair.
    /// </summary>
    /// <returns>The stored result with <c>Cached</c> set to true, or null when no row (or no JSON) exists.</returns>
    public async Task<JobMatchResponse?> GetMatchAsync(string cvDocumentId, string jobDocumentId, CancellationToken ct)
    {
        const string sql = "SELECT ResultJson FROM CvMatchResults WHERE CvDocumentId = @CvDocumentId AND JobDocumentId = @JobDocumentId";

        await using var connection = await OpenConnectionAsync(_connectionString, ct);
        await using var command = new SqlCommand(sql, connection);
        command.Parameters.AddWithValue("@CvDocumentId", cvDocumentId);
        command.Parameters.AddWithValue("@JobDocumentId", jobDocumentId);

        var json = await command.ExecuteScalarAsync(ct) as string;
        if (string.IsNullOrWhiteSpace(json))
        {
            return null;
        }

        var result = JsonSerializer.Deserialize<JobMatchResponse>(json, JsonOptions);
        if (result is not null)
        {
            result.Cached = true;
        }

        return result;
    }

    /// <summary>Stores a match result unless one already exists for the same CV/job document pair.</summary>
    public async Task SaveMatchAsync(string cvDocumentId, string jobDocumentId, JobMatchResponse response, CancellationToken ct)
    {
        const string sql = """
            IF NOT EXISTS (SELECT 1 FROM CvMatchResults WHERE CvDocumentId = @CvDocumentId AND JobDocumentId = @JobDocumentId)
            INSERT INTO CvMatchResults (Id, CvDocumentId, JobDocumentId, ResultJson, Score, CreatedAt)
            VALUES (@Id, @CvDocumentId, @JobDocumentId, @ResultJson, @Score, SYSUTCDATETIME())
            """;

        await using var connection = await OpenConnectionAsync(_connectionString, ct);
        await using var command = new SqlCommand(sql, connection);
        command.Parameters.AddWithValue("@Id", Guid.NewGuid().ToString("N"));
        command.Parameters.AddWithValue("@CvDocumentId", cvDocumentId);
        command.Parameters.AddWithValue("@JobDocumentId", jobDocumentId);
        command.Parameters.AddWithValue("@ResultJson", JsonSerializer.Serialize(response, JsonOptions));
        command.Parameters.AddWithValue("@Score", response.Score);
        await command.ExecuteNonQueryAsync(ct);
    }

    /// <summary>Returns the cached chat response for <paramref name="cacheKey"/>, or null when none is stored.</summary>
    public async Task<string?> GetChatCompletionAsync(string cacheKey, CancellationToken ct)
    {
        const string sql = "SELECT ResponseText FROM CvMatcherChatCache WHERE CacheKey = @CacheKey";

        await using var connection = await OpenConnectionAsync(_connectionString, ct);
        await using var command = new SqlCommand(sql, connection);
        command.Parameters.AddWithValue("@CacheKey", cacheKey);
        return await command.ExecuteScalarAsync(ct) as string;
    }

    /// <summary>Caches a chat response unless an entry with the same key already exists.</summary>
    public async Task SaveChatCompletionAsync(string cacheKey, string model, decimal temperature, string responseText, CancellationToken ct)
    {
        const string sql = """
            IF NOT EXISTS (SELECT 1 FROM CvMatcherChatCache WHERE CacheKey = @CacheKey)
            INSERT INTO CvMatcherChatCache (CacheKey, Model, Temperature, ResponseText, CreatedAt)
            VALUES (@CacheKey, @Model, @Temperature, @ResponseText, SYSUTCDATETIME())
            """;

        await using var connection = await OpenConnectionAsync(_connectionString, ct);
        await using var command = new SqlCommand(sql, connection);
        command.Parameters.AddWithValue("@CacheKey", cacheKey);
        command.Parameters.AddWithValue("@Model", model);
        command.Parameters.AddWithValue("@Temperature", temperature);
        command.Parameters.AddWithValue("@ResponseText", responseText);
        await command.ExecuteNonQueryAsync(ct);
    }

    /// <summary>
    /// Connects to <c>master</c> and creates the database named in the connection string when it does
    /// not exist yet. Does nothing when the connection string names no database.
    /// </summary>
    private async Task EnsureDatabaseExistsAsync(CancellationToken ct)
    {
        var builder = new SqlConnectionStringBuilder(_connectionString);
        var databaseName = builder.InitialCatalog;
        if (string.IsNullOrWhiteSpace(databaseName))
        {
            return;
        }

        // CREATE DATABASE cannot take the name as a parameter, so the statement is assembled on the
        // server with QUOTENAME. The name never has to be spliced into the command text from C#,
        // which removes the need to escape brackets or single quotes by hand.
        const string sql = """
            IF DB_ID(@DatabaseName) IS NULL
            BEGIN
                DECLARE @createSql NVARCHAR(MAX) = N'CREATE DATABASE ' + QUOTENAME(@DatabaseName);
                EXEC (@createSql);
            END
            """;

        builder.InitialCatalog = "master";
        await using var connection = await OpenConnectionAsync(builder.ConnectionString, ct);
        await using var command = new SqlCommand(sql, connection);
        command.Parameters.AddWithValue("@DatabaseName", databaseName);
        await command.ExecuteNonQueryAsync(ct);
    }

    /// <summary>Creates and opens a connection, disposing it again if opening fails.</summary>
    private static async Task<SqlConnection> OpenConnectionAsync(string connectionString, CancellationToken ct)
    {
        var connection = new SqlConnection(connectionString);
        try
        {
            await connection.OpenAsync(ct);
            return connection;
        }
        catch
        {
            await connection.DisposeAsync();
            throw;
        }
    }
}
set; } = new(); + public OllamaSettings Ollama { get; set; } = new(); +} + +public sealed class OpenAiSettings +{ + public string ApiKey { get; set; } = string.Empty; + public string ChatModel { get; set; } = "gpt-4o-mini"; + public int TimeoutSeconds { get; set; } = 90; +} + +public sealed class OllamaSettings +{ + public string BaseUrl { get; set; } = "http://localhost:11434"; + public string ChatModel { get; set; } = "llama3.1:8b"; + public int TimeoutSeconds { get; set; } = 180; +} + +public sealed class MatcherSettings +{ + public int TopK { get; set; } = 10; + public int DeepScoreTopN { get; set; } = 5; + public int MaxJobTextChars { get; set; } = 60000; +} + +public sealed class SmtpSettings +{ + public string Host { get; set; } = string.Empty; + public int Port { get; set; } = 587; + public string Username { get; set; } = string.Empty; + public string Password { get; set; } = string.Empty; + public bool UseStartTls { get; set; } = true; + public string FromEmail { get; set; } = "noreply@myai.ro"; + public string ToEmail { get; set; } = string.Empty; +} diff --git a/cv-matcher-api/appsettings.json b/cv-matcher-api/appsettings.json new file mode 100644 index 0000000..a7765a2 --- /dev/null +++ b/cv-matcher-api/appsettings.json @@ -0,0 +1,114 @@ +{ + "Serilog": { + "Using": [ + "Serilog.Sinks.Console", + "Serilog.Sinks.File", + "Serilog.Sinks.Email" + ], + "MinimumLevel": { + "Default": "Information", + "Override": { + "Microsoft.AspNetCore": "Warning", + "Microsoft.AspNetCore.Hosting": "Information", + "Microsoft.AspNetCore.Routing": "Warning", + "System.Net.Http.HttpClient": "Warning", + "Api": "Information" + } + }, + "WriteTo": [ + { + "Name": "Console", + "Args": { + "outputTemplate": "[{Timestamp:HH:mm:ss} {Level:u3}] {SourceContext}: {Message:lj}{NewLine}{Exception}" + } + }, + { + "Name": "File", + "Args": { + "path": "logs/api-.log", + "rollingInterval": "Day", + "retainedFileCountLimit": 30, + "outputTemplate": "{Timestamp:yyyy-MM-dd HH:mm:ss.fff zzz} 
[{Level:u3}] {SourceContext}: {Message:lj}{NewLine}{Exception}" + } + }, + { + "Name": "Email", + "Args": { + "restrictedToMinimumLevel": "Error", + "fromEmail": "", + "toEmail": "", + "mailServer": "", + "networkCredential": { + "userName": "", + "password": "" + }, + "port": 587, + "enableSsl": true, + "emailSubject": "[mihes.ro API] Error Alert", + "outputTemplate": "{Timestamp:yyyy-MM-dd HH:mm:ss.fff zzz} [{Level:u3}] {SourceContext}{NewLine}{Message:lj}{NewLine}{Exception}", + "batchPostingLimit": 10, + "period": "0.00:05:00" + } + } + ], + "Enrich": [ + "FromLogContext", + "WithMachineName", + "WithEnvironmentName" + ] + }, + "Logging": { + "LogLevel": { + "Default": "Information", + "Microsoft.AspNetCore": "Warning", + "Microsoft.AspNetCore.Hosting": "Information", + "Microsoft.AspNetCore.Routing": "Warning", + "System.Net.Http.HttpClient": "Warning", + "Api": "Information" + }, + "LogEnvironmentOnStartup": true + }, + "AllowedHosts": "*", + "KeyVault": { + "VaultUri": "", + "Enabled": false + }, + "ConnectionStrings": { + "CvMatcherDb": "Server=localhost,1433;Database=MyAiCvMatcher;User Id=sa;Password=Your_strong_password123;TrustServerCertificate=True" + }, + "InternalApi": { + "ApiKey": "", + "RequireApiKey": false + }, + "RagApi": { + "BaseUrl": "http://localhost:8081", + "InternalApiKey": "" + }, + "Ai": { + "Provider": "OpenAI", + "OpenAI": { + "ApiKey": "", + "ChatModel": "gpt-4o-mini", + "TimeoutSeconds": 90 + }, + "Ollama": { + "BaseUrl": "http://localhost:11434", + "ChatModel": "llama3.1:8b", + "TimeoutSeconds": 180 + } + }, + "Matcher": { + "TopK": 10, + "DeepScoreTopN": 5, + "MaxJobTextChars": 60000 + }, + "Smtp": { + "Host": "", + "Port": 587, + "Username": "", + "Password": "", + "UseStartTls": true, + "FromEmail": "noreply@myai.ro", + "ToEmail": "" + } +} diff --git a/cv-matcher-api/cv-matcher-api.csproj b/cv-matcher-api/cv-matcher-api.csproj new file mode 100644 index 0000000..2c70231 --- /dev/null +++ b/cv-matcher-api/cv-matcher-api.csproj 
@@ -0,0 +1,26 @@ + + + net10.0 + enable + enable + Linux + Api + + + + + + + + + + + + + + + + PreserveNewest + + + diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 20c8a8a..4daae6f 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -1,7 +1,74 @@ version: "3.8" services: + + mssql: + image: mcr.microsoft.com/mssql/server:2022-latest + container_name: myai-mssql + environment: + - ACCEPT_EULA=Y + - MSSQL_SA_PASSWORD=${MSSQL_SA_PASSWORD:-Your_strong_password123} + ports: + - "1433:1433" + volumes: + - myai-mssql-data:/var/opt/mssql + networks: + - myai-network + restart: unless-stopped + + rag-api: + build: + context: ../rag-api + dockerfile: Dockerfile + container_name: myai-rag-api + depends_on: + - mssql + ports: + - "8081:8080" + env_file: + - .env + environment: + - ASPNETCORE_ENVIRONMENT=${ASPNETCORE_ENVIRONMENT:-Development} + - ASPNETCORE_URLS=http://+:8080 + - ConnectionStrings__RagDb=Server=mssql,1433;Database=MyAiRag;User Id=sa;Password=${MSSQL_SA_PASSWORD:-Your_strong_password123};TrustServerCertificate=True + - InternalApi__RequireApiKey=true + - InternalApi__ApiKey=${INTERNAL_API_KEY:-change-this-internal-key} + - Ai__Provider=${AI_PROVIDER:-OpenAI} + - Ai__OpenAI__ApiKey=${OPENAI_API_KEY:-} + - Ai__Ollama__BaseUrl=${OLLAMA_BASE_URL:-http://host.docker.internal:11434} + networks: + - myai-network + restart: unless-stopped + + cv-matcher-api: + build: + context: ../cv-matcher-api + dockerfile: Dockerfile + container_name: myai-cv-matcher-api + depends_on: + - mssql + - rag-api + ports: + - "8082:8080" + env_file: + - .env + environment: + - ASPNETCORE_ENVIRONMENT=${ASPNETCORE_ENVIRONMENT:-Development} + - ASPNETCORE_URLS=http://+:8080 + - ConnectionStrings__CvMatcherDb=Server=mssql,1433;Database=MyAiCvMatcher;User Id=sa;Password=${MSSQL_SA_PASSWORD:-Your_strong_password123};TrustServerCertificate=True + - InternalApi__RequireApiKey=true + - 
InternalApi__ApiKey=${INTERNAL_API_KEY:-change-this-internal-key} + - RagApi__BaseUrl=http://rag-api:8080 + - RagApi__InternalApiKey=${INTERNAL_API_KEY:-change-this-internal-key} + - Ai__Provider=${AI_PROVIDER:-OpenAI} + - Ai__OpenAI__ApiKey=${OPENAI_API_KEY:-} + - Ai__Ollama__BaseUrl=${OLLAMA_BASE_URL:-http://host.docker.internal:11434} + networks: + - myai-network + restart: unless-stopped api: + depends_on: + - cv-matcher-api build: context: ../api dockerfile: Dockerfile @@ -16,6 +83,8 @@ services: - ASPNETCORE_URLS=${ASPNETCORE_URLS:-http://+:8080} - Cors__AllowedOrigins__0=http://localhost:5000 - Cors__AllowedOrigins__1=http://web:8080 + - CvMatcherApi__BaseUrl=http://cv-matcher-api:8080 + - CvMatcherApi__InternalApiKey=${INTERNAL_API_KEY:-change-this-internal-key} volumes: - ../api/logs:/app/logs networks: @@ -40,6 +109,9 @@ services: - myai-network restart: unless-stopped +volumes: + myai-mssql-data: + networks: myai-network: driver: bridge \ No newline at end of file diff --git a/myAi.sln b/myAi.sln index 274f960..3fb28a4 100644 --- a/myAi.sln +++ b/myAi.sln @@ -6,6 +6,10 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "api", "api\api.csproj", "{1 EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "web", "web\web.csproj", "{B0A3EAB7-759A-448A-A906-52DF75A70016}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "rag-api", "rag-api\rag-api.csproj", "{A63E1C1A-4A78-49F4-9F5C-D43783294861}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "cv-matcher-api", "cv-matcher-api\cv-matcher-api.csproj", "{C40F5025-B0A6-4B25-B4A2-7EA568E06C40}" +EndProject Project("{E53339B2-1760-4266-BCC7-CA923CBCF16C}") = "docker-compose", "docker-compose\docker-compose.dcproj", "{81DDED9D-158B-E303-5F62-77A2896D2A5A}" EndProject Global @@ -22,6 +26,14 @@ Global {B0A3EAB7-759A-448A-A906-52DF75A70016}.Debug|Any CPU.Build.0 = Debug|Any CPU {B0A3EAB7-759A-448A-A906-52DF75A70016}.Release|Any CPU.ActiveCfg = Release|Any CPU 
/// <summary>
/// HTTP surface of the generic RAG service: document indexing (multipart or JSON), semantic search
/// and document lookup. Every action delegates to <see cref="IRagService"/>; an
/// <see cref="InvalidOperationException"/> raised while indexing or searching is answered with
/// 400 and an <c>error</c> message.
/// </summary>
[ApiController]
[Route("api/rag")]
public sealed class RagController : ControllerBase
{
    private readonly IRagService _ragService;
    private readonly ILogger<RagController> _logger;

    public RagController(IRagService ragService, ILogger<RagController> logger)
    {
        _ragService = ragService;
        _logger = logger;
    }

    /// <summary>
    /// Indexes a document sent as multipart form data. When <paramref name="file"/> is present it is
    /// indexed as a PDF; otherwise the <paramref name="text"/> field is indexed.
    /// </summary>
    [HttpPost("documents")]
    [RequestSizeLimit(10 * 1024 * 1024)]
    public async Task<IActionResult> IndexDocument(
        [FromForm] IFormFile? file,
        [FromForm] string? text,
        [FromForm] string? documentType,
        [FromForm] string? title,
        [FromForm] string? sourceUrl,
        CancellationToken ct)
    {
        try
        {
            _logger.LogInformation("Index document request received. HasFile={HasFile}, DocumentType={DocumentType}, Title={Title}, SourceUrl={SourceUrl}",
                file is not null, documentType, title, sourceUrl);

            if (file is null)
            {
                var payload = new IndexDocumentRequest
                {
                    Text = text,
                    DocumentType = documentType,
                    Title = title,
                    SourceUrl = sourceUrl
                };
                var indexedText = await _ragService.IndexTextAsync(payload, ct);
                _logger.LogInformation("Indexed text document. DocumentId={DocumentId}, DocumentType={DocumentType}, Chunks={Chunks}, Cached={Cached}",
                    indexedText.DocumentId, indexedText.DocumentType, indexedText.Chunks, indexedText.Cached);
                return Ok(indexedText);
            }

            var indexedPdf = await _ragService.IndexPdfAsync(file, documentType, title, sourceUrl, ct);
            _logger.LogInformation("Indexed PDF document. DocumentId={DocumentId}, DocumentType={DocumentType}, Chunks={Chunks}, Cached={Cached}",
                indexedPdf.DocumentId, indexedPdf.DocumentType, indexedPdf.Chunks, indexedPdf.Cached);
            return Ok(indexedPdf);
        }
        catch (InvalidOperationException ex)
        {
            _logger.LogWarning(ex, "Invalid document indexing request.");
            return Failure(ex);
        }
    }

    /// <summary>Indexes a text document supplied as a JSON body.</summary>
    [HttpPost("documents/json")]
    public async Task<IActionResult> IndexJsonDocument([FromBody] IndexDocumentRequest request, CancellationToken ct)
    {
        try
        {
            _logger.LogInformation("JSON document indexing request received. DocumentType={DocumentType}, Title={Title}, SourceUrl={SourceUrl}",
                request.DocumentType, request.Title, request.SourceUrl);

            var indexed = await _ragService.IndexTextAsync(request, ct);

            _logger.LogInformation("Indexed JSON document. DocumentId={DocumentId}, DocumentType={DocumentType}, Chunks={Chunks}, Cached={Cached}",
                indexed.DocumentId, indexed.DocumentType, indexed.Chunks, indexed.Cached);
            return Ok(indexed);
        }
        catch (InvalidOperationException ex)
        {
            _logger.LogWarning(ex, "Invalid JSON document indexing request.");
            return Failure(ex);
        }
    }

    /// <summary>Runs a semantic search and returns the service's result object.</summary>
    [HttpPost("search")]
    public async Task<IActionResult> Search([FromBody] SearchRequest request, CancellationToken ct)
    {
        try
        {
            var targetTypes = string.Join(',', request.TargetDocumentTypes ?? []);
            _logger.LogInformation("Semantic search request received. TargetTypes={TargetTypes}, TopK={TopK}",
                targetTypes, request.TopK);

            var found = await _ragService.SearchAsync(request, ct);

            _logger.LogInformation("Semantic search completed. ResultCount={ResultCount}", found.Results.Count);
            return Ok(found);
        }
        catch (InvalidOperationException ex)
        {
            _logger.LogWarning(ex, "Invalid semantic search request.");
            return Failure(ex);
        }
    }

    /// <summary>Returns the document with the given id, or 404 when the service finds none.</summary>
    [HttpGet("documents/{id}")]
    public async Task<IActionResult> GetDocument(string id, CancellationToken ct)
    {
        _logger.LogInformation("Get document request received. DocumentId={DocumentId}", id);

        var document = await _ragService.GetDocumentAsync(id, ct);
        if (document is not null)
        {
            return Ok(document);
        }

        _logger.LogWarning("Document not found. DocumentId={DocumentId}", id);
        return NotFound(new { error = "Document not found." });
    }

    // Shared 400 response body: { "error": "<exception message>" }.
    private BadRequestObjectResult Failure(InvalidOperationException ex) => BadRequest(new { error = ex.Message });
}
IS NULL +BEGIN + CREATE TABLE dbo.RagChatCompletionCache ( + CacheKey NVARCHAR(64) NOT NULL CONSTRAINT PK_RagChatCompletionCache PRIMARY KEY, + Model NVARCHAR(120) NOT NULL, + Temperature DECIMAL(4,2) NOT NULL, + ResponseText NVARCHAR(MAX) NOT NULL, + CreatedAt DATETIME2 NOT NULL CONSTRAINT DF_RagChatCompletionCache_CreatedAt DEFAULT SYSUTCDATETIME() + ); +END +GO diff --git a/rag-api/Dockerfile b/rag-api/Dockerfile new file mode 100644 index 0000000..f1b69c0 --- /dev/null +++ b/rag-api/Dockerfile @@ -0,0 +1,15 @@ +FROM mcr.microsoft.com/dotnet/aspnet:10.0 AS base +WORKDIR /app +EXPOSE 8080 + +FROM mcr.microsoft.com/dotnet/sdk:10.0 AS build +WORKDIR /src +COPY ["rag-api.csproj", "./"] +RUN dotnet restore "rag-api.csproj" +COPY . . +RUN dotnet publish "rag-api.csproj" -c Release -o /app/publish /p:UseAppHost=false + +FROM base AS final +WORKDIR /app +COPY --from=build /app/publish . +ENTRYPOINT ["dotnet", "rag-api.dll"] diff --git a/rag-api/Program.cs b/rag-api/Program.cs new file mode 100644 index 0000000..dd5a067 --- /dev/null +++ b/rag-api/Program.cs @@ -0,0 +1,282 @@ +using Azure.Identity; +using Microsoft.AspNetCore.Diagnostics; +using Api.Services; +using Api.Services.Contracts; +using Api.Settings; +using Serilog; +using System.Reflection; + +DotNetEnv.Env.Load(); + +try +{ + var builder = WebApplication.CreateBuilder(args); + var appVersion = Assembly.GetExecutingAssembly() + .GetCustomAttribute()? + .InformationalVersion + ?? Assembly.GetExecutingAssembly().GetName().Version?.ToString() + ?? 
"unknown"; + + builder.Host.UseSerilog((context, services, configuration) => + { + configuration + .ReadFrom.Configuration(context.Configuration) + .ReadFrom.Services(services) + .Enrich.FromLogContext() + .Enrich.WithMachineName() + .Enrich.WithEnvironmentName() + .Enrich.WithProperty("Service", "rag-api") + .Enrich.WithProperty("AppVersion", appVersion) + .WriteTo.Console(new Serilog.Formatting.Json.JsonFormatter()); + }); + + Log.Information("Starting {Service} version {AppVersion}", "rag-api", appVersion); + + // -------------------- + // Azure Key Vault Configuration + // -------------------- + var keyVaultUri = builder.Configuration["KeyVault:VaultUri"]; + var keyVaultEnabled = builder.Configuration.GetValue("KeyVault:Enabled"); + + if (keyVaultEnabled && !string.IsNullOrWhiteSpace(keyVaultUri)) + { + Log.Information("Loading configuration from Azure Key Vault: {VaultUri}", keyVaultUri); + + try + { + builder.Configuration.AddAzureKeyVault( + new Uri(keyVaultUri), + new DefaultAzureCredential()); + + Log.Information("Azure Key Vault configuration loaded successfully"); + } + catch (Exception ex) + { + Log.Warning(ex, "Failed to load Azure Key Vault configuration. 
Continuing with other configuration sources."); + } + } + else + { + Log.Information("Azure Key Vault is disabled or not configured"); + } + + builder.Services.Configure(builder.Configuration.GetSection("Rag")); + builder.Services.Configure(builder.Configuration.GetSection("Ai")); + builder.Services.Configure(builder.Configuration.GetSection("InternalApi")); + + builder.Services.AddHttpClient(); + builder.Services.AddSingleton(); + builder.Services.AddScoped(); + builder.Services.AddSingleton(); + builder.Services.AddSingleton(); + builder.Services.AddSingleton(); + builder.Services.AddScoped(); + + builder.Services.AddControllers(); + builder.Services.AddEndpointsApiExplorer(); + builder.Services.AddSwaggerGen(); + + var app = builder.Build(); + + var logger = app.Services.GetRequiredService>(); + logger.LogInformation("API starting up..."); + logger.LogInformation("Environment: {Environment}", app.Environment.EnvironmentName); + + // Log all environment variables and configuration settings at startup + // Can be controlled via appsettings: "Logging:LogEnvironmentOnStartup": true + var logEnvironmentOnStartup = app.Configuration.GetValue("Logging:LogEnvironmentOnStartup", defaultValue: true); + if (logEnvironmentOnStartup) + { + LogEnvironmentSettings(logger, app.Configuration, app.Environment); + } + + using (var scope = app.Services.CreateScope()) + { + var repository = scope.ServiceProvider.GetRequiredService(); + await repository.InitializeAsync(CancellationToken.None); + } + + app.UseSerilogRequestLogging(options => + { + options.MessageTemplate = "HTTP {RequestMethod} {RequestPath} responded {StatusCode} in {Elapsed:0.0000} ms"; + options.EnrichDiagnosticContext = (diagnosticContext, httpContext) => + { + diagnosticContext.Set("RequestHost", httpContext.Request.Host.Value); + diagnosticContext.Set("RequestScheme", httpContext.Request.Scheme); + diagnosticContext.Set("RemoteIP", httpContext.Connection.RemoteIpAddress?.ToString()); + 
// Internal API-key gate: when InternalApi:RequireApiKey is enabled, every request must carry the
// configured key in the X-Internal-Api-Key header, otherwise it is rejected with 401.
// A blank configured key rejects every request (fail closed).
// NOTE(review): the generic type arguments of the two GetRequiredService calls were lost in the
// extracted source; IOptions<InternalApiSettings> and ILogger<Program> are reconstructed from the
// "InternalApi" configuration binding - confirm against the real file.
app.Use(async (context, next) =>
{
    var settings = context.RequestServices
        .GetRequiredService<Microsoft.Extensions.Options.IOptions<InternalApiSettings>>()
        .Value;

    if (settings.RequireApiKey)
    {
        var presentedKey = context.Request.Headers["X-Internal-Api-Key"].ToString();

        if (string.IsNullOrWhiteSpace(settings.ApiKey) || !ApiKeysMatch(presentedKey, settings.ApiKey))
        {
            var logger = context.RequestServices.GetRequiredService<ILogger<Program>>();
            logger.LogWarning("Rejected unauthorized internal API call. Path={Path}, RemoteIP={RemoteIP}", context.Request.Path, context.Connection.RemoteIpAddress?.ToString());
            context.Response.StatusCode = StatusCodes.Status401Unauthorized;
            await context.Response.WriteAsJsonAsync(new { error = "Unauthorized internal API call." });
            return;
        }
    }

    await next();

    // Compares the keys in constant time so response timing does not reveal how many leading
    // characters of a guess were correct. Hashing first gives both inputs the same length,
    // so the comparison does not leak the configured key's length either.
    static bool ApiKeysMatch(string presented, string expected)
    {
        var presentedDigest = System.Security.Cryptography.SHA256.HashData(System.Text.Encoding.UTF8.GetBytes(presented));
        var expectedDigest = System.Security.Cryptography.SHA256.HashData(System.Text.Encoding.UTF8.GetBytes(expected));
        return System.Security.Cryptography.CryptographicOperations.FixedTimeEquals(presentedDigest, expectedDigest);
    }
});
/// <summary>
/// Tells whether an environment-variable or configuration key looks like it holds a secret.
/// </summary>
/// <param name="key">Key name; hierarchical configuration paths such as "Section:Name" are accepted.</param>
/// <returns>
/// True when the key contains "Password", "Secret", "Token", "Key" or "ConnectionString",
/// ignoring case; otherwise false.
/// </returns>
static bool IsSensitiveKey(string key)
{
    string[] markers = ["Password", "Secret", "Token", "Key", "ConnectionString"];

    foreach (var marker in markers)
    {
        if (key.Contains(marker, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Masks a sensitive value for logging. The last 4 characters are shown for verification only
/// when the value is long enough that they are a small part of it.
/// </summary>
/// <param name="value">The value to mask.</param>
/// <returns>
/// "***NOT SET***" for a null or empty value; "***MASKED***" for values shorter than 16 characters;
/// otherwise "***MASKED***..." followed by the last 4 characters (e.g. "***MASKED***...abcd").
/// </returns>
static string MaskValueWithLastChars(string value)
{
    const int VisibleSuffixLength = 4;

    // Showing 4 characters of a 5-15 character secret would log up to 80% of it,
    // so the suffix is shown only when it is at most a quarter of the value.
    const int MinLengthToRevealSuffix = 4 * VisibleSuffixLength;

    if (string.IsNullOrEmpty(value))
    {
        return "***NOT SET***";
    }

    if (value.Length < MinLengthToRevealSuffix)
    {
        return "***MASKED***";
    }

    return $"***MASKED***...{value[^VisibleSuffixLength..]}";
}
namespace Api.Responses
{
    /// <summary>
    /// Result of an index operation, as produced by RagService for both text and PDF input.
    /// </summary>
    public sealed class IndexDocumentResponse
    {
        /// <summary>Identifier of the stored document; the already-stored document's id when <see cref="Cached"/> is true.</summary>
        public required string DocumentId { get; init; }
        /// <summary>Hash of the indexed text, computed with HashHelper.Compute.</summary>
        public required string TextHash { get; init; }
        /// <summary>Document type chosen by the classifier (or the caller-supplied type after normalization).</summary>
        public required string DocumentType { get; init; }
        /// <summary>Classifier confidence for <see cref="DocumentType"/>.</summary>
        public double DocumentTypeConfidence { get; init; }
        /// <summary>Title stored for the document.</summary>
        public required string Title { get; init; }
        /// <summary>Number of chunks written by this call; 0 when an existing document was reused.</summary>
        public int Chunks { get; init; }
        /// <summary>Length, in characters, of the indexed text.</summary>
        public int Characters { get; init; }
        /// <summary>True when a document with the same text hash was found and returned instead of indexing again.</summary>
        public bool Cached { get; init; }
    }
}
/// <summary>
/// <see cref="IAiClient"/> decorator that answers embedding and chat-completion requests from the
/// repository cache when possible, and otherwise calls <see cref="RawAiClient"/> and stores the result.
/// </summary>
/// <remarks>
/// NOTE(review): generic type arguments were lost in the extracted source. They are restored here from
/// visible usage (<c>options.Value</c> is assigned to an <c>AiSettings</c> field; results are passed to
/// <c>SaveEmbeddingAsync(float[])</c> and <c>SaveChatCompletionAsync(string)</c>).
/// </remarks>
public sealed class CachedAiClient : IAiClient
{
    private readonly RawAiClient _raw;
    private readonly IRagRepository _repository;
    private readonly AiSettings _settings;

    public CachedAiClient(RawAiClient raw, IRagRepository repository, IOptions<AiSettings> options)
    {
        _raw = raw;
        _repository = repository;
        _settings = options.Value;
    }

    /// <summary>
    /// Returns the embedding for <paramref name="input"/>, cached per provider, model and text hash.
    /// </summary>
    /// <param name="input">Text to embed.</param>
    /// <param name="ct">Cancellation token passed to the repository and the AI provider.</param>
    /// <returns>The cached vector when present; otherwise a freshly computed vector, saved before returning.</returns>
    public async Task<float[]> CreateEmbeddingAsync(string input, CancellationToken ct)
    {
        var model = GetEmbeddingModel();
        var textHash = HashHelper.Compute(input);
        var cacheKey = HashHelper.Compute($"embedding:{_settings.Provider}:{model}:{textHash}");

        var cached = await _repository.GetEmbeddingAsync(cacheKey, ct);
        if (cached is not null)
        {
            return cached;
        }

        var vector = await _raw.CreateEmbeddingAsync(input, ct);
        await _repository.SaveEmbeddingAsync(cacheKey, model, textHash, vector, ct);
        return vector;
    }

    /// <summary>
    /// Returns the chat completion for the given prompts, cached per provider, model, temperature
    /// (two decimals) and the two prompts.
    /// </summary>
    /// <param name="systemPrompt">System message sent to the model.</param>
    /// <param name="userPrompt">User message sent to the model.</param>
    /// <param name="temperature">Sampling temperature; values equal after rounding to two decimals share a cache entry.</param>
    /// <param name="ct">Cancellation token passed to the repository and the AI provider.</param>
    /// <returns>The cached response when present; otherwise a fresh response, saved before returning.</returns>
    public async Task<string> CreateChatCompletionAsync(string systemPrompt, string userPrompt, decimal temperature, CancellationToken ct)
    {
        var model = GetChatModel();

        // Each prompt is hashed separately so the ':' delimiter cannot be forged by prompt content.
        // Joining the raw prompts would give ("a:b", "c") and ("a", "b:c") the same key.
        // Temperature uses the invariant culture so the key does not depend on the host's culture.
        var temperatureKey = temperature.ToString("0.00", System.Globalization.CultureInfo.InvariantCulture);
        var systemHash = HashHelper.Compute(systemPrompt);
        var userHash = HashHelper.Compute(userPrompt);
        var cacheKey = HashHelper.Compute($"chat:{_settings.Provider}:{model}:{temperatureKey}:{systemHash}:{userHash}");

        var cached = await _repository.GetChatCompletionAsync(cacheKey, ct);
        if (cached is not null)
        {
            return cached;
        }

        var response = await _raw.CreateChatCompletionAsync(systemPrompt, userPrompt, temperature, ct);
        await _repository.SaveChatCompletionAsync(cacheKey, model, temperature, response, ct);
        return response;
    }

    private bool IsOllama() => string.Equals(_settings.Provider, "Ollama", StringComparison.OrdinalIgnoreCase);

    private string GetEmbeddingModel() => IsOllama()
        ? _settings.Ollama.EmbeddingModel
        : _settings.OpenAI.EmbeddingModel;

    private string GetChatModel() => IsOllama()
        ? _settings.Ollama.ChatModel
        : _settings.OpenAI.ChatModel;
}
providedTitle, CancellationToken ct); +} diff --git a/rag-api/Services/Contracts/IRagRepository.cs b/rag-api/Services/Contracts/IRagRepository.cs new file mode 100644 index 0000000..dd2560e --- /dev/null +++ b/rag-api/Services/Contracts/IRagRepository.cs @@ -0,0 +1,16 @@ +using Api.Services.Contracts.Models; + +namespace Api.Services.Contracts; + +public interface IRagRepository +{ + Task InitializeAsync(CancellationToken ct); + Task GetDocumentByTextHashAsync(string textHash, string? sourceUrl, CancellationToken ct); + Task GetDocumentByIdAsync(string id, CancellationToken ct); + Task SaveDocumentAsync(RagDocumentRecord document, IReadOnlyList chunks, CancellationToken ct); + Task> SearchChunksAsync(float[] queryEmbedding, IReadOnlyList? targetTypes, int topK, CancellationToken ct); + Task GetEmbeddingAsync(string cacheKey, CancellationToken ct); + Task SaveEmbeddingAsync(string cacheKey, string model, string textHash, float[] vector, CancellationToken ct); + Task GetChatCompletionAsync(string cacheKey, CancellationToken ct); + Task SaveChatCompletionAsync(string cacheKey, string model, decimal temperature, string responseText, CancellationToken ct); +} diff --git a/rag-api/Services/Contracts/IRagService.cs b/rag-api/Services/Contracts/IRagService.cs new file mode 100644 index 0000000..b49b910 --- /dev/null +++ b/rag-api/Services/Contracts/IRagService.cs @@ -0,0 +1,13 @@ +using Api.Requests; +using Api.Responses; +using Api.Services.Contracts.Models; + +namespace Api.Services.Contracts; + +public interface IRagService +{ + Task IndexTextAsync(IndexDocumentRequest request, CancellationToken ct); + Task IndexPdfAsync(IFormFile file, string? documentType, string? title, string? 
namespace Api.Services.Contracts.Models
{
    /// <summary>
    /// One chunk of an indexed document together with its embedding, as built by RagService
    /// and passed to the repository.
    /// </summary>
    public sealed class RagChunkRecord
    {
        /// <summary>Chunk identifier; RagService assigns a new GUID in "N" format.</summary>
        public required string Id { get; init; }
        /// <summary>Identifier of the document this chunk belongs to.</summary>
        public required string DocumentId { get; init; }
        /// <summary>Zero-based position of the chunk in the chunker's output for the document.</summary>
        public int ChunkIndex { get; init; }
        /// <summary>Chunk text.</summary>
        public required string Text { get; init; }
        /// <summary>Embedding vector created from <see cref="Text"/>.</summary>
        public required float[] Embedding { get; init; }
    }
}
namespace Api.Services.Contracts.Models
{
    /// <summary>
    /// A chunk returned by a repository search, paired with its parent document and a relevance score.
    /// RagService groups these by document id and ranks them by <see cref="Score"/>, highest first.
    /// </summary>
    public sealed class SearchCandidateChunk
    {
        /// <summary>Document that owns the matched chunk.</summary>
        public required RagDocumentRecord Document { get; init; }
        /// <summary>The matched chunk.</summary>
        public required RagChunkRecord Chunk { get; init; }
        /// <summary>
        /// Relevance of the chunk to the query; larger is better.
        /// NOTE(review): the similarity metric is defined by the repository implementation - confirm there.
        /// </summary>
        public double Score { get; init; }
    }
}
/// <summary>
/// Turns a caller-supplied document type into a lowercase slug: the value is trimmed and
/// lower-cased, then every character other than a-z, 0-9, '_' and '-' becomes '-'.
/// </summary>
/// <param name="value">Raw document type text.</param>
/// <returns>The slug, or "unknown" when nothing is left after trimming.</returns>
private static string NormalizeType(string value)
{
    var lowered = value.Trim().ToLowerInvariant();
    var slug = Regex.Replace(lowered, "[^a-z0-9_-]", "-");

    if (string.IsNullOrWhiteSpace(slug))
    {
        return "unknown";
    }

    return slug;
}
/// <summary>
/// Hashing helper used for text hashes and cache keys.
/// </summary>
public static class HashHelper
{
    /// <summary>
    /// Computes the SHA-256 digest of the UTF-8 encoding of <paramref name="value"/>.
    /// </summary>
    /// <param name="value">Text to hash; <c>null</c> is hashed as the empty string.</param>
    /// <returns>The digest as a 64-character upper-case hexadecimal string.</returns>
    public static string Compute(string value)
    {
        // One-shot static API: no hash-algorithm instance to allocate and dispose per call.
        var digest = SHA256.HashData(Encoding.UTF8.GetBytes(value ?? string.Empty));
        return Convert.ToHexString(digest);
    }
}
/// <summary>
/// Runs a semantic search: embeds the normalized query, asks the repository for the best matching
/// chunks, and returns them grouped by document.
/// </summary>
/// <param name="request">
/// Query text, optional document-type filter and optional result count. The count is clamped to
/// 1..MaxTopK and defaults to DefaultTopK.
/// </param>
/// <param name="ct">Cancellation token passed to the AI client and the repository.</param>
/// <returns>
/// Up to topK documents ordered by their best chunk score (highest first), each carrying at most
/// its three best-scoring chunks.
/// </returns>
/// <exception cref="InvalidOperationException">The normalized query is shorter than 10 characters.</exception>
/// <remarks>
/// NOTE(review): the return type's generic argument was lost in the extracted source;
/// <c>Task&lt;SearchResponse&gt;</c> is restored from the returned value.
/// </remarks>
public async Task<SearchResponse> SearchAsync(SearchRequest request, CancellationToken ct)
{
    // Null is treated as empty, as IndexTextAsync does, so a JSON null hits the
    // "too short" check instead of reaching the normalizer.
    var query = _textExtractor.Normalize(request.QueryText ?? string.Empty);
    if (query.Length < 10)
    {
        throw new InvalidOperationException("Search query is too short.");
    }

    var topK = Math.Clamp(request.TopK ?? _settings.DefaultTopK, 1, Math.Max(1, _settings.MaxTopK));
    var queryEmbedding = await _ai.CreateEmbeddingAsync(query, ct);
    var candidates = await _repository.SearchChunksAsync(queryEmbedding, request.TargetDocumentTypes, topK, ct);

    var results = candidates
        .GroupBy(x => x.Document.Id)
        .Select(group =>
        {
            // Sort each document's chunks once: the first entry gives the document fields
            // and the document score, and the first three are the reported chunks.
            var ranked = group.OrderByDescending(x => x.Score).ToList();
            var best = ranked[0];

            return new SearchDocumentResult
            {
                DocumentId = best.Document.Id,
                DocumentType = best.Document.DocumentType,
                Title = best.Document.Title,
                SourceUrl = best.Document.SourceUrl,
                Score = best.Score,
                MatchedChunks = ranked
                    .Take(3)
                    .Select(x => new SearchChunkResult
                    {
                        ChunkId = x.Chunk.Id,
                        ChunkIndex = x.Chunk.ChunkIndex,
                        Text = x.Chunk.Text,
                        Score = x.Score
                    })
                    .ToList()
            };
        })
        .OrderByDescending(x => x.Score)
        .Take(topK)
        .ToList();

    return new SearchResponse { Results = results };
}
metadata, + CancellationToken ct) + { + var textHash = HashHelper.Compute(text); + var cached = await _repository.GetDocumentByTextHashAsync(textHash, sourceUrl, ct); + if (cached is not null) + { + return new IndexDocumentResponse + { + DocumentId = cached.Id, + TextHash = cached.TextHash, + DocumentType = cached.DocumentType, + DocumentTypeConfidence = cached.TypeConfidence, + Title = cached.Title, + Chunks = 0, + Characters = cached.Text.Length, + Cached = true + }; + } + + var classification = await _classifier.ClassifyAsync(text, documentType, title, ct); + var chunks = _chunker.Chunk(text, _settings.ChunkSize, _settings.ChunkOverlap); + var document = new RagDocumentRecord + { + Id = Guid.NewGuid().ToString("N"), + DocumentType = classification.DocumentType, + Title = classification.Title, + SourceUrl = sourceUrl, + Text = text, + TextHash = textHash, + TypeConfidence = classification.Confidence, + MetadataJson = JsonSerializer.Serialize(metadata ?? classification.Metadata), + CreatedAt = DateTimeOffset.UtcNow + }; + + var records = new List(); + for (var i = 0; i < chunks.Count; i++) + { + ct.ThrowIfCancellationRequested(); + records.Add(new RagChunkRecord + { + Id = Guid.NewGuid().ToString("N"), + DocumentId = document.Id, + ChunkIndex = i, + Text = chunks[i], + Embedding = await _ai.CreateEmbeddingAsync(chunks[i], ct) + }); + } + + await _repository.SaveDocumentAsync(document, records, ct); + return new IndexDocumentResponse + { + DocumentId = document.Id, + TextHash = document.TextHash, + DocumentType = document.DocumentType, + DocumentTypeConfidence = document.TypeConfidence, + Title = document.Title, + Chunks = records.Count, + Characters = text.Length, + Cached = false + }; + } +} diff --git a/rag-api/Services/RawAiClient.cs b/rag-api/Services/RawAiClient.cs new file mode 100644 index 0000000..634df62 --- /dev/null +++ b/rag-api/Services/RawAiClient.cs @@ -0,0 +1,116 @@ +using System.Net.Http.Headers; +using System.Text; +using System.Text.Json; +using 
/// <summary>
/// Requests an embedding for <paramref name="input"/> from the OpenAI embeddings endpoint.
/// </summary>
/// <param name="input">Text to embed.</param>
/// <param name="ct">Caller's cancellation token; linked with a timeout of at least 15 seconds.</param>
/// <returns>The first embedding in the response, as single-precision values.</returns>
/// <exception cref="InvalidOperationException">The API key is not configured, or OpenAI returned a non-success status.</exception>
/// <remarks>
/// NOTE(review): the return type's generic argument was lost in the extracted source;
/// <c>Task&lt;float[]&gt;</c> is restored from the returned array of <c>GetSingle()</c> values.
/// </remarks>
private async Task<float[]> CreateOpenAiEmbeddingAsync(string input, CancellationToken ct)
{
    var apiKey = _settings.OpenAI.ApiKey;
    if (string.IsNullOrWhiteSpace(apiKey))
    {
        throw new InvalidOperationException("OpenAI API key is missing.");
    }

    using var message = new HttpRequestMessage(HttpMethod.Post, "https://api.openai.com/v1/embeddings");
    message.Headers.Authorization = new AuthenticationHeaderValue("Bearer", apiKey);
    message.Content = ToJson(new { model = _settings.OpenAI.EmbeddingModel, input });

    // The configured timeout is enforced through a linked token; values below 15 seconds are raised to 15.
    var timeout = TimeSpan.FromSeconds(Math.Max(15, _settings.OpenAI.TimeoutSeconds));
    using var timeoutSource = CancellationTokenSource.CreateLinkedTokenSource(ct);
    timeoutSource.CancelAfter(timeout);

    using var reply = await _http.SendAsync(message, timeoutSource.Token);
    var body = await reply.Content.ReadAsStringAsync(timeoutSource.Token);
    if (!reply.IsSuccessStatusCode)
    {
        throw new InvalidOperationException($"OpenAI embeddings failed: {(int)reply.StatusCode} {body}");
    }

    using var parsed = JsonDocument.Parse(body);
    var values = parsed.RootElement.GetProperty("data")[0].GetProperty("embedding");

    var vector = new float[values.GetArrayLength()];
    var index = 0;
    foreach (var item in values.EnumerateArray())
    {
        vector[index++] = item.GetSingle();
    }

    return vector;
}
"{}"; + } + + private async Task CreateOllamaEmbeddingAsync(string input, CancellationToken ct) + { + var baseUrl = _settings.Ollama.BaseUrl.TrimEnd('/'); + using var cts = CancellationTokenSource.CreateLinkedTokenSource(ct); + cts.CancelAfter(TimeSpan.FromSeconds(Math.Max(30, _settings.Ollama.TimeoutSeconds))); + using var response = await _http.PostAsync($"{baseUrl}/api/embeddings", ToJson(new { model = _settings.Ollama.EmbeddingModel, prompt = input }), cts.Token); + var json = await response.Content.ReadAsStringAsync(cts.Token); + if (!response.IsSuccessStatusCode) throw new InvalidOperationException($"Ollama embeddings failed: {(int)response.StatusCode} {json}"); + using var doc = JsonDocument.Parse(json); + return doc.RootElement.GetProperty("embedding").EnumerateArray().Select(x => x.GetSingle()).ToArray(); + } + + private async Task CreateOllamaChatCompletionAsync(string systemPrompt, string userPrompt, decimal temperature, CancellationToken ct) + { + var baseUrl = _settings.Ollama.BaseUrl.TrimEnd('/'); + using var cts = CancellationTokenSource.CreateLinkedTokenSource(ct); + cts.CancelAfter(TimeSpan.FromSeconds(Math.Max(30, _settings.Ollama.TimeoutSeconds))); + using var response = await _http.PostAsync($"{baseUrl}/api/chat", ToJson(new + { + model = _settings.Ollama.ChatModel, + stream = false, + format = "json", + messages = new[] + { + new { role = "system", content = systemPrompt }, + new { role = "user", content = userPrompt } + }, + options = new { temperature = (float)temperature } + }), cts.Token); + var json = await response.Content.ReadAsStringAsync(cts.Token); + if (!response.IsSuccessStatusCode) throw new InvalidOperationException($"Ollama chat failed: {(int)response.StatusCode} {json}"); + using var doc = JsonDocument.Parse(json); + return doc.RootElement.GetProperty("message").GetProperty("content").GetString() ?? 
"{}"; + } + + private static StringContent ToJson(T payload) => new(JsonSerializer.Serialize(payload, JsonOptions), Encoding.UTF8, "application/json"); +} diff --git a/rag-api/Services/SqlRagRepository.cs b/rag-api/Services/SqlRagRepository.cs new file mode 100644 index 0000000..730ca1d --- /dev/null +++ b/rag-api/Services/SqlRagRepository.cs @@ -0,0 +1,238 @@ +using Microsoft.Data.SqlClient; +using Api.Services.Contracts; +using Api.Services.Contracts.Models; + +namespace Api.Services; + +public sealed class SqlRagRepository : IRagRepository +{ + private readonly string _connectionString; + + public SqlRagRepository(IConfiguration configuration) + { + _connectionString = configuration.GetConnectionString("RagDb") + ?? throw new InvalidOperationException("Connection string 'RagDb' is missing."); + } + + public async Task InitializeAsync(CancellationToken ct) + { + await EnsureDatabaseExistsAsync(ct); + var sql = await File.ReadAllTextAsync(Path.Combine(AppContext.BaseDirectory, "Database", "schema.sql"), ct); + await using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(ct); + foreach (var commandText in sql.Split("GO", StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)) + { + await using var command = new SqlCommand(commandText, connection); + await command.ExecuteNonQueryAsync(ct); + } + } + + public async Task GetDocumentByTextHashAsync(string textHash, string? 
using Microsoft.Data.SqlClient;
using Api.Services.Contracts;
using Api.Services.Contracts.Models;

namespace Api.Services;

/// <summary>
/// SQL Server backed <see cref="IRagRepository"/>: stores documents and their embedded chunks, scores
/// chunks against a query embedding in memory, and keeps embedding / chat-completion caches.
/// </summary>
public sealed class SqlRagRepository : IRagRepository
{
    // SQL Server error numbers raised when an INSERT hits an existing primary key / unique index row.
    private const int UniqueConstraintViolation = 2627;
    private const int UniqueIndexViolation = 2601;

    // A batch separator is a line that contains only "GO". Matching whole lines avoids splitting on the
    // letters "GO" that happen to appear inside an identifier, string or comment of the schema script.
    private static readonly System.Text.RegularExpressions.Regex BatchSeparator = new(
        @"^[ \t]*GO[ \t]*\r?$",
        System.Text.RegularExpressions.RegexOptions.Multiline
        | System.Text.RegularExpressions.RegexOptions.IgnoreCase
        | System.Text.RegularExpressions.RegexOptions.CultureInvariant);

    private readonly string _connectionString;

    /// <summary>Reads the <c>RagDb</c> connection string.</summary>
    /// <exception cref="InvalidOperationException">The connection string is not configured.</exception>
    public SqlRagRepository(IConfiguration configuration)
    {
        _connectionString = configuration.GetConnectionString("RagDb")
            ?? throw new InvalidOperationException("Connection string 'RagDb' is missing.");
    }

    /// <summary>
    /// Creates the database when it does not exist and then runs <c>Database/schema.sql</c>
    /// (resolved under <see cref="AppContext.BaseDirectory"/>) batch by batch.
    /// </summary>
    public async Task InitializeAsync(CancellationToken ct)
    {
        await EnsureDatabaseExistsAsync(ct);
        var script = await File.ReadAllTextAsync(Path.Combine(AppContext.BaseDirectory, "Database", "schema.sql"), ct);

        await using var connection = await OpenConnectionAsync(ct);
        foreach (var batch in BatchSeparator.Split(script))
        {
            var commandText = batch.Trim();
            if (commandText.Length == 0)
            {
                continue;
            }

            await using var command = new SqlCommand(commandText, connection);
            await command.ExecuteNonQueryAsync(ct);
        }
    }

    /// <summary>
    /// Returns the most recently created document with the given text hash. When
    /// <paramref name="sourceUrl"/> is not null the document must also have that source URL.
    /// </summary>
    /// <returns>The document, or <c>null</c> when none matches.</returns>
    public async Task<RagDocumentRecord?> GetDocumentByTextHashAsync(string textHash, string? sourceUrl, CancellationToken ct)
    {
        const string sql = """
            SELECT TOP 1 Id, DocumentType, Title, SourceUrl, RawText, TextHash, TypeConfidence, MetadataJson, CreatedAt
            FROM RagDocuments
            WHERE TextHash = @TextHash AND (@SourceUrl IS NULL OR SourceUrl = @SourceUrl)
            ORDER BY CreatedAt DESC
            """;
        await using var connection = await OpenConnectionAsync(ct);
        await using var command = new SqlCommand(sql, connection);
        command.Parameters.AddWithValue("@TextHash", textHash);
        command.Parameters.AddWithValue("@SourceUrl", (object?)sourceUrl ?? DBNull.Value);
        await using var reader = await command.ExecuteReaderAsync(ct);
        return await reader.ReadAsync(ct) ? ReadDocument(reader) : null;
    }

    /// <summary>Loads a document by primary key.</summary>
    /// <returns>The document, or <c>null</c> when the id is unknown.</returns>
    public async Task<RagDocumentRecord?> GetDocumentByIdAsync(string id, CancellationToken ct)
    {
        const string sql = """
            SELECT Id, DocumentType, Title, SourceUrl, RawText, TextHash, TypeConfidence, MetadataJson, CreatedAt
            FROM RagDocuments
            WHERE Id = @Id
            """;
        await using var connection = await OpenConnectionAsync(ct);
        await using var command = new SqlCommand(sql, connection);
        command.Parameters.AddWithValue("@Id", id);
        await using var reader = await command.ExecuteReaderAsync(ct);
        return await reader.ReadAsync(ct) ? ReadDocument(reader) : null;
    }

    /// <summary>Inserts a document and all of its chunks in a single transaction.</summary>
    /// <remarks>Any failure rolls the transaction back and rethrows the original exception.</remarks>
    public async Task SaveDocumentAsync(RagDocumentRecord document, IReadOnlyList<RagChunkRecord> chunks, CancellationToken ct)
    {
        await using var connection = await OpenConnectionAsync(ct);
        await using var tx = (SqlTransaction)await connection.BeginTransactionAsync(ct);
        try
        {
            const string insertDoc = """
                INSERT INTO RagDocuments (Id, DocumentType, Title, SourceUrl, RawText, TextHash, TypeConfidence, MetadataJson, CreatedAt)
                VALUES (@Id, @DocumentType, @Title, @SourceUrl, @RawText, @TextHash, @TypeConfidence, @MetadataJson, @CreatedAt)
                """;
            await using (var command = new SqlCommand(insertDoc, connection, tx))
            {
                command.Parameters.AddWithValue("@Id", document.Id);
                command.Parameters.AddWithValue("@DocumentType", document.DocumentType);
                command.Parameters.AddWithValue("@Title", document.Title);
                command.Parameters.AddWithValue("@SourceUrl", (object?)document.SourceUrl ?? DBNull.Value);
                command.Parameters.AddWithValue("@RawText", document.Text);
                command.Parameters.AddWithValue("@TextHash", document.TextHash);
                command.Parameters.AddWithValue("@TypeConfidence", document.TypeConfidence);
                command.Parameters.AddWithValue("@MetadataJson", document.MetadataJson);
                command.Parameters.AddWithValue("@CreatedAt", document.CreatedAt.UtcDateTime);
                await command.ExecuteNonQueryAsync(ct);
            }

            const string insertChunk = """
                INSERT INTO RagChunks (Id, DocumentId, ChunkIndex, Text, Embedding)
                VALUES (@Id, @DocumentId, @ChunkIndex, @Text, @Embedding)
                """;
            foreach (var chunk in chunks)
            {
                await using var command = new SqlCommand(insertChunk, connection, tx);
                command.Parameters.AddWithValue("@Id", chunk.Id);
                command.Parameters.AddWithValue("@DocumentId", document.Id);
                command.Parameters.AddWithValue("@ChunkIndex", chunk.ChunkIndex);
                command.Parameters.AddWithValue("@Text", chunk.Text);
                command.Parameters.AddWithValue("@Embedding", VectorSerializer.ToBytes(chunk.Embedding));
                await command.ExecuteNonQueryAsync(ct);
            }

            await tx.CommitAsync(ct);
        }
        catch
        {
            try
            {
                // Roll back without the caller's token: when the failure *is* a cancellation, passing the
                // cancelled token would make the rollback itself throw and hide the original exception.
                await tx.RollbackAsync(CancellationToken.None);
            }
            catch
            {
                // A failed rollback (e.g. broken connection) must not replace the original exception;
                // disposing the transaction/connection discards the uncommitted work anyway.
            }

            throw;
        }
    }

    /// <summary>
    /// Scores every stored chunk (optionally restricted to <paramref name="targetTypes"/>) against
    /// <paramref name="queryEmbedding"/> by cosine similarity and returns the best candidates.
    /// </summary>
    /// <param name="queryEmbedding">Embedding of the search query.</param>
    /// <param name="targetTypes">Document types to include (trimmed, case-insensitive); null or empty means all.</param>
    /// <param name="topK">Requested result count; up to four times as many candidates are returned.</param>
    /// <param name="ct">Cancels the database calls.</param>
    /// <returns>Candidates ordered by descending score.</returns>
    public async Task<IReadOnlyList<SearchCandidateChunk>> SearchChunksAsync(float[] queryEmbedding, IReadOnlyList<string>? targetTypes, int topK, CancellationToken ct)
    {
        var types = targetTypes?.Where(x => !string.IsNullOrWhiteSpace(x)).Select(x => x.Trim().ToLowerInvariant()).Distinct().ToArray() ?? [];
        var sql = """
            SELECT d.Id, d.DocumentType, d.Title, d.SourceUrl, d.RawText, d.TextHash, d.TypeConfidence, d.MetadataJson, d.CreatedAt,
                   c.Id, c.DocumentId, c.ChunkIndex, c.Text, c.Embedding
            FROM RagChunks c
            INNER JOIN RagDocuments d ON d.Id = c.DocumentId
            """;

        if (types.Length > 0)
        {
            // Only parameter *names* are concatenated; the values are bound as parameters below.
            sql += " WHERE LOWER(d.DocumentType) IN (" + string.Join(',', types.Select((_, i) => $"@Type{i}")) + ")";
        }

        await using var connection = await OpenConnectionAsync(ct);
        await using var command = new SqlCommand(sql, connection);
        for (var i = 0; i < types.Length; i++)
        {
            command.Parameters.AddWithValue($"@Type{i}", types[i]);
        }

        await using var reader = await command.ExecuteReaderAsync(ct);
        var candidates = new List<SearchCandidateChunk>();
        while (await reader.ReadAsync(ct))
        {
            var doc = ReadDocument(reader, 0);
            var chunk = new RagChunkRecord
            {
                Id = reader.GetString(9),
                DocumentId = reader.GetString(10),
                ChunkIndex = reader.GetInt32(11),
                Text = reader.GetString(12),
                Embedding = VectorSerializer.FromBytes((byte[])reader[13])
            };
            candidates.Add(new SearchCandidateChunk
            {
                Document = doc,
                Chunk = chunk,
                Score = VectorSerializer.CosineSimilarity(queryEmbedding, chunk.Embedding)
            });
        }

        return candidates
            .OrderByDescending(x => x.Score)
            .Take(Math.Max(topK * 4, topK))
            .ToList();
    }

    /// <summary>Looks up a cached embedding.</summary>
    /// <returns>The cached vector, or <c>null</c> on a cache miss.</returns>
    public async Task<float[]?> GetEmbeddingAsync(string cacheKey, CancellationToken ct)
    {
        const string sql = "SELECT Vector FROM RagEmbeddingCache WHERE CacheKey = @CacheKey";
        await using var connection = await OpenConnectionAsync(ct);
        await using var command = new SqlCommand(sql, connection);
        command.Parameters.AddWithValue("@CacheKey", cacheKey);
        var value = await command.ExecuteScalarAsync(ct);
        return value is byte[] bytes ? VectorSerializer.FromBytes(bytes) : null;
    }

    /// <summary>Stores an embedding under <paramref name="cacheKey"/> unless an entry already exists.</summary>
    public async Task SaveEmbeddingAsync(string cacheKey, string model, string textHash, float[] vector, CancellationToken ct)
    {
        const string sql = """
            IF NOT EXISTS (SELECT 1 FROM RagEmbeddingCache WHERE CacheKey = @CacheKey)
            INSERT INTO RagEmbeddingCache (CacheKey, Model, TextHash, Vector, CreatedAt)
            VALUES (@CacheKey, @Model, @TextHash, @Vector, SYSUTCDATETIME())
            """;
        await using var connection = await OpenConnectionAsync(ct);
        await using var command = new SqlCommand(sql, connection);
        command.Parameters.AddWithValue("@CacheKey", cacheKey);
        command.Parameters.AddWithValue("@Model", model);
        command.Parameters.AddWithValue("@TextHash", textHash);
        command.Parameters.AddWithValue("@Vector", VectorSerializer.ToBytes(vector));
        await ExecuteCacheInsertAsync(command, ct);
    }

    /// <summary>Looks up a cached chat completion.</summary>
    /// <returns>The cached response text, or <c>null</c> on a cache miss.</returns>
    public async Task<string?> GetChatCompletionAsync(string cacheKey, CancellationToken ct)
    {
        const string sql = "SELECT ResponseText FROM RagChatCompletionCache WHERE CacheKey = @CacheKey";
        await using var connection = await OpenConnectionAsync(ct);
        await using var command = new SqlCommand(sql, connection);
        command.Parameters.AddWithValue("@CacheKey", cacheKey);
        return await command.ExecuteScalarAsync(ct) as string;
    }

    /// <summary>Stores a chat completion under <paramref name="cacheKey"/> unless an entry already exists.</summary>
    public async Task SaveChatCompletionAsync(string cacheKey, string model, decimal temperature, string responseText, CancellationToken ct)
    {
        const string sql = """
            IF NOT EXISTS (SELECT 1 FROM RagChatCompletionCache WHERE CacheKey = @CacheKey)
            INSERT INTO RagChatCompletionCache (CacheKey, Model, Temperature, ResponseText, CreatedAt)
            VALUES (@CacheKey, @Model, @Temperature, @ResponseText, SYSUTCDATETIME())
            """;
        await using var connection = await OpenConnectionAsync(ct);
        await using var command = new SqlCommand(sql, connection);
        command.Parameters.AddWithValue("@CacheKey", cacheKey);
        command.Parameters.AddWithValue("@Model", model);
        command.Parameters.AddWithValue("@Temperature", temperature);
        command.Parameters.AddWithValue("@ResponseText", responseText);
        await ExecuteCacheInsertAsync(command, ct);
    }

    /// <summary>Opens a connection to the RAG database; the connection is disposed if opening fails.</summary>
    private async Task<SqlConnection> OpenConnectionAsync(CancellationToken ct)
    {
        var connection = new SqlConnection(_connectionString);
        try
        {
            await connection.OpenAsync(ct);
            return connection;
        }
        catch
        {
            await connection.DisposeAsync();
            throw;
        }
    }

    /// <summary>
    /// Runs an "insert if absent" cache statement. Two concurrent writers can both pass the
    /// <c>IF NOT EXISTS</c> check; the loser's duplicate-key error means the entry is already cached,
    /// so it is treated as success.
    /// </summary>
    private static async Task ExecuteCacheInsertAsync(SqlCommand command, CancellationToken ct)
    {
        try
        {
            await command.ExecuteNonQueryAsync(ct);
        }
        catch (SqlException ex) when (ex.Number is UniqueConstraintViolation or UniqueIndexViolation)
        {
            // Entry was inserted concurrently by another request — nothing left to do.
        }
    }

    /// <summary>Maps nine consecutive columns, starting at <paramref name="offset"/>, to a document record.</summary>
    private static RagDocumentRecord ReadDocument(SqlDataReader reader, int offset = 0) => new()
    {
        Id = reader.GetString(offset),
        DocumentType = reader.GetString(offset + 1),
        Title = reader.GetString(offset + 2),
        SourceUrl = reader.IsDBNull(offset + 3) ? null : reader.GetString(offset + 3),
        Text = reader.GetString(offset + 4),
        TextHash = reader.GetString(offset + 5),
        TypeConfidence = Convert.ToDouble(reader.GetValue(offset + 6)),
        MetadataJson = reader.GetString(offset + 7),
        // CreatedAt is written as UtcDateTime in SaveDocumentAsync, so it is read back with a zero offset.
        CreatedAt = new DateTimeOffset(reader.GetDateTime(offset + 8), TimeSpan.Zero)
    };

    /// <summary>
    /// Connects to <c>master</c> and creates the database named in the connection string when it is missing.
    /// Does nothing when the connection string has no initial catalog.
    /// </summary>
    private async Task EnsureDatabaseExistsAsync(CancellationToken ct)
    {
        var builder = new SqlConnectionStringBuilder(_connectionString);
        var databaseName = builder.InitialCatalog;
        if (string.IsNullOrWhiteSpace(databaseName))
        {
            return;
        }

        builder.InitialCatalog = "master";
        await using var connection = new SqlConnection(builder.ConnectionString);
        await connection.OpenAsync(ct);

        // CREATE DATABASE cannot take the name as a parameter, so it is embedded twice-quoted:
        // "]" is doubled for the bracketed identifier and "'" is doubled for the EXEC('...') string literal.
        var safeName = databaseName.Replace("]", "]]").Replace("'", "''");
        await using var command = new SqlCommand($"IF DB_ID(@DatabaseName) IS NULL EXEC('CREATE DATABASE [{safeName}]')", connection);
        command.Parameters.AddWithValue("@DatabaseName", databaseName);
        await command.ExecuteNonQueryAsync(ct);
    }
}
using System.Text;
using Api.Services.Contracts;
using UglyToad.PdfPig;

namespace Api.Services;

/// <summary>Extracts plain text from PDF streams and normalizes whitespace.</summary>
public sealed class TextExtractor : ITextExtractor
{
    /// <summary>Reads the text of every page of a PDF and returns it whitespace-normalized.</summary>
    /// <param name="stream">Stream containing the PDF document.</param>
    /// <param name="ct">Checked before each page is processed.</param>
    /// <returns>A completed task holding the normalized text of all pages.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
    /// <exception cref="OperationCanceledException"><paramref name="ct"/> was cancelled between pages.</exception>
    /// <remarks>
    /// The work is done synchronously before the task is returned, so exceptions are thrown
    /// directly from the call rather than captured in the returned task.
    /// </remarks>
    public Task<string> ExtractPdfAsync(Stream stream, CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(stream);

        using var document = PdfDocument.Open(stream);
        var builder = new StringBuilder();
        foreach (var page in document.GetPages())
        {
            ct.ThrowIfCancellationRequested();
            builder.AppendLine(page.Text);
            builder.AppendLine();
        }

        return Task.FromResult(Normalize(builder.ToString()));
    }

    /// <summary>Collapses every run of whitespace into a single space and trims both ends.</summary>
    /// <param name="value">Text to normalize.</param>
    /// <returns>The normalized text, or an empty string when <paramref name="value"/> is null or whitespace.</returns>
    public string Normalize(string value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return string.Empty;
        }

        // A null separator array makes Split break on all Unicode whitespace characters.
        return string.Join(' ', value.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries)).Trim();
    }
}
namespace Api.Services
{
    /// <summary>Converts embedding vectors to and from raw byte blobs and compares them.</summary>
    public static class VectorSerializer
    {
        /// <summary>Copies the in-memory bytes of <paramref name="vector"/> into a new array (4 bytes per element).</summary>
        /// <exception cref="ArgumentNullException"><paramref name="vector"/> is null.</exception>
        public static byte[] ToBytes(float[] vector)
        {
            ArgumentNullException.ThrowIfNull(vector);

            var bytes = new byte[vector.Length * sizeof(float)];
            Buffer.BlockCopy(vector, 0, bytes, 0, bytes.Length);
            return bytes;
        }

        /// <summary>Rebuilds a vector from a blob produced by <see cref="ToBytes"/>.</summary>
        /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
        /// <exception cref="ArgumentException">The blob length is not a multiple of 4 bytes.</exception>
        public static float[] FromBytes(byte[] bytes)
        {
            ArgumentNullException.ThrowIfNull(bytes);

            // A truncated/corrupt blob would otherwise fail inside Buffer.BlockCopy with an
            // unrelated "offset and length were out of bounds" message.
            if (bytes.Length % sizeof(float) != 0)
            {
                throw new ArgumentException(
                    $"Vector blob length {bytes.Length} is not a multiple of {sizeof(float)} bytes.",
                    nameof(bytes));
            }

            var vector = new float[bytes.Length / sizeof(float)];
            Buffer.BlockCopy(bytes, 0, vector, 0, bytes.Length);
            return vector;
        }

        /// <summary>Computes the cosine similarity of two vectors.</summary>
        /// <returns>
        /// The cosine of the angle between <paramref name="a"/> and <paramref name="b"/>, or 0 when the
        /// vectors are empty, differ in length, or either has zero magnitude.
        /// </returns>
        /// <exception cref="ArgumentNullException">Either vector is null.</exception>
        public static double CosineSimilarity(float[] a, float[] b)
        {
            ArgumentNullException.ThrowIfNull(a);
            ArgumentNullException.ThrowIfNull(b);

            if (a.Length == 0 || a.Length != b.Length)
            {
                return 0;
            }

            double dot = 0, magA = 0, magB = 0;
            for (var i = 0; i < a.Length; i++)
            {
                // Widen before multiplying: a float*float product is rounded to (and can overflow in)
                // single precision before it ever reaches the double accumulator.
                double x = a[i];
                double y = b[i];
                dot += x * y;
                magA += x * x;
                magB += y * y;
            }

            return magA == 0 || magB == 0 ? 0 : dot / (Math.Sqrt(magA) * Math.Sqrt(magB));
        }
    }
}

namespace Api.Settings
{
    /// <summary>AI provider selection plus the per-provider settings.</summary>
    public sealed class AiSettings
    {
        /// <summary>Name of the provider to use; defaults to "OpenAI".</summary>
        public string Provider { get; set; } = "OpenAI";

        /// <summary>Settings for the OpenAI provider.</summary>
        public OpenAiProviderSettings OpenAI { get; set; } = new();

        /// <summary>Settings for the Ollama provider.</summary>
        public OllamaProviderSettings Ollama { get; set; } = new();
    }

    /// <summary>Connection and model settings for OpenAI.</summary>
    public sealed class OpenAiProviderSettings
    {
        public string ApiKey { get; set; } = string.Empty;
        public string ChatModel { get; set; } = "gpt-4o-mini";
        public string EmbeddingModel { get; set; } = "text-embedding-3-small";
        public int TimeoutSeconds { get; set; } = 90;
    }

    /// <summary>Connection and model settings for Ollama.</summary>
    public sealed class OllamaProviderSettings
    {
        public string BaseUrl { get; set; } = "http://localhost:11434";
        public string ChatModel { get; set; } = "llama3.1:8b";
        public string EmbeddingModel { get; set; } = "nomic-embed-text";
        public int TimeoutSeconds { get; set; } = 180;
    }
}
namespace Api.Settings;

/// <summary>Options bound from the <c>InternalApi</c> configuration section.</summary>
public sealed class InternalApiSettings
{
    /// <summary>Configured internal API key; empty by default.</summary>
    public string ApiKey { get; set; } = "";

    /// <summary>Whether callers must present the API key; off by default.</summary>
    public bool RequireApiKey { get; set; }
}

/// <summary>Options bound from the <c>Rag</c> configuration section.</summary>
public sealed class RagSettings
{
    /// <summary>Default 8.</summary>
    public int MaxFileSizeMb { get; set; } = 8;

    /// <summary>Default 900.</summary>
    public int ChunkSize { get; set; } = 900;

    /// <summary>Default 150.</summary>
    public int ChunkOverlap { get; set; } = 150;

    /// <summary>Default 60000.</summary>
    public int MaxTextChars { get; set; } = 60_000;

    /// <summary>Default 20.</summary>
    public int DefaultTopK { get; set; } = 20;

    /// <summary>Default 50.</summary>
    public int MaxTopK { get; set; } = 50;

    /// <summary>Off by default.</summary>
    public bool ClassifyWithAi { get; set; }
}
enable + enable + Linux + Api + + + + + + + + + + + + + + + + PreserveNewest + + + diff --git a/web/wwwroot/js/myai.js b/web/wwwroot/js/myai.js index 3ccf031..b182d7d 100644 --- a/web/wwwroot/js/myai.js +++ b/web/wwwroot/js/myai.js @@ -89,7 +89,7 @@ "cv.noConsent": "GDPR consent is required.", "cv.processing": "Processing...", "cv.extracting": "Extracting CV and matching job...", - "cv.processingLong": "Processing CV PDF and job input.", + "cv.processingLong": "Processing CV PDF and job input. Backend endpoints must be available.", "cv.cvFailed": "CV extraction failed", "cv.matchFailed": "Job matching failed", "cv.completed": "Match completed.", @@ -182,7 +182,7 @@ "cv.noConsent": "Consimțământul GDPR este obligatoriu.", "cv.processing": "Se procesează...", "cv.extracting": "Se extrage CV-ul și se compară jobul...", - "cv.processingLong": "Se procesează PDF-ul și informațiile despre job.", + "cv.processingLong": "Se procesează PDF-ul și informațiile despre job. Endpoint-urile backend trebuie să fie disponibile.", "cv.cvFailed": "Extragerea CV-ului a eșuat", "cv.matchFailed": "Matching-ul jobului a eșuat", "cv.completed": "Matching finalizat.",