diff --git a/RAG_SPLIT_README.md b/RAG_SPLIT_README.md deleted file mode 100644 index 6a39dcc..0000000 --- a/RAG_SPLIT_README.md +++ /dev/null @@ -1,116 +0,0 @@ -# MyAi RAG split cleanup - -## Public `api` - -The existing `api` project is now only the public gateway for the existing frontend. - -It keeps: - -- contact API -- file download API -- Google/config APIs -- health API -- `api/rag/*` proxy endpoints - -It no longer contains local RAG processing code. The removed responsibilities are: - -- PDF extraction -- chunking -- embeddings -- vector storage -- OpenAI/Ollama calls -- job text extraction -- CV matching business logic - -`api/Controllers/RagController.cs` is intentionally kept. It proxies the current frontend calls: - -- `POST /api/rag/cv` -> `cv-matcher-api /api/cv/upload` -- `POST /api/rag/match-job` -> `cv-matcher-api /api/cv/match-job` - -Required public API config: - -```json -"CvMatcherApi": { - "BaseUrl": "http://cv-matcher-api:8080", - "InternalApiKey": "change-this-internal-key" -} -``` - -## `cv-matcher-api` - -Business API for CV/job workflows. - -Main endpoints: - -- `POST /api/cv/upload` -- `POST /api/cv/match-job` -- `POST /api/cv/find-jobs` -- `GET /health` -- Swagger: `/swagger` - -Responsibilities: - -- CV matcher business logic -- job URL/text extraction -- final LLM scoring -- result persistence -- email sending -- calls `rag-api` for generic semantic indexing/search - -## `rag-api` - -Generic semantic search API. - -Main endpoints: - -- `POST /api/rag/documents` -- `POST /api/rag/documents/json` -- `POST /api/rag/search` -- `GET /api/rag/documents/{id}` -- `GET /health` -- Swagger: `/swagger` - -Responsibilities: - -- generic document indexing -- automatic document type classification when type is missing -- PDF/text extraction -- chunking -- embedding creation -- embedding and chat completion cache -- semantic search over generic documents - -## Logging and Swagger - -All three APIs now have: - -- Serilog startup logging -- Serilog request logging -- structured JSON console logs -- health endpoint -- Swagger/OpenAPI support - -Swagger is enabled by default and can be disabled per service with: - -```json -"Swagger": { - "Enabled": false -} -``` - -## Internal API security - -Both internal APIs support API-key protection: - -```json -"InternalApi": { - "RequireApiKey": true, - "ApiKey": "change-this-internal-key" -} -``` - -Requests must include: - -```http -X-Internal-Api-Key: change-this-internal-key -``` diff --git a/cv-matcher-api/Data/CvMatcherDbContext.cs b/cv-matcher-api/Data/CvMatcherDbContext.cs new file mode 100644 index 0000000..37d2c4d --- /dev/null +++ b/cv-matcher-api/Data/CvMatcherDbContext.cs @@ -0,0 +1,40 @@ +using Api.Data.Entities; +using Microsoft.EntityFrameworkCore; + +namespace Api.Data; + +public sealed class CvMatcherDbContext : DbContext +{ + public CvMatcherDbContext(DbContextOptions options) : base(options) + { + } + + public DbSet CvMatchResults => Set(); + public DbSet CvMatcherChatCache => Set(); + + protected override void OnModelCreating(ModelBuilder modelBuilder) + { + modelBuilder.Entity(entity => + { + entity.ToTable("CvMatchResults"); + entity.HasKey(x => x.Id); + entity.Property(x => x.Id).HasMaxLength(64); + entity.Property(x => x.CvDocumentId).HasMaxLength(64).IsRequired(); + entity.Property(x => x.JobDocumentId).HasMaxLength(64).IsRequired(); + entity.Property(x => x.ResultJson).IsRequired(); + entity.Property(x => x.CreatedAt).HasDefaultValueSql("SYSUTCDATETIME()"); + entity.HasIndex(x => new { x.CvDocumentId, x.JobDocumentId }).IsUnique(); + }); + + modelBuilder.Entity(entity => + { + entity.ToTable("CvMatcherChatCache"); + entity.HasKey(x => x.CacheKey); + entity.Property(x => x.CacheKey).HasMaxLength(64); + entity.Property(x => x.Model).HasMaxLength(120).IsRequired(); + entity.Property(x => x.Temperature).HasColumnType("decimal(4,2)"); + entity.Property(x => x.ResponseText).IsRequired(); + entity.Property(x => x.CreatedAt).HasDefaultValueSql("SYSUTCDATETIME()"); + }); + } +} diff --git a/cv-matcher-api/Data/Entities/CvMatchResultEntity.cs b/cv-matcher-api/Data/Entities/CvMatchResultEntity.cs new file mode 100644 index 0000000..2776358 --- /dev/null +++ b/cv-matcher-api/Data/Entities/CvMatchResultEntity.cs @@ -0,0 +1,11 @@ +namespace Api.Data.Entities; + +public sealed class CvMatchResultEntity +{ + public string Id { get; set; } = string.Empty; + public string CvDocumentId { get; set; } = string.Empty; + public string JobDocumentId { get; set; } = string.Empty; + public string ResultJson { get; set; } = string.Empty; + public int Score { get; set; } + public DateTime CreatedAt { get; set; } = DateTime.UtcNow; +} diff --git a/cv-matcher-api/Data/Entities/CvMatcherChatCacheEntity.cs b/cv-matcher-api/Data/Entities/CvMatcherChatCacheEntity.cs new file mode 100644 index 0000000..4ad845a --- /dev/null +++ b/cv-matcher-api/Data/Entities/CvMatcherChatCacheEntity.cs @@ -0,0 +1,10 @@ +namespace Api.Data.Entities; + +public sealed class CvMatcherChatCacheEntity +{ + public string CacheKey { get; set; } = string.Empty; + public string Model { get; set; } = string.Empty; + public decimal Temperature { get; set; } + public string ResponseText { get; set; } = string.Empty; + public DateTime CreatedAt { get; set; } = DateTime.UtcNow; +} diff --git a/cv-matcher-api/Program.cs b/cv-matcher-api/Program.cs index ef641ec..151984d 100644 --- a/cv-matcher-api/Program.cs +++ b/cv-matcher-api/Program.cs @@ -1,10 +1,12 @@ using Azure.Identity; +using Api.Data; using Api.Services; using Api.Services.Contracts; using Api.Settings; using Microsoft.AspNetCore.Diagnostics; using Serilog; using System.Reflection; +using Microsoft.EntityFrameworkCore; DotNetEnv.Env.Load(); @@ -69,7 +71,10 @@ try builder.Services.AddHttpClient(); builder.Services.AddHttpClient(); builder.Services.AddHttpClient(); - builder.Services.AddSingleton(); + builder.Services.AddDbContext(options => + options.UseSqlServer(builder.Configuration.GetConnectionString("CvMatcherDb") + ?? throw new InvalidOperationException("Connection string 'CvMatcherDb' is missing."))); + builder.Services.AddScoped(); builder.Services.AddScoped(); builder.Services.AddSingleton(); @@ -160,6 +165,14 @@ try app.MapControllers(); app.MapGet("/health", () => Results.Ok(new { status = "ok", service = "cv-matcher-api", version = appVersion, timeUtc = DateTimeOffset.UtcNow })); + + Log.Information("Running EfCore DbMigrations if any"); + using (var scope = app.Services.CreateScope()) + { + var db = scope.ServiceProvider.GetRequiredService(); + db.Database.Migrate(); + } + Log.Information("{Service} startup complete", "cv-matcher-api"); app.Run(); } diff --git a/cv-matcher-api/Services/EfMatcherRepository.cs b/cv-matcher-api/Services/EfMatcherRepository.cs new file mode 100644 index 0000000..0e84095 --- /dev/null +++ b/cv-matcher-api/Services/EfMatcherRepository.cs @@ -0,0 +1,88 @@ +using System.Text.Json; +using Api.Data; +using Api.Data.Entities; +using Api.Responses; +using Api.Services.Contracts; +using Microsoft.EntityFrameworkCore; + +namespace Api.Services; + +public sealed class EfMatcherRepository : IMatcherRepository +{ + private readonly CvMatcherDbContext _db; + private readonly ILogger _logger; + + public EfMatcherRepository(CvMatcherDbContext db, ILogger logger) + { + _db = db; + _logger = logger; + } + + public async Task InitializeAsync(CancellationToken ct) + { + _logger.LogInformation("Ensuring CV matcher database schema exists using EF Core"); + await _db.Database.EnsureCreatedAsync(ct); + } + + public async Task GetMatchAsync(string cvDocumentId, string jobDocumentId, CancellationToken ct) + { + var json = await _db.CvMatchResults + .AsNoTracking() + .Where(x => x.CvDocumentId == cvDocumentId && x.JobDocumentId == jobDocumentId) + .Select(x => x.ResultJson) + .FirstOrDefaultAsync(ct); + + if (string.IsNullOrWhiteSpace(json)) return null; + + var result = JsonSerializer.Deserialize(json, new JsonSerializerOptions(JsonSerializerDefaults.Web)); + if (result is not null) result.Cached = true; + return result; + } + + public async Task SaveMatchAsync(string cvDocumentId, string jobDocumentId, JobMatchResponse response, CancellationToken ct) + { + var exists = await _db.CvMatchResults.AnyAsync( + x => x.CvDocumentId == cvDocumentId && x.JobDocumentId == jobDocumentId, + ct); + + if (exists) return; + + _db.CvMatchResults.Add(new CvMatchResultEntity + { + Id = Guid.NewGuid().ToString("N"), + CvDocumentId = cvDocumentId, + JobDocumentId = jobDocumentId, + ResultJson = JsonSerializer.Serialize(response, new JsonSerializerOptions(JsonSerializerDefaults.Web)), + Score = response.Score, + CreatedAt = DateTime.UtcNow + }); + + await _db.SaveChangesAsync(ct); + } + + public async Task GetChatCompletionAsync(string cacheKey, CancellationToken ct) + { + return await _db.CvMatcherChatCache + .AsNoTracking() + .Where(x => x.CacheKey == cacheKey) + .Select(x => x.ResponseText) + .FirstOrDefaultAsync(ct); + } + + public async Task SaveChatCompletionAsync(string cacheKey, string model, decimal temperature, string responseText, CancellationToken ct) + { + var exists = await _db.CvMatcherChatCache.AnyAsync(x => x.CacheKey == cacheKey, ct); + if (exists) return; + + _db.CvMatcherChatCache.Add(new CvMatcherChatCacheEntity + { + CacheKey = cacheKey, + Model = model, + Temperature = temperature, + ResponseText = responseText, + CreatedAt = DateTime.UtcNow + }); + + await _db.SaveChangesAsync(ct); + } +} diff --git a/cv-matcher-api/Services/SqlMatcherRepository.cs b/cv-matcher-api/Services/SqlMatcherRepository.cs deleted file mode 100644 index 6afba3c..0000000 --- a/cv-matcher-api/Services/SqlMatcherRepository.cs +++ /dev/null @@ -1,105 +0,0 @@ -using System.Text.Json; -using Api.Responses; -using Api.Services.Contracts; -using Microsoft.Data.SqlClient; - -namespace Api.Services; - -public sealed class SqlMatcherRepository : IMatcherRepository -{ - private readonly string _connectionString; - - public SqlMatcherRepository(IConfiguration configuration) - { - _connectionString = configuration.GetConnectionString("CvMatcherDb") - ?? throw new InvalidOperationException("Connection string 'CvMatcherDb' is missing."); - } - - public async Task InitializeAsync(CancellationToken ct) - { - await EnsureDatabaseExistsAsync(ct); - var sql = await File.ReadAllTextAsync(Path.Combine(AppContext.BaseDirectory, "Database", "schema.sql"), ct); - await using var connection = new SqlConnection(_connectionString); - await connection.OpenAsync(ct); - foreach (var commandText in sql.Split("GO", StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)) - { - await using var command = new SqlCommand(commandText, connection); - await command.ExecuteNonQueryAsync(ct); - } - } - - public async Task GetMatchAsync(string cvDocumentId, string jobDocumentId, CancellationToken ct) - { - const string sql = "SELECT ResultJson FROM CvMatchResults WHERE CvDocumentId = @CvDocumentId AND JobDocumentId = @JobDocumentId"; - await using var connection = new SqlConnection(_connectionString); - await connection.OpenAsync(ct); - await using var command = new SqlCommand(sql, connection); - command.Parameters.AddWithValue("@CvDocumentId", cvDocumentId); - command.Parameters.AddWithValue("@JobDocumentId", jobDocumentId); - var json = await command.ExecuteScalarAsync(ct) as string; - if (string.IsNullOrWhiteSpace(json)) return null; - var result = JsonSerializer.Deserialize(json, new JsonSerializerOptions(JsonSerializerDefaults.Web)); - if (result is not null) result.Cached = true; - return result; - } - - public async Task SaveMatchAsync(string cvDocumentId, string jobDocumentId, JobMatchResponse response, CancellationToken ct) - { - const string sql = """ - IF NOT EXISTS (SELECT 1 FROM CvMatchResults WHERE CvDocumentId = @CvDocumentId AND JobDocumentId = @JobDocumentId) - INSERT INTO CvMatchResults (Id, CvDocumentId, JobDocumentId, ResultJson, Score, CreatedAt) - VALUES (@Id, @CvDocumentId, @JobDocumentId, @ResultJson, @Score, SYSUTCDATETIME()) - """; - await using var connection = new SqlConnection(_connectionString); - await connection.OpenAsync(ct); - await using var command = new SqlCommand(sql, connection); - command.Parameters.AddWithValue("@Id", Guid.NewGuid().ToString("N")); - command.Parameters.AddWithValue("@CvDocumentId", cvDocumentId); - command.Parameters.AddWithValue("@JobDocumentId", jobDocumentId); - command.Parameters.AddWithValue("@ResultJson", JsonSerializer.Serialize(response, new JsonSerializerOptions(JsonSerializerDefaults.Web))); - command.Parameters.AddWithValue("@Score", response.Score); - await command.ExecuteNonQueryAsync(ct); - } - - public async Task GetChatCompletionAsync(string cacheKey, CancellationToken ct) - { - const string sql = "SELECT ResponseText FROM CvMatcherChatCache WHERE CacheKey = @CacheKey"; - await using var connection = new SqlConnection(_connectionString); - await connection.OpenAsync(ct); - await using var command = new SqlCommand(sql, connection); - command.Parameters.AddWithValue("@CacheKey", cacheKey); - return await command.ExecuteScalarAsync(ct) as string; - } - - public async Task SaveChatCompletionAsync(string cacheKey, string model, decimal temperature, string responseText, CancellationToken ct) - { - const string sql = """ - IF NOT EXISTS (SELECT 1 FROM CvMatcherChatCache WHERE CacheKey = @CacheKey) - INSERT INTO CvMatcherChatCache (CacheKey, Model, Temperature, ResponseText, CreatedAt) - VALUES (@CacheKey, @Model, @Temperature, @ResponseText, SYSUTCDATETIME()) - """; - await using var connection = new SqlConnection(_connectionString); - await connection.OpenAsync(ct); - await using var command = new SqlCommand(sql, connection); - command.Parameters.AddWithValue("@CacheKey", cacheKey); - command.Parameters.AddWithValue("@Model", model); - command.Parameters.AddWithValue("@Temperature", temperature); - command.Parameters.AddWithValue("@ResponseText", responseText); - await command.ExecuteNonQueryAsync(ct); - } - private async Task EnsureDatabaseExistsAsync(CancellationToken ct) - { - var builder = new SqlConnectionStringBuilder(_connectionString); - var databaseName = builder.InitialCatalog; - if (string.IsNullOrWhiteSpace(databaseName)) return; - - builder.InitialCatalog = "master"; - await using var connection = new SqlConnection(builder.ConnectionString); - await connection.OpenAsync(ct); - var safeName = databaseName.Replace("]", "]]" ); - await using var command = new SqlCommand($"IF DB_ID(@DatabaseName) IS NULL EXEC('CREATE DATABASE [{safeName}]')", connection); - command.Parameters.AddWithValue("@DatabaseName", databaseName); - await command.ExecuteNonQueryAsync(ct); - } - -} \ No newline at end of file diff --git a/cv-matcher-api/cv-matcher-api.csproj b/cv-matcher-api/cv-matcher-api.csproj index 09be60c..1ebc13d 100644 --- a/cv-matcher-api/cv-matcher-api.csproj +++ b/cv-matcher-api/cv-matcher-api.csproj @@ -11,8 +11,12 @@ + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + - diff --git a/rag-api/Data/Entities/RagChatCompletionCacheEntity.cs b/rag-api/Data/Entities/RagChatCompletionCacheEntity.cs new file mode 100644 index 0000000..05940b9 --- /dev/null +++ b/rag-api/Data/Entities/RagChatCompletionCacheEntity.cs @@ -0,0 +1,10 @@ +namespace Api.Data.Entities; + +public sealed class RagChatCompletionCacheEntity +{ + public string CacheKey { get; set; } = string.Empty; + public string Model { get; set; } = string.Empty; + public decimal Temperature { get; set; } + public string ResponseText { get; set; } = string.Empty; + public DateTime CreatedAt { get; set; } = DateTime.UtcNow; +} diff --git a/rag-api/Data/Entities/RagChunkEntity.cs b/rag-api/Data/Entities/RagChunkEntity.cs new file mode 100644 index 0000000..b57467c --- /dev/null +++ b/rag-api/Data/Entities/RagChunkEntity.cs @@ -0,0 +1,12 @@ +namespace Api.Data.Entities; + +public sealed class RagChunkEntity +{ + public string Id { get; set; } = string.Empty; + public string DocumentId { get; set; } = string.Empty; + public int ChunkIndex { get; set; } + public string Text { get; set; } = string.Empty; + public byte[] Embedding { get; set; } = []; + + public RagDocumentEntity? Document { get; set; } +} diff --git a/rag-api/Data/Entities/RagDocumentEntity.cs b/rag-api/Data/Entities/RagDocumentEntity.cs new file mode 100644 index 0000000..739af12 --- /dev/null +++ b/rag-api/Data/Entities/RagDocumentEntity.cs @@ -0,0 +1,16 @@ +namespace Api.Data.Entities; + +public sealed class RagDocumentEntity +{ + public string Id { get; set; } = string.Empty; + public string DocumentType { get; set; } = string.Empty; + public string Title { get; set; } = string.Empty; + public string? SourceUrl { get; set; } + public string RawText { get; set; } = string.Empty; + public string TextHash { get; set; } = string.Empty; + public double TypeConfidence { get; set; } + public string MetadataJson { get; set; } = "{}"; + public DateTime CreatedAt { get; set; } = DateTime.UtcNow; + + public ICollection Chunks { get; set; } = []; +} diff --git a/rag-api/Data/Entities/RagEmbeddingCacheEntity.cs b/rag-api/Data/Entities/RagEmbeddingCacheEntity.cs new file mode 100644 index 0000000..63f8132 --- /dev/null +++ b/rag-api/Data/Entities/RagEmbeddingCacheEntity.cs @@ -0,0 +1,10 @@ +namespace Api.Data.Entities; + +public sealed class RagEmbeddingCacheEntity +{ + public string CacheKey { get; set; } = string.Empty; + public string Model { get; set; } = string.Empty; + public string TextHash { get; set; } = string.Empty; + public byte[] Vector { get; set; } = []; + public DateTime CreatedAt { get; set; } = DateTime.UtcNow; +} diff --git a/rag-api/Data/RagDbContext.cs b/rag-api/Data/RagDbContext.cs new file mode 100644 index 0000000..11a1642 --- /dev/null +++ b/rag-api/Data/RagDbContext.cs @@ -0,0 +1,72 @@ +using Api.Data.Entities; +using Microsoft.EntityFrameworkCore; + +namespace Api.Data; + +public sealed class RagDbContext : DbContext +{ + public RagDbContext(DbContextOptions options) : base(options) + { + } + + public DbSet RagDocuments => Set(); + public DbSet RagChunks => Set(); + public DbSet RagEmbeddingCache => Set(); + public DbSet RagChatCompletionCache => Set(); + + protected override void OnModelCreating(ModelBuilder modelBuilder) + { + modelBuilder.Entity(entity => + { + entity.ToTable("RagDocuments"); + entity.HasKey(x => x.Id); + entity.Property(x => x.Id).HasMaxLength(64); + entity.Property(x => x.DocumentType).HasMaxLength(80).IsRequired(); + entity.Property(x => x.Title).HasMaxLength(300).IsRequired(); + entity.Property(x => x.SourceUrl).HasMaxLength(1200); + entity.Property(x => x.RawText).IsRequired(); + entity.Property(x => x.TextHash).HasMaxLength(64).IsRequired(); + entity.Property(x => x.MetadataJson).HasDefaultValue("{}").IsRequired(); + entity.Property(x => x.CreatedAt).HasDefaultValueSql("SYSUTCDATETIME()"); + entity.HasIndex(x => x.TextHash); + entity.HasIndex(x => x.DocumentType); + }); + + modelBuilder.Entity(entity => + { + entity.ToTable("RagChunks"); + entity.HasKey(x => x.Id); + entity.Property(x => x.Id).HasMaxLength(64); + entity.Property(x => x.DocumentId).HasMaxLength(64).IsRequired(); + entity.Property(x => x.Text).IsRequired(); + entity.Property(x => x.Embedding).IsRequired(); + entity.HasOne(x => x.Document) + .WithMany(x => x.Chunks) + .HasForeignKey(x => x.DocumentId) + .OnDelete(DeleteBehavior.Cascade); + }); + + modelBuilder.Entity(entity => + { + entity.ToTable("RagEmbeddingCache"); + entity.HasKey(x => x.CacheKey); + entity.Property(x => x.CacheKey).HasMaxLength(64); + entity.Property(x => x.Model).HasMaxLength(120).IsRequired(); + entity.Property(x => x.TextHash).HasMaxLength(64).IsRequired(); + entity.Property(x => x.Vector).IsRequired(); + entity.Property(x => x.CreatedAt).HasDefaultValueSql("SYSUTCDATETIME()"); + entity.HasIndex(x => x.TextHash); + }); + + modelBuilder.Entity(entity => + { + entity.ToTable("RagChatCompletionCache"); + entity.HasKey(x => x.CacheKey); + entity.Property(x => x.CacheKey).HasMaxLength(64); + entity.Property(x => x.Model).HasMaxLength(120).IsRequired(); + entity.Property(x => x.Temperature).HasColumnType("decimal(4,2)"); + entity.Property(x => x.ResponseText).IsRequired(); + entity.Property(x => x.CreatedAt).HasDefaultValueSql("SYSUTCDATETIME()"); + }); + } +} diff --git a/rag-api/Program.cs b/rag-api/Program.cs index dd5a067..5613c81 100644 --- a/rag-api/Program.cs +++ b/rag-api/Program.cs @@ -1,10 +1,12 @@ using Azure.Identity; using Microsoft.AspNetCore.Diagnostics; +using Api.Data; using Api.Services; using Api.Services.Contracts; using Api.Settings; using Serilog; using System.Reflection; +using Microsoft.EntityFrameworkCore; DotNetEnv.Env.Load(); @@ -64,8 +66,12 @@ try builder.Services.Configure(builder.Configuration.GetSection("Ai")); builder.Services.Configure(builder.Configuration.GetSection("InternalApi")); + builder.Services.AddDbContext(options => + options.UseSqlServer(builder.Configuration.GetConnectionString("RagDb") + ?? throw new InvalidOperationException("Connection string 'RagDb' is missing."))); + builder.Services.AddHttpClient(); - builder.Services.AddSingleton(); + builder.Services.AddScoped(); builder.Services.AddScoped(); builder.Services.AddSingleton(); builder.Services.AddSingleton(); @@ -159,6 +165,13 @@ try app.MapControllers(); app.MapGet("/health", () => Results.Ok(new { status = "ok", service = "rag-api", version = appVersion, timeUtc = DateTimeOffset.UtcNow })); + Log.Information("Running EfCore DbMigrations if any"); + using (var scope = app.Services.CreateScope()) + { + var db = scope.ServiceProvider.GetRequiredService(); + db.Database.Migrate(); + } + Log.Information("{Service} startup complete", "rag-api"); app.Run(); } diff --git a/rag-api/Services/EfRagRepository.cs b/rag-api/Services/EfRagRepository.cs new file mode 100644 index 0000000..96365c2 --- /dev/null +++ b/rag-api/Services/EfRagRepository.cs @@ -0,0 +1,195 @@ +using Api.Data; +using Api.Data.Entities; +using Api.Services.Contracts; +using Api.Services.Contracts.Models; +using Microsoft.EntityFrameworkCore; + +namespace Api.Services; + +public sealed class EfRagRepository : IRagRepository +{ + private readonly RagDbContext _db; + private readonly ILogger _logger; + + public EfRagRepository(RagDbContext db, ILogger logger) + { + _db = db; + _logger = logger; + } + + public async Task InitializeAsync(CancellationToken ct) + { + _logger.LogInformation("Ensuring RAG database schema exists using EF Core"); + await _db.Database.EnsureCreatedAsync(ct); + } + + public async Task GetDocumentByTextHashAsync(string textHash, string? sourceUrl, CancellationToken ct) + { + var query = _db.RagDocuments + .AsNoTracking() + .Where(x => x.TextHash == textHash); + + if (!string.IsNullOrWhiteSpace(sourceUrl)) + { + query = query.Where(x => x.SourceUrl == sourceUrl); + } + + var entity = await query + .OrderByDescending(x => x.CreatedAt) + .FirstOrDefaultAsync(ct); + + return entity is null ? null : ToRecord(entity); + } + + public async Task GetDocumentByIdAsync(string id, CancellationToken ct) + { + var entity = await _db.RagDocuments + .AsNoTracking() + .FirstOrDefaultAsync(x => x.Id == id, ct); + + return entity is null ? null : ToRecord(entity); + } + + public async Task SaveDocumentAsync(RagDocumentRecord document, IReadOnlyList chunks, CancellationToken ct) + { + var exists = await _db.RagDocuments.AnyAsync(x => x.Id == document.Id, ct); + if (exists) + { + _logger.LogInformation("RAG document already exists. DocumentId={DocumentId}", document.Id); + return; + } + + var entity = new RagDocumentEntity + { + Id = document.Id, + DocumentType = document.DocumentType, + Title = document.Title, + SourceUrl = document.SourceUrl, + RawText = document.Text, + TextHash = document.TextHash, + TypeConfidence = document.TypeConfidence, + MetadataJson = document.MetadataJson, + CreatedAt = document.CreatedAt.UtcDateTime, + Chunks = chunks.Select(chunk => new RagChunkEntity + { + Id = chunk.Id, + DocumentId = chunk.DocumentId, + ChunkIndex = chunk.ChunkIndex, + Text = chunk.Text, + Embedding = VectorSerializer.ToBytes(chunk.Embedding) + }).ToList() + }; + + _db.RagDocuments.Add(entity); + await _db.SaveChangesAsync(ct); + } + + public async Task> SearchChunksAsync( + float[] queryEmbedding, + IReadOnlyList? targetTypes, + int topK, + CancellationToken ct) + { + var types = targetTypes? + .Where(x => !string.IsNullOrWhiteSpace(x)) + .Select(x => x.Trim().ToLowerInvariant()) + .Distinct() + .ToArray() ?? []; + + var query = _db.RagChunks + .AsNoTracking() + .Include(x => x.Document) + .AsQueryable(); + + if (types.Length > 0) + { + query = query.Where(x => x.Document != null && types.Contains(x.Document.DocumentType.ToLower())); + } + + var rows = await query.ToListAsync(ct); + + return rows + .Where(x => x.Document is not null) + .Select(x => new SearchCandidateChunk + { + Document = ToRecord(x.Document!), + Chunk = new RagChunkRecord + { + Id = x.Id, + DocumentId = x.DocumentId, + ChunkIndex = x.ChunkIndex, + Text = x.Text, + Embedding = VectorSerializer.FromBytes(x.Embedding) + }, + Score = VectorSerializer.CosineSimilarity(queryEmbedding, VectorSerializer.FromBytes(x.Embedding)) + }) + .OrderByDescending(x => x.Score) + .Take(Math.Max(topK * 4, topK)) + .ToList(); + } + + public async Task GetEmbeddingAsync(string cacheKey, CancellationToken ct) + { + var entry = await _db.RagEmbeddingCache + .AsNoTracking() + .FirstOrDefaultAsync(x => x.CacheKey == cacheKey, ct); + + return entry is null ? null : VectorSerializer.FromBytes(entry.Vector); + } + + public async Task SaveEmbeddingAsync(string cacheKey, string model, string textHash, float[] vector, CancellationToken ct) + { + var exists = await _db.RagEmbeddingCache.AnyAsync(x => x.CacheKey == cacheKey, ct); + if (exists) return; + + _db.RagEmbeddingCache.Add(new RagEmbeddingCacheEntity + { + CacheKey = cacheKey, + Model = model, + TextHash = textHash, + Vector = VectorSerializer.ToBytes(vector), + CreatedAt = DateTime.UtcNow + }); + + await _db.SaveChangesAsync(ct); + } + + public async Task GetChatCompletionAsync(string cacheKey, CancellationToken ct) + { + return await _db.RagChatCompletionCache + .AsNoTracking() + .Where(x => x.CacheKey == cacheKey) + .Select(x => x.ResponseText) + .FirstOrDefaultAsync(ct); + } + + public async Task SaveChatCompletionAsync(string cacheKey, string model, decimal temperature, string responseText, CancellationToken ct) + { + var exists = await _db.RagChatCompletionCache.AnyAsync(x => x.CacheKey == cacheKey, ct); + if (exists) return; + + _db.RagChatCompletionCache.Add(new RagChatCompletionCacheEntity + { + CacheKey = cacheKey, + Model = model, + Temperature = temperature, + ResponseText = responseText, + CreatedAt = DateTime.UtcNow + }); + + await _db.SaveChangesAsync(ct); + } + + private static RagDocumentRecord ToRecord(RagDocumentEntity entity) => new() + { + Id = entity.Id, + DocumentType = entity.DocumentType, + Title = entity.Title, + SourceUrl = entity.SourceUrl, + Text = entity.RawText, + TextHash = entity.TextHash, + TypeConfidence = entity.TypeConfidence, + MetadataJson = entity.MetadataJson, + CreatedAt = new DateTimeOffset(DateTime.SpecifyKind(entity.CreatedAt, DateTimeKind.Utc)) + }; +} diff --git a/rag-api/Services/SqlRagRepository.cs b/rag-api/Services/SqlRagRepository.cs deleted file mode 100644 index 730ca1d..0000000 --- a/rag-api/Services/SqlRagRepository.cs +++ /dev/null @@ -1,238 +0,0 @@ -using Microsoft.Data.SqlClient; -using Api.Services.Contracts; -using Api.Services.Contracts.Models; - -namespace Api.Services; - -public sealed class SqlRagRepository : IRagRepository -{ - private readonly string _connectionString; - - public SqlRagRepository(IConfiguration configuration) - { - _connectionString = configuration.GetConnectionString("RagDb") - ?? throw new InvalidOperationException("Connection string 'RagDb' is missing."); - } - - public async Task InitializeAsync(CancellationToken ct) - { - await EnsureDatabaseExistsAsync(ct); - var sql = await File.ReadAllTextAsync(Path.Combine(AppContext.BaseDirectory, "Database", "schema.sql"), ct); - await using var connection = new SqlConnection(_connectionString); - await connection.OpenAsync(ct); - foreach (var commandText in sql.Split("GO", StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)) - { - await using var command = new SqlCommand(commandText, connection); - await command.ExecuteNonQueryAsync(ct); - } - } - - public async Task GetDocumentByTextHashAsync(string textHash, string? sourceUrl, CancellationToken ct) - { - const string sql = """ - SELECT TOP 1 Id, DocumentType, Title, SourceUrl, RawText, TextHash, TypeConfidence, MetadataJson, CreatedAt - FROM RagDocuments - WHERE TextHash = @TextHash AND (@SourceUrl IS NULL OR SourceUrl = @SourceUrl) - ORDER BY CreatedAt DESC - """; - await using var connection = new SqlConnection(_connectionString); - await connection.OpenAsync(ct); - await using var command = new SqlCommand(sql, connection); - command.Parameters.AddWithValue("@TextHash", textHash); - command.Parameters.AddWithValue("@SourceUrl", (object?)sourceUrl ?? DBNull.Value); - await using var reader = await command.ExecuteReaderAsync(ct); - return await reader.ReadAsync(ct) ? ReadDocument(reader) : null; - } - - public async Task GetDocumentByIdAsync(string id, CancellationToken ct) - { - const string sql = """ - SELECT Id, DocumentType, Title, SourceUrl, RawText, TextHash, TypeConfidence, MetadataJson, CreatedAt - FROM RagDocuments - WHERE Id = @Id - """; - await using var connection = new SqlConnection(_connectionString); - await connection.OpenAsync(ct); - await using var command = new SqlCommand(sql, connection); - command.Parameters.AddWithValue("@Id", id); - await using var reader = await command.ExecuteReaderAsync(ct); - return await reader.ReadAsync(ct) ? ReadDocument(reader) : null; - } - - public async Task SaveDocumentAsync(RagDocumentRecord document, IReadOnlyList chunks, CancellationToken ct) - { - await using var connection = new SqlConnection(_connectionString); - await connection.OpenAsync(ct); - await using var tx = (SqlTransaction)await connection.BeginTransactionAsync(ct); - try - { - const string insertDoc = """ - INSERT INTO RagDocuments (Id, DocumentType, Title, SourceUrl, RawText, TextHash, TypeConfidence, MetadataJson, CreatedAt) - VALUES (@Id, @DocumentType, @Title, @SourceUrl, @RawText, @TextHash, @TypeConfidence, @MetadataJson, @CreatedAt) - """; - await using (var command = new SqlCommand(insertDoc, connection, tx)) - { - command.Parameters.AddWithValue("@Id", document.Id); - command.Parameters.AddWithValue("@DocumentType", document.DocumentType); - command.Parameters.AddWithValue("@Title", document.Title); - command.Parameters.AddWithValue("@SourceUrl", (object?)document.SourceUrl ?? DBNull.Value); - command.Parameters.AddWithValue("@RawText", document.Text); - command.Parameters.AddWithValue("@TextHash", document.TextHash); - command.Parameters.AddWithValue("@TypeConfidence", document.TypeConfidence); - command.Parameters.AddWithValue("@MetadataJson", document.MetadataJson); - command.Parameters.AddWithValue("@CreatedAt", document.CreatedAt.UtcDateTime); - await command.ExecuteNonQueryAsync(ct); - } - - const string insertChunk = """ - INSERT INTO RagChunks (Id, DocumentId, ChunkIndex, Text, Embedding) - VALUES (@Id, @DocumentId, @ChunkIndex, @Text, @Embedding) - """; - foreach (var chunk in chunks) - { - await using var command = new SqlCommand(insertChunk, connection, tx); - command.Parameters.AddWithValue("@Id", chunk.Id); - command.Parameters.AddWithValue("@DocumentId", document.Id); - command.Parameters.AddWithValue("@ChunkIndex", chunk.ChunkIndex); - command.Parameters.AddWithValue("@Text", chunk.Text); - command.Parameters.AddWithValue("@Embedding", VectorSerializer.ToBytes(chunk.Embedding)); - await command.ExecuteNonQueryAsync(ct); - } - await tx.CommitAsync(ct); - } - catch - { - await tx.RollbackAsync(ct); - throw; - } - } - - public async Task> SearchChunksAsync(float[] queryEmbedding, IReadOnlyList? targetTypes, int topK, CancellationToken ct) - { - var types = targetTypes?.Where(x => !string.IsNullOrWhiteSpace(x)).Select(x => x.Trim().ToLowerInvariant()).Distinct().ToArray() ?? []; - var sql = """ - SELECT d.Id, d.DocumentType, d.Title, d.SourceUrl, d.RawText, d.TextHash, d.TypeConfidence, d.MetadataJson, d.CreatedAt, - c.Id, c.DocumentId, c.ChunkIndex, c.Text, c.Embedding - FROM RagChunks c - INNER JOIN RagDocuments d ON d.Id = c.DocumentId - """; - - if (types.Length > 0) - { - sql += " WHERE LOWER(d.DocumentType) IN (" + string.Join(',', types.Select((_, i) => $"@Type{i}")) + ")"; - } - - await using var connection = new SqlConnection(_connectionString); - await connection.OpenAsync(ct); - await using var command = new SqlCommand(sql, connection); - for (var i = 0; i < types.Length; i++) command.Parameters.AddWithValue($"@Type{i}", types[i]); - await using var reader = await command.ExecuteReaderAsync(ct); - var candidates = new List(); - while (await reader.ReadAsync(ct)) - { - var doc = ReadDocument(reader, 0); - var chunk = new RagChunkRecord - { - Id = reader.GetString(9), - DocumentId = reader.GetString(10), - ChunkIndex = reader.GetInt32(11), - Text = reader.GetString(12), - Embedding = VectorSerializer.FromBytes((byte[])reader[13]) - }; - candidates.Add(new SearchCandidateChunk - { - Document = doc, - Chunk = chunk, - Score = VectorSerializer.CosineSimilarity(queryEmbedding, chunk.Embedding) - }); - } - - return candidates - .OrderByDescending(x => x.Score) - .Take(Math.Max(topK * 4, topK)) - .ToList(); - } - - public async Task GetEmbeddingAsync(string cacheKey, CancellationToken ct) - { - const string sql = "SELECT Vector FROM RagEmbeddingCache WHERE CacheKey = @CacheKey"; - await using var connection = new SqlConnection(_connectionString); - await connection.OpenAsync(ct); - await using var command = new SqlCommand(sql, connection); - command.Parameters.AddWithValue("@CacheKey", cacheKey); - var value = await command.ExecuteScalarAsync(ct); - return value is byte[] bytes ? VectorSerializer.FromBytes(bytes) : null; - } - - public async Task SaveEmbeddingAsync(string cacheKey, string model, string textHash, float[] vector, CancellationToken ct) - { - const string sql = """ - IF NOT EXISTS (SELECT 1 FROM RagEmbeddingCache WHERE CacheKey = @CacheKey) - INSERT INTO RagEmbeddingCache (CacheKey, Model, TextHash, Vector, CreatedAt) - VALUES (@CacheKey, @Model, @TextHash, @Vector, SYSUTCDATETIME()) - """; - await using var connection = new SqlConnection(_connectionString); - await connection.OpenAsync(ct); - await using var command = new SqlCommand(sql, connection); - command.Parameters.AddWithValue("@CacheKey", cacheKey); - command.Parameters.AddWithValue("@Model", model); - command.Parameters.AddWithValue("@TextHash", textHash); - command.Parameters.AddWithValue("@Vector", VectorSerializer.ToBytes(vector)); - await command.ExecuteNonQueryAsync(ct); - } - - public async Task GetChatCompletionAsync(string cacheKey, CancellationToken ct) - { - const string sql = "SELECT ResponseText FROM RagChatCompletionCache WHERE CacheKey = @CacheKey"; - await using var connection = new SqlConnection(_connectionString); - await connection.OpenAsync(ct); - await using var command = new SqlCommand(sql, connection); - command.Parameters.AddWithValue("@CacheKey", cacheKey); - return await command.ExecuteScalarAsync(ct) as string; - } - - public async Task SaveChatCompletionAsync(string cacheKey, string model, decimal temperature, string responseText, CancellationToken ct) - { - const string sql = """ - IF NOT EXISTS (SELECT 1 FROM RagChatCompletionCache WHERE CacheKey = @CacheKey) - INSERT INTO RagChatCompletionCache (CacheKey, Model, Temperature, ResponseText, CreatedAt) - VALUES (@CacheKey, @Model, @Temperature, @ResponseText, SYSUTCDATETIME()) - """; - await using var connection = new SqlConnection(_connectionString); - await connection.OpenAsync(ct); - await using var command = new SqlCommand(sql, connection); - command.Parameters.AddWithValue("@CacheKey", cacheKey); - command.Parameters.AddWithValue("@Model", model); - command.Parameters.AddWithValue("@Temperature", temperature); - command.Parameters.AddWithValue("@ResponseText", responseText); - await command.ExecuteNonQueryAsync(ct); - } - - private static RagDocumentRecord ReadDocument(SqlDataReader reader, int offset = 0) => new() - { - Id = reader.GetString(offset), - DocumentType = reader.GetString(offset + 1), - Title = reader.GetString(offset + 2), - SourceUrl = reader.IsDBNull(offset + 3) ? null : reader.GetString(offset + 3), - Text = reader.GetString(offset + 4), - TextHash = reader.GetString(offset + 5), - TypeConfidence = Convert.ToDouble(reader.GetValue(offset + 6)), - MetadataJson = reader.GetString(offset + 7), - CreatedAt = new DateTimeOffset(reader.GetDateTime(offset + 8), TimeSpan.Zero) - }; - private async Task EnsureDatabaseExistsAsync(CancellationToken ct) - { - var builder = new SqlConnectionStringBuilder(_connectionString); - var databaseName = builder.InitialCatalog; - if (string.IsNullOrWhiteSpace(databaseName)) return; - - builder.InitialCatalog = "master"; - await using var connection = new SqlConnection(builder.ConnectionString); - await connection.OpenAsync(ct); - var safeName = databaseName.Replace("]", "]]" ); - await using var command = new SqlCommand($"IF DB_ID(@DatabaseName) IS NULL EXEC('CREATE DATABASE [{safeName}]')", connection); - command.Parameters.AddWithValue("@DatabaseName", databaseName); - await command.ExecuteNonQueryAsync(ct); - } - -} \ No newline at end of file diff --git a/rag-api/rag-api.csproj b/rag-api/rag-api.csproj index d76b094..7485e85 100644 --- a/rag-api/rag-api.csproj +++ b/rag-api/rag-api.csproj @@ -11,7 +11,11 @@ - + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive +