Files
myAi/Apis/rag-api/Services/RagService.cs
T
claude e95ed36647 refactor: restructure solution into -models/-data/-api project taxonomy
Phases 1-10 of the planned refactoring:

Phase 1: rename shared-models -> common
  - namespace Shared.Models -> Common throughout
  - remove stale AspNetCore.Http.Features 5.0 reference

Phase 2: create shared-data with abstract BaseEntity
  - BaseEntity: required string Id { get; init; } + DateTime CreatedAt { get; init; }

Phase 3: rename myai-models -> myai-data
  - namespace MyAi.Models -> MyAi.Data
  - MigrationsAssembly("myai-data")

Phase 4: rename cv-search-models -> cv-search-data
  - namespace CvSearch.Models -> CvSearch.Data
  - move JobSearchSettings to cv-matcher-api-models
  - JobSearch*Entity now inherits BaseEntity

Phase 5: extract rag-data from rag-api
  - new project: Apis/rag-data with RagDbContext + entities + migrations
  - RagDocumentEntity inherits BaseEntity; cache entities use CacheKey PK
  - fix duplicate AddHttpClient<RagAiClient>/AddScoped registrations in rag-api
  - MigrationsAssembly("rag-data")

Phase 6: extract cv-matcher-data from cv-matcher-api
  - new project: Apis/cv-matcher-data with CvMatcherDbContext + entities + migrations
  - CvMatchResultEntity inherits BaseEntity; CvMatcherChatCacheEntity uses CacheKey PK
  - MigrationsAssembly("cv-matcher-data")

Phase 7: create empty cv-cleanup-job-models and cv-search-job-models

Phase 8: update all 5 Dockerfiles for renamed/new projects

Phase 9: reorganise .sln virtual folders (Apis/Jobs/Models/Data/Helpers)
  - update root CLAUDE.md with new project taxonomy and migration commands
  - update cv-matcher-api/CLAUDE.md and cv-search-job/CLAUDE.md

Phase 10: add Directory.Packages.props for centralised NuGet versions
  - remove Version= from all PackageReference elements in active .csproj files

No database changes. No runtime behaviour changes.
All MigrationId strings in __EFMigrationsHistory are unaffected.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-27 15:26:03 +03:00

183 lines
7.3 KiB
C#

using System.Text.Json;
using Microsoft.Extensions.Options;
using Api.Services.Contracts;
using Rag.Models.Requests;
using Rag.Models.Responses;
using Rag.Models.Settings;
using Rag.Data.Repositories.Contracts;
using Api.Clients.Ai.Contracts;
using Rag.Models;
using CommonHelpers;
namespace Api.Services;
/// <summary>
/// Indexes documents (plain text or uploaded PDFs) through <see cref="IRagRepository"/> and
/// answers embedding-based search queries over the stored chunks.
/// </summary>
public sealed class RagService : IRagService
{
    /// <summary>Minimum number of characters a document must have to be indexed.</summary>
    private const int MinDocumentChars = 40;

    /// <summary>Minimum number of characters a normalized search query must have.</summary>
    private const int MinQueryChars = 10;

    /// <summary>Maximum number of matched chunks reported per document in a search result.</summary>
    private const int MaxMatchedChunksPerDocument = 3;

    private readonly ITextExtractor _textExtractor;
    private readonly ITextChunker _chunker;
    private readonly IDocumentClassifier _classifier;
    private readonly IAiClient _ai;
    private readonly IRagRepository _repository;
    private readonly RagSettings _settings;

    /// <summary>Creates the service from its collaborators and the bound <see cref="RagSettings"/>.</summary>
    /// <exception cref="ArgumentNullException">Any dependency is <c>null</c>.</exception>
    public RagService(
        ITextExtractor textExtractor,
        ITextChunker chunker,
        IDocumentClassifier classifier,
        IAiClient ai,
        IRagRepository repository,
        IOptions<RagSettings> options)
    {
        // Fail at construction time rather than with a NullReferenceException on first use.
        _textExtractor = textExtractor ?? throw new ArgumentNullException(nameof(textExtractor));
        _chunker = chunker ?? throw new ArgumentNullException(nameof(chunker));
        _classifier = classifier ?? throw new ArgumentNullException(nameof(classifier));
        _ai = ai ?? throw new ArgumentNullException(nameof(ai));
        _repository = repository ?? throw new ArgumentNullException(nameof(repository));
        _settings = (options ?? throw new ArgumentNullException(nameof(options))).Value;
    }

    /// <summary>
    /// Normalizes the request text, truncates it to <c>MaxTextChars</c> and indexes it.
    /// </summary>
    /// <param name="request">Text plus optional document type, title, source URL and metadata.</param>
    /// <param name="ct">Cancellation token flowed to all downstream calls.</param>
    /// <returns>The indexing result; <c>Cached</c> is true when a matching document was already stored.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
    /// <exception cref="InvalidOperationException">The normalized text is shorter than 40 characters.</exception>
    public async Task<IndexDocumentResponse> IndexTextAsync(IndexDocumentRequest request, CancellationToken ct)
    {
        if (request is null)
        {
            throw new ArgumentNullException(nameof(request));
        }

        var text = _textExtractor.Normalize(request.Text ?? string.Empty);
        if (text.Length < MinDocumentChars)
        {
            throw new InvalidOperationException("Document text is too short.");
        }

        text = Truncate(text, _settings.MaxTextChars);

        return await IndexNormalizedTextAsync(text, request.DocumentType, request.Title, request.SourceUrl, request.Metadata, ct);
    }

    /// <summary>
    /// Validates an uploaded PDF (non-empty, within the size limit, <c>.pdf</c> extension),
    /// extracts its text and indexes it.
    /// </summary>
    /// <param name="file">The uploaded file.</param>
    /// <param name="documentType">Optional document type passed to the classifier.</param>
    /// <param name="title">Optional title; the file name is used when it is <c>null</c>.</param>
    /// <param name="sourceUrl">Optional source URL stored with the document.</param>
    /// <param name="ct">Cancellation token flowed to all downstream calls.</param>
    /// <returns>The indexing result; <c>Cached</c> is true when a matching document was already stored.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="file"/> is <c>null</c>.</exception>
    /// <exception cref="InvalidOperationException">
    /// The file is empty, too large, not a PDF, or yields fewer than 40 characters of text.
    /// </exception>
    public async Task<IndexDocumentResponse> IndexPdfAsync(IFormFile file, string? documentType, string? title, string? sourceUrl, CancellationToken ct)
    {
        if (file is null)
        {
            throw new ArgumentNullException(nameof(file));
        }

        if (file.Length <= 0)
        {
            throw new InvalidOperationException("Uploaded file is empty.");
        }

        // 1024L forces 64-bit arithmetic so a large configured limit cannot overflow int.
        if (file.Length > _settings.MaxFileSizeMb * 1024L * 1024L)
        {
            throw new InvalidOperationException($"File is too large. Max size is {_settings.MaxFileSizeMb} MB.");
        }

        if (!string.Equals(Path.GetExtension(file.FileName), ".pdf", StringComparison.OrdinalIgnoreCase))
        {
            throw new InvalidOperationException("Only PDF files are supported by this endpoint.");
        }

        await using var stream = file.OpenReadStream();
        var text = await _textExtractor.ExtractPdfAsync(stream, ct);
        text = Truncate(text, _settings.MaxTextChars);
        if (text.Length < MinDocumentChars)
        {
            throw new InvalidOperationException("Could not extract enough text from the PDF.");
        }

        var metadata = new Dictionary<string, string> { ["fileName"] = file.FileName };
        return await IndexNormalizedTextAsync(text, documentType, title ?? file.FileName, sourceUrl, metadata, ct);
    }

    /// <summary>
    /// Embeds the normalized query, fetches candidate chunks from the repository and groups
    /// them into per-document results ordered by their best chunk score.
    /// </summary>
    /// <param name="request">Query text, optional <c>TopK</c> and optional target document types.</param>
    /// <param name="ct">Cancellation token flowed to all downstream calls.</param>
    /// <returns>At most <c>TopK</c> documents, each with up to three of its best-scoring chunks.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
    /// <exception cref="InvalidOperationException">The normalized query is shorter than 10 characters.</exception>
    public async Task<SearchResponse> SearchAsync(SearchRequest request, CancellationToken ct)
    {
        if (request is null)
        {
            throw new ArgumentNullException(nameof(request));
        }

        // Same null fallback as IndexTextAsync: a missing query becomes "too short"
        // instead of handing null to the normalizer.
        var query = _textExtractor.Normalize(request.QueryText ?? string.Empty);
        if (query.Length < MinQueryChars)
        {
            throw new InvalidOperationException("Search query is too short.");
        }

        // Math.Max keeps the clamp range valid even if MaxTopK is configured below 1.
        var topK = Math.Clamp(request.TopK ?? _settings.DefaultTopK, 1, Math.Max(1, _settings.MaxTopK));
        var queryEmbedding = await _ai.CreateEmbeddingAsync(query, ct);
        var candidates = await _repository.SearchChunksAsync(queryEmbedding, request.TargetDocumentTypes, topK, ct);

        var results = candidates
            .GroupBy(x => x.Document.Id)
            .Select(group =>
            {
                // Sort each document's chunks once; the first entry carries the top score.
                var ranked = group.OrderByDescending(x => x.Score).ToList();
                var best = ranked[0];
                return new SearchDocumentResult
                {
                    DocumentId = best.Document.Id,
                    DocumentType = best.Document.DocumentType,
                    Title = best.Document.Title,
                    SourceUrl = best.Document.SourceUrl,
                    Score = best.Score,
                    MatchedChunks = ranked
                        .Take(MaxMatchedChunksPerDocument)
                        .Select(x => new SearchChunkResult
                        {
                            ChunkId = x.Chunk.Id,
                            ChunkIndex = x.Chunk.ChunkIndex,
                            Text = x.Chunk.Text,
                            Score = x.Score
                        })
                        .ToList()
                };
            })
            .OrderByDescending(x => x.Score)
            .Take(topK)
            .ToList();

        return new SearchResponse { Results = results };
    }

    /// <summary>Loads a stored document by id and maps it to its details response.</summary>
    /// <param name="documentId">Identifier of the document to load.</param>
    /// <param name="ct">Cancellation token flowed to the repository.</param>
    /// <returns>The document details, or <c>null</c> when the repository returns no document.</returns>
    public async Task<RagDocumentDetailsResponse?> GetDocumentAsync(string documentId, CancellationToken ct)
    {
        var document = await _repository.GetDocumentByIdAsync(documentId, ct);
        if (document is null)
        {
            return null;
        }

        return new RagDocumentDetailsResponse
        {
            Id = document.Id,
            DocumentType = document.DocumentType,
            Title = document.Title,
            SourceUrl = document.SourceUrl,
            Text = document.Text,
            TextHash = document.TextHash,
            CreatedAt = document.CreatedAt
        };
    }

    /// <summary>
    /// Returns <paramref name="text"/> limited to <paramref name="maxChars"/> UTF-16 code units.
    /// When the cut would fall between the two halves of a surrogate pair, the cut moves one
    /// unit earlier so the result never ends in a lone high surrogate.
    /// </summary>
    private static string Truncate(string text, int maxChars)
    {
        if (text.Length <= maxChars)
        {
            return text;
        }

        var cut = maxChars;
        // text[cut] is in range here because text.Length > maxChars.
        if (cut > 0 && char.IsHighSurrogate(text[cut - 1]) && char.IsLowSurrogate(text[cut]))
        {
            cut--;
        }

        return text[..cut];
    }

    /// <summary>
    /// Indexes already-prepared text: returns the existing document when the repository finds
    /// one for the same text hash and source URL, otherwise classifies, chunks, embeds and saves it.
    /// </summary>
    /// <param name="text">Prepared document text.</param>
    /// <param name="documentType">Optional document type passed to the classifier.</param>
    /// <param name="title">Optional title passed to the classifier.</param>
    /// <param name="sourceUrl">Optional source URL used in the duplicate lookup and stored with the document.</param>
    /// <param name="metadata">Optional metadata; the classifier's metadata is stored when this is <c>null</c>.</param>
    /// <param name="ct">Cancellation token, checked before each chunk embedding.</param>
    /// <returns>The indexing result for the existing or newly saved document.</returns>
    private async Task<IndexDocumentResponse> IndexNormalizedTextAsync(
        string text,
        string? documentType,
        string? title,
        string? sourceUrl,
        Dictionary<string, string>? metadata,
        CancellationToken ct)
    {
        var textHash = HashHelper.Compute(text);
        var cached = await _repository.GetDocumentByTextHashAsync(textHash, sourceUrl, ct);
        if (cached is not null)
        {
            // Nothing is re-chunked or re-embedded for an already stored document,
            // so the response reports zero chunks.
            return new IndexDocumentResponse
            {
                DocumentId = cached.Id,
                TextHash = cached.TextHash,
                DocumentType = cached.DocumentType,
                DocumentTypeConfidence = cached.TypeConfidence,
                Title = cached.Title,
                Chunks = 0,
                Characters = cached.Text.Length,
                Cached = true
            };
        }

        var classification = await _classifier.ClassifyAsync(text, documentType, title, ct);
        var chunks = _chunker.Chunk(text, _settings.ChunkSize, _settings.ChunkOverlap);

        var document = new RagDocumentRecord
        {
            Id = Guid.NewGuid().ToString("N"),
            DocumentType = classification.DocumentType,
            Title = classification.Title,
            SourceUrl = sourceUrl,
            Text = text,
            TextHash = textHash,
            TypeConfidence = classification.Confidence,
            MetadataJson = JsonSerializer.Serialize(metadata ?? classification.Metadata),
            CreatedAt = DateTimeOffset.UtcNow
        };

        // One embedding request per chunk, issued sequentially in chunk order.
        var records = new List<RagChunkRecord>(chunks.Count);
        for (var i = 0; i < chunks.Count; i++)
        {
            ct.ThrowIfCancellationRequested();
            records.Add(new RagChunkRecord
            {
                Id = Guid.NewGuid().ToString("N"),
                DocumentId = document.Id,
                ChunkIndex = i,
                Text = chunks[i],
                Embedding = await _ai.CreateEmbeddingAsync(chunks[i], ct)
            });
        }

        await _repository.SaveDocumentAsync(document, records, ct);

        return new IndexDocumentResponse
        {
            DocumentId = document.Id,
            TextHash = document.TextHash,
            DocumentType = document.DocumentType,
            DocumentTypeConfidence = document.TypeConfidence,
            Title = document.Title,
            Chunks = records.Count,
            Characters = text.Length,
            Cached = false
        };
    }
}