using System.Text.Json;
using Microsoft.Extensions.Options;
using Api.Services.Contracts;
using Rag.Models.Requests;
using Rag.Models.Responses;
using Rag.Models.Settings;
using Api.Data.Repositories.Contracts;
using Api.Clients.Ai.Contracts;
using Api.Clients.Ai;
using Rag.Models;

namespace Api.Services;

/// <summary>
/// Indexes documents (plain text or uploaded PDF files) as embedded chunks and
/// runs embedding-based searches over the stored chunks.
/// </summary>
public sealed class RagService : IRagService
{
    /// <summary>Minimum number of characters a document must have to be indexed.</summary>
    private const int MinDocumentChars = 40;

    /// <summary>Minimum number of characters a normalized search query must have.</summary>
    private const int MinQueryChars = 10;

    /// <summary>Maximum number of matched chunks reported per document in search results.</summary>
    private const int MaxMatchedChunksPerDocument = 3;

    /// <summary>The only file extension accepted by <see cref="IndexPdfAsync"/>.</summary>
    private const string PdfExtension = ".pdf";

    private readonly ITextExtractor _textExtractor;
    private readonly ITextChunker _chunker;
    private readonly IDocumentClassifier _classifier;
    private readonly IAiClient _ai;
    private readonly IRagRepository _repository;
    private readonly RagSettings _settings;

    /// <summary>
    /// Creates the service from its collaborators and captures the current
    /// <see cref="RagSettings"/> value from <paramref name="options"/>.
    /// </summary>
    public RagService(
        ITextExtractor textExtractor,
        ITextChunker chunker,
        IDocumentClassifier classifier,
        IAiClient ai,
        IRagRepository repository,
        IOptions<RagSettings> options)
    {
        _textExtractor = textExtractor;
        _chunker = chunker;
        _classifier = classifier;
        _ai = ai;
        _repository = repository;
        _settings = options.Value;
    }

    /// <summary>
    /// Normalizes the text carried by <paramref name="request"/>, truncates it to the
    /// configured maximum length and indexes it.
    /// </summary>
    /// <param name="request">The document text plus optional type, title, source URL and metadata.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>A description of the indexed (or previously indexed) document.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
    /// <exception cref="InvalidOperationException">The normalized text is shorter than 40 characters.</exception>
    public async Task<IndexDocumentResponse> IndexTextAsync(IndexDocumentRequest request, CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(request);

        // A missing text is treated as empty so that it fails the length check below.
        var text = _textExtractor.Normalize(request.Text ?? string.Empty);
        if (text.Length < MinDocumentChars)
        {
            throw new InvalidOperationException("Document text is too short.");
        }

        text = TruncateToLimit(text, _settings.MaxTextChars);

        return await IndexNormalizedTextAsync(
            text,
            request.DocumentType,
            request.Title,
            request.SourceUrl,
            request.Metadata,
            ct);
    }

    /// <summary>
    /// Validates an uploaded PDF, extracts its text, truncates it to the configured
    /// maximum length and indexes it. The file name is stored as metadata and is used
    /// as the title when <paramref name="title"/> is null.
    /// </summary>
    /// <param name="file">The uploaded file; must be non-empty, within the size limit and end in <c>.pdf</c>.</param>
    /// <param name="documentType">Optional document type passed on to classification.</param>
    /// <param name="title">Optional title; falls back to the uploaded file name.</param>
    /// <param name="sourceUrl">Optional source URL stored with the document.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>A description of the indexed (or previously indexed) document.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="file"/> is null.</exception>
    /// <exception cref="InvalidOperationException">
    /// The file is empty, too large, not a <c>.pdf</c>, or yields fewer than 40 characters of text.
    /// </exception>
    public async Task<IndexDocumentResponse> IndexPdfAsync(
        IFormFile file,
        string? documentType,
        string? title,
        string? sourceUrl,
        CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(file);

        if (file.Length <= 0)
        {
            throw new InvalidOperationException("Uploaded file is empty.");
        }

        // The long literals force 64-bit arithmetic so large limits cannot overflow.
        var maxBytes = _settings.MaxFileSizeMb * 1024L * 1024L;
        if (file.Length > maxBytes)
        {
            throw new InvalidOperationException($"File is too large. Max size is {_settings.MaxFileSizeMb} MB.");
        }

        var extension = Path.GetExtension(file.FileName);
        if (!string.Equals(extension, PdfExtension, StringComparison.OrdinalIgnoreCase))
        {
            throw new InvalidOperationException("Only PDF files are supported by this endpoint.");
        }

        await using var stream = file.OpenReadStream();
        var text = await _textExtractor.ExtractPdfAsync(stream, ct);
        text = TruncateToLimit(text, _settings.MaxTextChars);

        if (text.Length < MinDocumentChars)
        {
            throw new InvalidOperationException("Could not extract enough text from the PDF.");
        }

        // NOTE(review): the metadata value type is assumed to be string — confirm against
        // IndexDocumentRequest.Metadata and the classifier's Metadata property.
        var metadata = new Dictionary<string, string> { ["fileName"] = file.FileName };

        return await IndexNormalizedTextAsync(text, documentType, title ?? file.FileName, sourceUrl, metadata, ct);
    }

    /// <summary>
    /// Embeds the normalized query, fetches matching chunks from the repository and
    /// groups them into one result per document, ordered by best chunk score.
    /// </summary>
    /// <param name="request">Query text, optional document-type filter and optional result count.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>Up to <c>topK</c> documents, each with at most three of its best-scoring chunks.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
    /// <exception cref="InvalidOperationException">The normalized query is shorter than 10 characters.</exception>
    public async Task<SearchResponse> SearchAsync(SearchRequest request, CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(request);

        // A missing query is treated as empty so it is rejected as "too short"
        // instead of surfacing as a null dereference inside the extractor.
        var query = _textExtractor.Normalize(request.QueryText ?? string.Empty);
        if (query.Length < MinQueryChars)
        {
            throw new InvalidOperationException("Search query is too short.");
        }

        // Math.Max keeps the clamp range valid even if MaxTopK is configured below 1.
        var topK = Math.Clamp(request.TopK ?? _settings.DefaultTopK, 1, Math.Max(1, _settings.MaxTopK));

        var queryEmbedding = await _ai.CreateEmbeddingAsync(query, ct);

        // NOTE(review): topK is passed to the chunk search, presumably as the candidate
        // limit. If so, grouping by document below can yield fewer than topK documents — confirm.
        var candidates = await _repository.SearchChunksAsync(queryEmbedding, request.TargetDocumentTypes, topK, ct);

        var results = candidates
            .GroupBy(candidate => candidate.Document.Id)
            .Select(group =>
            {
                // Rank once; the head of the ranking supplies both the document data and its score.
                var ranked = group.OrderByDescending(candidate => candidate.Score).ToList();
                var best = ranked[0];

                return new SearchDocumentResult
                {
                    DocumentId = best.Document.Id,
                    DocumentType = best.Document.DocumentType,
                    Title = best.Document.Title,
                    SourceUrl = best.Document.SourceUrl,
                    Score = best.Score,
                    MatchedChunks = ranked
                        .Take(MaxMatchedChunksPerDocument)
                        .Select(candidate => new SearchChunkResult
                        {
                            ChunkId = candidate.Chunk.Id,
                            ChunkIndex = candidate.Chunk.ChunkIndex,
                            Text = candidate.Chunk.Text,
                            Score = candidate.Score
                        })
                        .ToList()
                };
            })
            .OrderByDescending(result => result.Score)
            .Take(topK)
            .ToList();

        return new SearchResponse { Results = results };
    }

    /// <summary>
    /// Loads a stored document by id and maps it to a details response.
    /// </summary>
    /// <param name="documentId">Identifier of the document to load.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>The document details, or <c>null</c> when the repository returns no document.</returns>
    public async Task<RagDocumentDetailsResponse?> GetDocumentAsync(string documentId, CancellationToken ct)
    {
        var document = await _repository.GetDocumentByIdAsync(documentId, ct);
        if (document is null)
        {
            return null;
        }

        return new RagDocumentDetailsResponse
        {
            Id = document.Id,
            DocumentType = document.DocumentType,
            Title = document.Title,
            SourceUrl = document.SourceUrl,
            Text = document.Text,
            TextHash = document.TextHash,
            CreatedAt = document.CreatedAt
        };
    }

    /// <summary>
    /// Indexes already-prepared text: returns the existing document when one with the same
    /// text hash and source URL is found, otherwise classifies, chunks, embeds and saves it.
    /// </summary>
    /// <param name="text">The text to index, already normalized/extracted and truncated by the caller.</param>
    /// <param name="documentType">Optional document type passed to the classifier.</param>
    /// <param name="title">Optional title passed to the classifier.</param>
    /// <param name="sourceUrl">Optional source URL; used in the duplicate lookup and stored with the document.</param>
    /// <param name="metadata">Optional metadata; when null the classifier's metadata is stored instead.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>
    /// A response with <c>Cached = true</c> and <c>Chunks = 0</c> for an existing document,
    /// or <c>Cached = false</c> and the number of saved chunks for a new one.
    /// </returns>
    private async Task<IndexDocumentResponse> IndexNormalizedTextAsync(
        string text,
        string? documentType,
        string? title,
        string? sourceUrl,
        Dictionary<string, string>? metadata,
        CancellationToken ct)
    {
        var textHash = HashHelper.Compute(text);

        // Skip classification and embedding entirely when this text was indexed before.
        var cached = await _repository.GetDocumentByTextHashAsync(textHash, sourceUrl, ct);
        if (cached is not null)
        {
            return new IndexDocumentResponse
            {
                DocumentId = cached.Id,
                TextHash = cached.TextHash,
                DocumentType = cached.DocumentType,
                DocumentTypeConfidence = cached.TypeConfidence,
                Title = cached.Title,
                Chunks = 0,
                Characters = cached.Text.Length,
                Cached = true
            };
        }

        var classification = await _classifier.ClassifyAsync(text, documentType, title, ct);
        var chunks = _chunker.Chunk(text, _settings.ChunkSize, _settings.ChunkOverlap);

        var document = new RagDocumentRecord
        {
            Id = Guid.NewGuid().ToString("N"),
            DocumentType = classification.DocumentType,
            // The stored title is the classifier's result, not the raw caller-supplied title.
            Title = classification.Title,
            SourceUrl = sourceUrl,
            Text = text,
            TextHash = textHash,
            TypeConfidence = classification.Confidence,
            // Caller-supplied metadata takes precedence over the classifier's metadata.
            MetadataJson = JsonSerializer.Serialize(metadata ?? classification.Metadata),
            CreatedAt = DateTimeOffset.UtcNow
        };

        // Embeddings are requested one chunk at a time, in chunk order.
        var records = new List<RagChunkRecord>(chunks.Count);
        for (var i = 0; i < chunks.Count; i++)
        {
            ct.ThrowIfCancellationRequested();

            records.Add(new RagChunkRecord
            {
                Id = Guid.NewGuid().ToString("N"),
                DocumentId = document.Id,
                ChunkIndex = i,
                Text = chunks[i],
                Embedding = await _ai.CreateEmbeddingAsync(chunks[i], ct)
            });
        }

        await _repository.SaveDocumentAsync(document, records, ct);

        return new IndexDocumentResponse
        {
            DocumentId = document.Id,
            TextHash = document.TextHash,
            DocumentType = document.DocumentType,
            DocumentTypeConfidence = document.TypeConfidence,
            Title = document.Title,
            Chunks = records.Count,
            Characters = text.Length,
            Cached = false
        };
    }

    /// <summary>
    /// Returns <paramref name="text"/> cut to at most <paramref name="maxChars"/> UTF-16
    /// code units, or unchanged when it is already within the limit.
    /// </summary>
    private static string TruncateToLimit(string text, int maxChars)
    {
        if (text.Length <= maxChars)
        {
            return text;
        }

        // Cutting right after a high surrogate would leave half of a surrogate pair at the
        // end of the text, so back off by one code unit in that case.
        var end = maxChars;
        if (end > 0 && char.IsHighSurrogate(text[end - 1]))
        {
            end--;
        }

        return text[..end];
    }
}