// myAi/Apis/rag-api/Services/RagService.cs

using System.Text.Json;
using Microsoft.Extensions.Options;
using Api.Services.Contracts;
using Rag.Models.Requests;
using Rag.Models.Responses;
using Rag.Models.Settings;
using Rag.Data.Repositories.Contracts;
using Api.Clients.Ai.Contracts;
using Rag.Models;
using CommonHelpers;

namespace Api.Services;

/// <summary>
/// Implements the core RAG pipeline: document classification, chunking, embedding, vector search, and retrieval.
/// </summary>
public sealed class RagService : IRagService
{
    /// <summary>Minimum number of characters a document must contain to be indexed.</summary>
    private const int MinDocumentChars = 40;

    /// <summary>Minimum number of characters a normalized search query must contain.</summary>
    private const int MinQueryChars = 10;

    /// <summary>Maximum number of matched chunks reported per document in search results.</summary>
    private const int MaxMatchedChunksPerDocument = 3;

    private readonly ITextExtractor _textExtractor;
    private readonly ITextChunker _chunker;
    private readonly IDocumentClassifier _classifier;
    private readonly IAiClient _ai;
    private readonly IRagRepository _repository;
    private readonly RagSettings _settings;

    /// <summary>
    /// Creates the service from its collaborators and captures the current <see cref="RagSettings"/> value.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="options"/> is <c>null</c>.</exception>
    public RagService(
        ITextExtractor textExtractor,
        ITextChunker chunker,
        IDocumentClassifier classifier,
        IAiClient ai,
        IRagRepository repository,
        IOptions<RagSettings> options)
    {
        // options is dereferenced right here, so fail with a descriptive exception instead of an NRE.
        ArgumentNullException.ThrowIfNull(options);

        _textExtractor = textExtractor;
        _chunker = chunker;
        _classifier = classifier;
        _ai = ai;
        _repository = repository;
        _settings = options.Value;
    }

    /// <inheritdoc />
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
    /// <exception cref="InvalidOperationException">The normalized text is shorter than the minimum document length.</exception>
    public async Task<IndexDocumentResponse> IndexTextAsync(IndexDocumentRequest request, CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(request);

        var text = _textExtractor.Normalize(request.Text ?? string.Empty);
        if (text.Length < MinDocumentChars)
        {
            throw new InvalidOperationException("Document text is too short.");
        }

        text = TruncateToLimit(text, _settings.MaxTextChars);
        return await IndexNormalizedTextAsync(text, request.DocumentType, request.Title, request.SourceUrl, request.Metadata, ct);
    }

    /// <inheritdoc />
    /// <exception cref="ArgumentNullException"><paramref name="file"/> is <c>null</c>.</exception>
    /// <exception cref="InvalidOperationException">
    /// The file is empty, exceeds the configured size limit, does not have a <c>.pdf</c> extension,
    /// or yields less text than the minimum document length.
    /// </exception>
    public async Task<IndexDocumentResponse> IndexPdfAsync(IFormFile file, string? documentType, string? title, string? sourceUrl, CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(file);

        if (file.Length <= 0)
        {
            throw new InvalidOperationException("Uploaded file is empty.");
        }

        if (file.Length > _settings.MaxFileSizeMb * 1024L * 1024L)
        {
            throw new InvalidOperationException($"File is too large. Max size is {_settings.MaxFileSizeMb} MB.");
        }

        // Only the file name's extension is inspected here; the content itself is not sniffed.
        if (!string.Equals(Path.GetExtension(file.FileName), ".pdf", StringComparison.OrdinalIgnoreCase))
        {
            throw new InvalidOperationException("Only PDF files are supported by this endpoint.");
        }

        await using var stream = file.OpenReadStream();
        var text = TruncateToLimit(await _textExtractor.ExtractPdfAsync(stream, ct), _settings.MaxTextChars);
        if (text.Length < MinDocumentChars)
        {
            throw new InvalidOperationException("Could not extract enough text from the PDF.");
        }

        // NOTE(review): unlike IndexTextAsync, the extracted text is not passed through Normalize here —
        // presumably ExtractPdfAsync already returns normalized text; confirm against the extractor.
        var metadata = new Dictionary<string, string> { ["fileName"] = file.FileName };
        return await IndexNormalizedTextAsync(text, documentType, title ?? file.FileName, sourceUrl, metadata, ct);
    }

    /// <inheritdoc />
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
    /// <exception cref="InvalidOperationException">The normalized query is shorter than the minimum query length.</exception>
    public async Task<SearchResponse> SearchAsync(SearchRequest request, CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(request);

        // Treat a missing query like an empty one (same as IndexTextAsync does for text),
        // so the caller gets the "too short" error below.
        var query = _textExtractor.Normalize(request.QueryText ?? string.Empty);
        if (query.Length < MinQueryChars)
        {
            throw new InvalidOperationException("Search query is too short.");
        }

        // Clamp the requested size into [1, MaxTopK]; the inner Max keeps the range valid if MaxTopK < 1.
        var topK = Math.Clamp(request.TopK ?? _settings.DefaultTopK, 1, Math.Max(1, _settings.MaxTopK));
        var queryEmbedding = await _ai.CreateEmbeddingAsync(query, ct);

        // NOTE(review): topK is passed to the chunk search, and chunks are then grouped per document,
        // so the response can contain fewer than topK documents (presumably the repository returns
        // at most topK chunks — confirm).
        var candidates = await _repository.SearchChunksAsync(queryEmbedding, request.TargetDocumentTypes, topK, ct);

        var results = candidates
            .GroupBy(candidate => candidate.Document.Id)
            .Select(group =>
            {
                // Rank the document's chunks once; the head of the ranking carries the document score.
                var ranked = group.OrderByDescending(candidate => candidate.Score).ToList();
                var best = ranked[0];
                return new SearchDocumentResult
                {
                    DocumentId = best.Document.Id,
                    DocumentType = best.Document.DocumentType,
                    Title = best.Document.Title,
                    SourceUrl = best.Document.SourceUrl,
                    Score = best.Score,
                    MatchedChunks = ranked
                        .Take(MaxMatchedChunksPerDocument)
                        .Select(candidate => new SearchChunkResult
                        {
                            ChunkId = candidate.Chunk.Id,
                            ChunkIndex = candidate.Chunk.ChunkIndex,
                            Text = candidate.Chunk.Text,
                            Score = candidate.Score
                        })
                        .ToList()
                };
            })
            .OrderByDescending(result => result.Score)
            .Take(topK)
            .ToList();

        return new SearchResponse { Results = results };
    }

    /// <inheritdoc />
    public async Task<RagDocumentDetailsResponse?> GetDocumentAsync(string documentId, CancellationToken ct)
    {
        var document = await _repository.GetDocumentByIdAsync(documentId, ct);
        if (document is null)
        {
            return null;
        }

        return new RagDocumentDetailsResponse
        {
            Id = document.Id,
            DocumentType = document.DocumentType,
            Title = document.Title,
            SourceUrl = document.SourceUrl,
            Text = document.Text,
            TextHash = document.TextHash,
            CreatedAt = document.CreatedAt
        };
    }

    /// <summary>
    /// Core indexing pipeline: computes a text hash for deduplication, chunks and classifies the text,
    /// generates embeddings for each chunk, and persists the document and chunks to the repository.
    /// Returns cached metadata without re-indexing when the same text hash and source URL already exist.
    /// </summary>
    /// <param name="text">Text to index; callers have already length-checked and truncated it.</param>
    /// <param name="documentType">Optional document type passed to the classifier.</param>
    /// <param name="title">Optional title passed to the classifier.</param>
    /// <param name="sourceUrl">Optional source URL; stored on the document and used in the duplicate lookup.</param>
    /// <param name="metadata">Optional metadata; when <c>null</c>, the classifier's metadata is stored instead.</param>
    /// <param name="ct">Cancellation token, checked before each chunk is embedded.</param>
    /// <returns>
    /// A summary of the stored document. For a cache hit, <c>Cached</c> is <c>true</c> and <c>Chunks</c> is 0.
    /// </returns>
    /// <exception cref="InvalidOperationException">The chunker produced no chunks for <paramref name="text"/>.</exception>
    private async Task<IndexDocumentResponse> IndexNormalizedTextAsync(
        string text,
        string? documentType,
        string? title,
        string? sourceUrl,
        Dictionary<string, string>? metadata,
        CancellationToken ct)
    {
        var textHash = HashHelper.Compute(text);
        var cached = await _repository.GetDocumentByTextHashAsync(textHash, sourceUrl, ct);
        if (cached is not null)
        {
            return new IndexDocumentResponse
            {
                DocumentId = cached.Id,
                TextHash = cached.TextHash,
                DocumentType = cached.DocumentType,
                DocumentTypeConfidence = cached.TypeConfidence,
                Title = cached.Title,
                Chunks = 0, // no chunks are created on a cache hit
                Characters = cached.Text.Length,
                Cached = true
            };
        }

        // Chunk before classifying so an unusable document fails before any classifier/embedding call.
        var chunks = _chunker.Chunk(text, _settings.ChunkSize, _settings.ChunkOverlap);
        if (chunks.Count == 0)
        {
            // A document saved without chunks could never be returned by SearchAsync, yet it would
            // satisfy the hash lookup above and make later indexing attempts report Cached = true.
            throw new InvalidOperationException("Document text produced no chunks.");
        }

        var classification = await _classifier.ClassifyAsync(text, documentType, title, ct);
        var document = new RagDocumentRecord
        {
            Id = Guid.NewGuid().ToString("N"),
            DocumentType = classification.DocumentType,
            Title = classification.Title,
            SourceUrl = sourceUrl,
            Text = text,
            TextHash = textHash,
            TypeConfidence = classification.Confidence,
            MetadataJson = JsonSerializer.Serialize(metadata ?? classification.Metadata),
            CreatedAt = DateTimeOffset.UtcNow
        };

        // Embeddings are requested one chunk at a time, in chunk order.
        var records = new List<RagChunkRecord>(chunks.Count);
        for (var i = 0; i < chunks.Count; i++)
        {
            ct.ThrowIfCancellationRequested();
            records.Add(new RagChunkRecord
            {
                Id = Guid.NewGuid().ToString("N"),
                DocumentId = document.Id,
                ChunkIndex = i,
                Text = chunks[i],
                Embedding = await _ai.CreateEmbeddingAsync(chunks[i], ct)
            });
        }

        await _repository.SaveDocumentAsync(document, records, ct);
        return new IndexDocumentResponse
        {
            DocumentId = document.Id,
            TextHash = document.TextHash,
            DocumentType = document.DocumentType,
            DocumentTypeConfidence = document.TypeConfidence,
            Title = document.Title,
            Chunks = records.Count,
            Characters = text.Length,
            Cached = false
        };
    }

    /// <summary>
    /// Returns <paramref name="text"/> cut down to at most <paramref name="maxChars"/> UTF-16 code units.
    /// If the cut would land between the two halves of a surrogate pair, the dangling high surrogate
    /// is dropped as well, so the result can be one character shorter than the limit.
    /// </summary>
    /// <param name="text">Text to truncate.</param>
    /// <param name="maxChars">Maximum number of <see cref="char"/> values to keep.</param>
    /// <returns>The original string when it already fits; otherwise the truncated prefix.</returns>
    private static string TruncateToLimit(string text, int maxChars)
    {
        if (text.Length <= maxChars)
        {
            return text;
        }

        var end = maxChars;
        if (end > 0 && char.IsHighSurrogate(text[end - 1]))
        {
            // Keeping this char would leave a lone high surrogate (invalid UTF-16) at the end.
            end--;
        }

        return text[..end];
    }
}