Changes

2026-05-14 14:12:29 +03:00
parent 92278ae375
commit 75bc9509c5
137 changed files with 0 additions and 371 deletions
@@ -0,0 +1,8 @@
+using Rag.Models;
+
+namespace Api.Services.Contracts;
+
+/// <summary>
+/// Classifies a document's text into a document type and produces a title for it.
+/// </summary>
+public interface IDocumentClassifier
+{
+    /// <summary>
+    /// Classifies <paramref name="text"/> and returns the resulting <see cref="DocumentClassification"/>.
+    /// </summary>
+    /// <param name="text">Document text to classify.</param>
+    /// <param name="providedType">
+    /// Optional caller-supplied document type. The <c>DocumentClassifier</c> implementation uses it
+    /// (after normalization) instead of keyword detection whenever it is not blank.
+    /// </param>
+    /// <param name="providedTitle">
+    /// Optional caller-supplied title. The <c>DocumentClassifier</c> implementation derives a title
+    /// from the text when this is blank.
+    /// </param>
+    /// <param name="ct">Cancellation token for the operation.</param>
+    /// <returns>A classification carrying the document type, a confidence value and a title.</returns>
+    Task<DocumentClassification> ClassifyAsync(string text, string? providedType, string? providedTitle, CancellationToken ct);
+}
@@ -0,0 +1,12 @@
+using Rag.Models.Requests;
+using Rag.Models.Responses;
+
+namespace Api.Services.Contracts;
+
+/// <summary>
+/// Entry points for indexing documents (plain text or PDF uploads), searching the indexed
+/// chunks, and reading back a stored document.
+/// </summary>
+public interface IRagService
+{
+    /// <summary>
+    /// Indexes the text carried by <paramref name="request"/>.
+    /// The <c>RagService</c> implementation throws <see cref="InvalidOperationException"/>
+    /// when the normalized text is shorter than 40 characters.
+    /// </summary>
+    /// <param name="request">Text to index plus optional document type, title, source URL and metadata.</param>
+    /// <param name="ct">Cancellation token for the operation.</param>
+    /// <returns>Summary of the indexed (or already stored) document.</returns>
+    Task<IndexDocumentResponse> IndexTextAsync(IndexDocumentRequest request, CancellationToken ct);
+
+    /// <summary>
+    /// Extracts the text of an uploaded PDF and indexes it.
+    /// The <c>RagService</c> implementation throws <see cref="InvalidOperationException"/>
+    /// for empty files, files over the configured size limit, non-<c>.pdf</c> file names,
+    /// and PDFs that yield fewer than 40 characters of text.
+    /// </summary>
+    /// <param name="file">Uploaded file.</param>
+    /// <param name="documentType">Optional caller-supplied document type.</param>
+    /// <param name="title">Optional title; <c>RagService</c> uses the file name when this is <c>null</c>.</param>
+    /// <param name="sourceUrl">Optional URL the document originates from.</param>
+    /// <param name="ct">Cancellation token for the operation.</param>
+    /// <returns>Summary of the indexed (or already stored) document.</returns>
+    Task<IndexDocumentResponse> IndexPdfAsync(IFormFile file, string? documentType, string? title, string? sourceUrl, CancellationToken ct);
+
+    /// <summary>
+    /// Searches the indexed chunks for the query in <paramref name="request"/> and returns
+    /// the matches grouped per document.
+    /// </summary>
+    /// <param name="request">Query text, optional document-type filter and optional result limit.</param>
+    /// <param name="ct">Cancellation token for the operation.</param>
+    /// <returns>The matching documents with their best-scoring chunks.</returns>
+    Task<SearchResponse> SearchAsync(SearchRequest request, CancellationToken ct);
+
+    /// <summary>
+    /// Loads a stored document by its identifier.
+    /// </summary>
+    /// <param name="documentId">Identifier of the document to load.</param>
+    /// <param name="ct">Cancellation token for the operation.</param>
+    /// <returns>The document details; <c>RagService</c> returns <c>null</c> when the repository has no such document.</returns>
+    Task<RagDocumentDetailsResponse?> GetDocumentAsync(string documentId, CancellationToken ct);
+}
@@ -0,0 +1,6 @@
+namespace Api.Services.Contracts;
+
+/// <summary>
+/// Splits a text into consecutive, optionally overlapping chunks.
+/// </summary>
+public interface ITextChunker
+{
+    /// <summary>
+    /// Splits <paramref name="text"/> into chunks.
+    /// </summary>
+    /// <param name="text">Text to split.</param>
+    /// <param name="chunkSize">
+    /// Requested chunk length in characters. The <c>TextChunker</c> implementation clamps it to 300–3000.
+    /// </param>
+    /// <param name="overlap">
+    /// Requested number of characters shared by neighbouring chunks. The <c>TextChunker</c>
+    /// implementation clamps it to between 0 and half of the effective chunk size.
+    /// </param>
+    /// <returns>The chunks in document order; <c>TextChunker</c> returns an empty list for blank text.</returns>
+    IReadOnlyList<string> Chunk(string text, int chunkSize, int overlap);
+}
@@ -0,0 +1,7 @@
+namespace Api.Services.Contracts;
+
+/// <summary>
+/// Extracts text from PDF content and normalizes whitespace in text.
+/// </summary>
+public interface ITextExtractor
+{
+    /// <summary>
+    /// Reads the text of the PDF in <paramref name="stream"/>.
+    /// The <c>TextExtractor</c> implementation returns the text of all pages after passing it
+    /// through <see cref="Normalize"/>.
+    /// </summary>
+    /// <param name="stream">Stream containing the PDF.</param>
+    /// <param name="ct">Cancellation token for the operation.</param>
+    /// <returns>The extracted text.</returns>
+    Task<string> ExtractPdfAsync(Stream stream, CancellationToken ct);
+
+    /// <summary>
+    /// Normalizes <paramref name="value"/>. The <c>TextExtractor</c> implementation collapses every
+    /// run of whitespace into a single space and returns an empty string for blank input.
+    /// </summary>
+    /// <param name="value">Text to normalize.</param>
+    /// <returns>The normalized text.</returns>
+    string Normalize(string value);
+}
@@ -0,0 +1,65 @@
+using System.Text.RegularExpressions;
+using Api.Services.Contracts;
+using Rag.Models;
+
+namespace Api.Services;
+
+/// <summary>
+/// Keyword-based <see cref="IDocumentClassifier"/>.
+/// A caller-supplied type always wins; otherwise the text is scored against a fixed
+/// keyword list per document type and the highest-scoring type is chosen.
+/// </summary>
+public sealed class DocumentClassifier : IDocumentClassifier
+{
+    private const string UnknownType = "unknown";
+    private const int MaxTitleLength = 120;
+
+    private static readonly HashSet<string> KnownTypes = new(StringComparer.OrdinalIgnoreCase)
+    {
+        "cv", "job", "article", "contract", "invoice", "product", "documentation", "unknown"
+    };
+
+    // Built once instead of on every call.
+    private static readonly Regex InvalidTypeChars = new("[^a-z0-9_-]", RegexOptions.Compiled);
+
+    // An ordered array rather than a Dictionary: when two types score the same, the one listed
+    // first wins, and Dictionary enumeration order is not a documented guarantee.
+    private static readonly (string Type, string[] Terms)[] KeywordTable =
+    {
+        ("cv", new[] { "curriculum vitae", "resume", "work experience", "professional experience", "education", "skills", "technologies", "linkedin", "github" }),
+        ("job", new[] { "job description", "requirements", "responsibilities", "qualifications", "apply", "we are looking", "salary", "benefits", "remote", "hybrid" }),
+        ("contract", new[] { "agreement", "contract", "party", "parties", "liability", "termination", "confidentiality", "governing law" }),
+        ("invoice", new[] { "invoice", "vat", "subtotal", "total", "amount due", "due date", "billing" }),
+        ("documentation", new[] { "api", "endpoint", "configuration", "install", "usage", "parameters", "response", "request" }),
+        ("product", new[] { "features", "pricing", "sku", "product", "specification", "warranty" })
+    };
+
+    /// <summary>
+    /// Classifies <paramref name="text"/>.
+    /// </summary>
+    /// <param name="text">Document text to classify.</param>
+    /// <param name="providedType">
+    /// Optional caller-supplied type. When not blank it is normalized and returned as the document
+    /// type with confidence 1.0 for a known type other than "unknown", and 0.6 otherwise.
+    /// </param>
+    /// <param name="providedTitle">Optional caller-supplied title; used verbatim (trimmed) when not blank.</param>
+    /// <param name="ct">Not observed: the classification runs synchronously.</param>
+    /// <returns>
+    /// A completed task holding the classification. Without a provided type the confidence is
+    /// 0.25 for "unknown", otherwise 0.45 plus 0.08 per matched keyword, capped at 0.95.
+    /// </returns>
+    public Task<DocumentClassification> ClassifyAsync(string text, string? providedType, string? providedTitle, CancellationToken ct)
+    {
+        if (!string.IsNullOrWhiteSpace(providedType))
+        {
+            var normalized = NormalizeType(providedType);
+            var isRecognised = normalized != UnknownType && KnownTypes.Contains(normalized);
+            return Task.FromResult(new DocumentClassification
+            {
+                DocumentType = normalized,
+                Confidence = isRecognised ? 1.0 : 0.6,
+                Title = BuildTitle(providedTitle, text, normalized)
+            });
+        }
+
+        var lower = text.ToLowerInvariant();
+        var bestType = UnknownType;
+        var bestScore = 0;
+        foreach (var (candidateType, terms) in KeywordTable)
+        {
+            var score = CountMatches(lower, terms);
+            // Strictly greater: on a tie the earlier table entry is kept.
+            if (score > bestScore)
+            {
+                bestScore = score;
+                bestType = candidateType;
+            }
+        }
+
+        var confidence = bestScore == 0 ? 0.25 : Math.Min(0.95, 0.45 + bestScore * 0.08);
+
+        return Task.FromResult(new DocumentClassification
+        {
+            DocumentType = bestType,
+            Confidence = confidence,
+            Title = BuildTitle(providedTitle, text, bestType)
+        });
+    }
+
+    /// <summary>
+    /// Counts how many of <paramref name="terms"/> occur in <paramref name="lower"/>.
+    /// Matching is by ordinal substring, so a short term such as "api" also matches inside
+    /// longer words (for example "capital").
+    /// </summary>
+    private static int CountMatches(string lower, string[] terms) =>
+        terms.Count(term => lower.Contains(term, StringComparison.Ordinal));
+
+    /// <summary>
+    /// Lower-cases and trims <paramref name="value"/> and replaces every character outside
+    /// <c>a-z</c>, <c>0-9</c>, <c>_</c> and <c>-</c> with <c>-</c>.
+    /// </summary>
+    private static string NormalizeType(string value)
+    {
+        var cleaned = InvalidTypeChars.Replace(value.Trim().ToLowerInvariant(), "-");
+        return string.IsNullOrWhiteSpace(cleaned) ? UnknownType : cleaned;
+    }
+
+    /// <summary>
+    /// Picks a title: the provided title if not blank, else the first segment of the text
+    /// (split on '.', line feed and carriage return) longer than 20 characters, truncated to
+    /// 120 characters, else "<paramref name="documentType"/> document".
+    /// </summary>
+    private static string BuildTitle(string? providedTitle, string text, string documentType)
+    {
+        if (!string.IsNullOrWhiteSpace(providedTitle))
+        {
+            return providedTitle.Trim();
+        }
+
+        var candidate = text
+            .Split('.', '\n', '\r')
+            .Select(segment => segment.Trim())
+            .FirstOrDefault(segment => segment.Length > 20);
+
+        if (candidate is null)
+        {
+            return $"{documentType} document";
+        }
+
+        if (candidate.Length <= MaxTitleLength)
+        {
+            return candidate;
+        }
+
+        // Do not cut a surrogate pair in half: a lone high surrogate is invalid UTF-16.
+        var cut = char.IsHighSurrogate(candidate[MaxTitleLength - 1]) ? MaxTitleLength - 1 : MaxTitleLength;
+        return candidate[..cut];
+    }
+}
@@ -0,0 +1,182 @@
+using System.Text.Json;
+using Microsoft.Extensions.Options;
+using Api.Services.Contracts;
+using Rag.Models.Requests;
+using Rag.Models.Responses;
+using Rag.Models.Settings;
+using Api.Data.Repositories.Contracts;
+using Api.Clients.Ai.Contracts;
+using Rag.Models;
+using CommonHelpers;
+
+namespace Api.Services;
+
+/// <summary>
+/// Default <see cref="IRagService"/>: validates input, deduplicates by text hash, classifies,
+/// chunks and embeds documents, and answers searches over the stored chunks.
+/// </summary>
+public sealed class RagService(
+    ITextExtractor textExtractor,
+    ITextChunker chunker,
+    IDocumentClassifier classifier,
+    IAiClient ai,
+    IRagRepository repository,
+    IOptions<RagSettings> options) : IRagService
+{
+    private readonly ITextExtractor _textExtractor = textExtractor;
+    private readonly ITextChunker _chunker = chunker;
+    private readonly IDocumentClassifier _classifier = classifier;
+    private readonly IAiClient _ai = ai;
+    private readonly IRagRepository _repository = repository;
+    private readonly RagSettings _settings = options.Value;
+
+    /// <summary>
+    /// Normalizes the request text, rejects it when shorter than 40 characters, truncates it to
+    /// the configured maximum length and indexes it.
+    /// </summary>
+    /// <exception cref="InvalidOperationException">The normalized text is shorter than 40 characters.</exception>
+    public async Task<IndexDocumentResponse> IndexTextAsync(IndexDocumentRequest request, CancellationToken ct)
+    {
+        var normalized = _textExtractor.Normalize(request.Text ?? string.Empty);
+        if (normalized.Length < 40)
+        {
+            throw new InvalidOperationException("Document text is too short.");
+        }
+
+        var bounded = TruncateToMaxChars(normalized);
+        return await IndexNormalizedTextAsync(bounded, request.DocumentType, request.Title, request.SourceUrl, request.Metadata, ct);
+    }
+
+    /// <summary>
+    /// Validates the upload (non-empty, within the size limit, <c>.pdf</c> extension), extracts
+    /// its text, truncates it to the configured maximum length and indexes it. The file name is
+    /// used as the title when <paramref name="title"/> is <c>null</c> and is stored as the
+    /// <c>fileName</c> metadata entry.
+    /// </summary>
+    /// <exception cref="InvalidOperationException">
+    /// The file is empty, too large, not a <c>.pdf</c>, or yields fewer than 40 characters of text.
+    /// </exception>
+    public async Task<IndexDocumentResponse> IndexPdfAsync(IFormFile file, string? documentType, string? title, string? sourceUrl, CancellationToken ct)
+    {
+        if (file.Length <= 0)
+        {
+            throw new InvalidOperationException("Uploaded file is empty.");
+        }
+
+        var maxBytes = _settings.MaxFileSizeMb * 1024L * 1024L;
+        if (file.Length > maxBytes)
+        {
+            throw new InvalidOperationException($"File is too large. Max size is {_settings.MaxFileSizeMb} MB.");
+        }
+
+        var extension = Path.GetExtension(file.FileName);
+        if (!string.Equals(extension, ".pdf", StringComparison.OrdinalIgnoreCase))
+        {
+            throw new InvalidOperationException("Only PDF files are supported by this endpoint.");
+        }
+
+        await using var pdfStream = file.OpenReadStream();
+        var extracted = TruncateToMaxChars(await _textExtractor.ExtractPdfAsync(pdfStream, ct));
+        if (extracted.Length < 40)
+        {
+            throw new InvalidOperationException("Could not extract enough text from the PDF.");
+        }
+
+        var metadata = new Dictionary<string, string> { ["fileName"] = file.FileName };
+        return await IndexNormalizedTextAsync(extracted, documentType, title ?? file.FileName, sourceUrl, metadata, ct);
+    }
+
+    /// <summary>
+    /// Embeds the normalized query, asks the repository for matching chunks and groups them per
+    /// document. Each document carries its highest chunk score and up to three best chunks;
+    /// documents are ordered by score and limited to the effective top-K.
+    /// </summary>
+    /// <exception cref="InvalidOperationException">The normalized query is shorter than 10 characters.</exception>
+    public async Task<SearchResponse> SearchAsync(SearchRequest request, CancellationToken ct)
+    {
+        var query = _textExtractor.Normalize(request.QueryText);
+        if (query.Length < 10)
+        {
+            throw new InvalidOperationException("Search query is too short.");
+        }
+
+        // The requested (or default) top-K is kept within 1..MaxTopK, with the ceiling never below 1.
+        var ceiling = Math.Max(1, _settings.MaxTopK);
+        var topK = Math.Clamp(request.TopK ?? _settings.DefaultTopK, 1, ceiling);
+
+        var queryEmbedding = await _ai.CreateEmbeddingAsync(query, ct);
+        var hits = await _repository.SearchChunksAsync(queryEmbedding, request.TargetDocumentTypes, topK, ct);
+
+        var perDocument = new List<SearchDocumentResult>();
+        foreach (var documentHits in hits.GroupBy(hit => hit.Document.Id))
+        {
+            // Sorted once; the sort is stable, so the head and the top three match a fresh sort.
+            var ranked = documentHits.OrderByDescending(hit => hit.Score).ToList();
+            var top = ranked[0];
+
+            perDocument.Add(new SearchDocumentResult
+            {
+                DocumentId = top.Document.Id,
+                DocumentType = top.Document.DocumentType,
+                Title = top.Document.Title,
+                SourceUrl = top.Document.SourceUrl,
+                Score = documentHits.Max(hit => hit.Score),
+                MatchedChunks = ranked
+                    .Take(3)
+                    .Select(hit => new SearchChunkResult
+                    {
+                        ChunkId = hit.Chunk.Id,
+                        ChunkIndex = hit.Chunk.ChunkIndex,
+                        Text = hit.Chunk.Text,
+                        Score = hit.Score
+                    })
+                    .ToList()
+            });
+        }
+
+        var results = perDocument
+            .OrderByDescending(document => document.Score)
+            .Take(topK)
+            .ToList();
+
+        return new SearchResponse { Results = results };
+    }
+
+    /// <summary>
+    /// Loads a document from the repository and maps it to the details response.
+    /// </summary>
+    /// <returns>The document details, or <c>null</c> when the repository returns no document.</returns>
+    public async Task<RagDocumentDetailsResponse?> GetDocumentAsync(string documentId, CancellationToken ct)
+    {
+        var stored = await _repository.GetDocumentByIdAsync(documentId, ct);
+        if (stored is null)
+        {
+            return null;
+        }
+
+        return new RagDocumentDetailsResponse
+        {
+            Id = stored.Id,
+            DocumentType = stored.DocumentType,
+            Title = stored.Title,
+            SourceUrl = stored.SourceUrl,
+            Text = stored.Text,
+            TextHash = stored.TextHash,
+            CreatedAt = stored.CreatedAt
+        };
+    }
+
+    /// <summary>Cuts <paramref name="text"/> down to the configured maximum number of characters.</summary>
+    private string TruncateToMaxChars(string text)
+    {
+        var limit = _settings.MaxTextChars;
+        return text.Length > limit ? text[..limit] : text;
+    }
+
+    /// <summary>
+    /// Indexes already-normalized text. When the repository already holds a document for the
+    /// same text hash and source URL, that document is reported back with <c>Cached = true</c>
+    /// and zero chunks; otherwise the text is classified, chunked, embedded chunk by chunk and saved.
+    /// </summary>
+    private async Task<IndexDocumentResponse> IndexNormalizedTextAsync(
+        string text,
+        string? documentType,
+        string? title,
+        string? sourceUrl,
+        Dictionary<string, string>? metadata,
+        CancellationToken ct)
+    {
+        var textHash = HashHelper.Compute(text);
+
+        var existing = await _repository.GetDocumentByTextHashAsync(textHash, sourceUrl, ct);
+        if (existing is not null)
+        {
+            return new IndexDocumentResponse
+            {
+                DocumentId = existing.Id,
+                TextHash = existing.TextHash,
+                DocumentType = existing.DocumentType,
+                DocumentTypeConfidence = existing.TypeConfidence,
+                Title = existing.Title,
+                Chunks = 0,
+                Characters = existing.Text.Length,
+                Cached = true
+            };
+        }
+
+        var classification = await _classifier.ClassifyAsync(text, documentType, title, ct);
+        var chunkTexts = _chunker.Chunk(text, _settings.ChunkSize, _settings.ChunkOverlap);
+
+        var document = new RagDocumentRecord
+        {
+            Id = Guid.NewGuid().ToString("N"),
+            DocumentType = classification.DocumentType,
+            Title = classification.Title,
+            SourceUrl = sourceUrl,
+            Text = text,
+            TextHash = textHash,
+            TypeConfidence = classification.Confidence,
+            // Caller-supplied metadata takes precedence over the classifier's metadata.
+            MetadataJson = JsonSerializer.Serialize(metadata ?? classification.Metadata),
+            CreatedAt = DateTimeOffset.UtcNow
+        };
+
+        var chunkRecords = new List<RagChunkRecord>(chunkTexts.Count);
+        var chunkIndex = 0;
+        foreach (var chunkText in chunkTexts)
+        {
+            // Embeddings are requested one at a time; stop before the next call if cancelled.
+            ct.ThrowIfCancellationRequested();
+            var embedding = await _ai.CreateEmbeddingAsync(chunkText, ct);
+            chunkRecords.Add(new RagChunkRecord
+            {
+                Id = Guid.NewGuid().ToString("N"),
+                DocumentId = document.Id,
+                ChunkIndex = chunkIndex,
+                Text = chunkText,
+                Embedding = embedding
+            });
+            chunkIndex++;
+        }
+
+        await _repository.SaveDocumentAsync(document, chunkRecords, ct);
+
+        return new IndexDocumentResponse
+        {
+            DocumentId = document.Id,
+            TextHash = document.TextHash,
+            DocumentType = document.DocumentType,
+            DocumentTypeConfidence = document.TypeConfidence,
+            Title = document.Title,
+            Chunks = chunkRecords.Count,
+            Characters = text.Length,
+            Cached = false
+        };
+    }
+}
@@ -0,0 +1,24 @@
+using Api.Services.Contracts;
+
+namespace Api.Services;
+
+/// <summary>
+/// Fixed-window <see cref="ITextChunker"/>: slides a window of <c>chunkSize</c> characters over
+/// the text, advancing by <c>chunkSize - overlap</c> each step.
+/// </summary>
+public sealed class TextChunker : ITextChunker
+{
+    private const int MinChunkSize = 300;
+    private const int MaxChunkSize = 3000;
+
+    /// <summary>
+    /// Splits <paramref name="text"/> into trimmed chunks.
+    /// </summary>
+    /// <param name="text">Text to split; blank text yields an empty list.</param>
+    /// <param name="chunkSize">Requested window length; clamped to 300–3000 characters.</param>
+    /// <param name="overlap">Requested overlap; clamped to between 0 and half of the effective chunk size.</param>
+    /// <returns>
+    /// The non-blank chunks in document order. The last chunk is the first window that reaches
+    /// the end of the text, so no chunk is wholly contained in the one before it.
+    /// </returns>
+    public IReadOnlyList<string> Chunk(string text, int chunkSize, int overlap)
+    {
+        if (string.IsNullOrWhiteSpace(text)) return [];
+        chunkSize = Math.Clamp(chunkSize, MinChunkSize, MaxChunkSize);
+        overlap = Math.Clamp(overlap, 0, chunkSize / 2);
+
+        // overlap <= chunkSize / 2, so the step is always positive and the loop always advances.
+        var step = chunkSize - overlap;
+
+        var chunks = new List<string>();
+        for (var start = 0; start < text.Length; start += step)
+        {
+            var length = Math.Min(chunkSize, text.Length - start);
+            var chunk = text.Substring(start, length).Trim();
+            if (chunk.Length > 0) chunks.Add(chunk);
+
+            // This window already reached the end of the text. Stepping further would only emit
+            // a tail that is fully contained in the chunk just added.
+            if (start + length >= text.Length) break;
+        }
+        return chunks;
+    }
+}
@@ -0,0 +1,27 @@
+using System.Text;
+using Api.Services.Contracts;
+using UglyToad.PdfPig;
+
+namespace Api.Services;
+
+/// <summary>
+/// <see cref="ITextExtractor"/> backed by PdfPig for PDF reading, plus whitespace normalization.
+/// </summary>
+public sealed class TextExtractor : ITextExtractor
+{
+    /// <summary>
+    /// Reads the text of every page of the PDF in <paramref name="stream"/>, separated by
+    /// blank lines, and returns it normalized. The work is done synchronously; the returned
+    /// task is already completed.
+    /// </summary>
+    /// <param name="stream">Stream containing the PDF.</param>
+    /// <param name="ct">Checked before each page is read.</param>
+    /// <returns>The normalized text of all pages.</returns>
+    public Task<string> ExtractPdfAsync(Stream stream, CancellationToken ct)
+    {
+        var rawText = new StringBuilder();
+
+        using (var pdf = PdfDocument.Open(stream))
+        {
+            foreach (var page in pdf.GetPages())
+            {
+                ct.ThrowIfCancellationRequested();
+                rawText.Append(page.Text).AppendLine().AppendLine();
+            }
+        }
+
+        var normalized = Normalize(rawText.ToString());
+        return Task.FromResult(normalized);
+    }
+
+    /// <summary>
+    /// Collapses every run of whitespace in <paramref name="value"/> into a single space and
+    /// removes leading and trailing whitespace.
+    /// </summary>
+    /// <param name="value">Text to normalize.</param>
+    /// <returns>The normalized text, or an empty string when the input is null or blank.</returns>
+    public string Normalize(string value)
+    {
+        if (string.IsNullOrWhiteSpace(value))
+        {
+            return string.Empty;
+        }
+
+        // A null separator splits on whitespace; the tokens therefore contain none, and the
+        // joined result has no leading or trailing whitespace left to trim.
+        var tokens = value.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries);
+        return string.Join(' ', tokens);
+    }
+}