This commit is contained in:
@@ -0,0 +1,8 @@
|
||||
using Rag.Models;
|
||||
|
||||
namespace Api.Services.Contracts;
|
||||
|
||||
/// <summary>
/// Contract for a component that assigns a document type, a confidence value and a title
/// to a piece of text.
/// </summary>
public interface IDocumentClassifier
{
    /// <summary>
    /// Classifies <paramref name="text"/> and produces a <see cref="DocumentClassification"/>.
    /// </summary>
    /// <param name="text">The document text to classify.</param>
    /// <param name="providedType">Optional document type supplied by the caller.</param>
    /// <param name="providedTitle">Optional title supplied by the caller.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>A task yielding the classification result.</returns>
    Task<DocumentClassification> ClassifyAsync(
        string text,
        string? providedType,
        string? providedTitle,
        CancellationToken ct);
}
|
||||
@@ -0,0 +1,12 @@
|
||||
using Rag.Models.Requests;
|
||||
using Rag.Models.Responses;
|
||||
|
||||
namespace Api.Services.Contracts;
|
||||
|
||||
/// <summary>
/// Application-level contract for indexing documents (plain text or PDF uploads),
/// searching them and retrieving a single stored document.
/// </summary>
public interface IRagService
{
    /// <summary>Indexes a document whose text is carried by <paramref name="request"/>.</summary>
    /// <param name="request">The indexing request.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>A task yielding a description of the indexed document.</returns>
    Task<IndexDocumentResponse> IndexTextAsync(
        IndexDocumentRequest request,
        CancellationToken ct);

    /// <summary>Indexes an uploaded PDF file.</summary>
    /// <param name="file">The uploaded file.</param>
    /// <param name="documentType">Optional document type supplied by the caller.</param>
    /// <param name="title">Optional title supplied by the caller.</param>
    /// <param name="sourceUrl">Optional URL the document originates from.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>A task yielding a description of the indexed document.</returns>
    Task<IndexDocumentResponse> IndexPdfAsync(
        IFormFile file,
        string? documentType,
        string? title,
        string? sourceUrl,
        CancellationToken ct);

    /// <summary>Runs a search described by <paramref name="request"/>.</summary>
    /// <param name="request">The search request.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>A task yielding the search results.</returns>
    Task<SearchResponse> SearchAsync(
        SearchRequest request,
        CancellationToken ct);

    /// <summary>Looks up a single document by its identifier.</summary>
    /// <param name="documentId">Identifier of the document to fetch.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>
    /// A task yielding the document details; the result type is nullable, so callers must
    /// be prepared for <c>null</c>.
    /// </returns>
    Task<RagDocumentDetailsResponse?> GetDocumentAsync(
        string documentId,
        CancellationToken ct);
}
|
||||
@@ -0,0 +1,6 @@
|
||||
namespace Api.Services.Contracts;
|
||||
|
||||
/// <summary>
/// Contract for splitting a text into a sequence of chunks.
/// </summary>
public interface ITextChunker
{
    /// <summary>Splits <paramref name="text"/> into chunks.</summary>
    /// <param name="text">The text to split.</param>
    /// <param name="chunkSize">Requested chunk size.</param>
    /// <param name="overlap">Requested overlap between consecutive chunks.</param>
    /// <returns>The chunks as a read-only list.</returns>
    IReadOnlyList<string> Chunk(
        string text,
        int chunkSize,
        int overlap);
}
|
||||
@@ -0,0 +1,7 @@
|
||||
namespace Api.Services.Contracts;
|
||||
|
||||
/// <summary>
/// Contract for obtaining text from a PDF stream and for normalizing text values.
/// </summary>
public interface ITextExtractor
{
    /// <summary>Extracts the text of a PDF read from <paramref name="stream"/>.</summary>
    /// <param name="stream">Stream containing the PDF content.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>A task yielding the extracted text.</returns>
    Task<string> ExtractPdfAsync(
        Stream stream,
        CancellationToken ct);

    /// <summary>Returns a normalized form of <paramref name="value"/>.</summary>
    /// <param name="value">The text to normalize.</param>
    /// <returns>The normalized text.</returns>
    string Normalize(string value);
}
|
||||
@@ -0,0 +1,65 @@
|
||||
using System.Text.RegularExpressions;
|
||||
using Api.Services.Contracts;
|
||||
using Rag.Models;
|
||||
|
||||
namespace Api.Services;
|
||||
|
||||
/// <summary>
/// Keyword-based <see cref="IDocumentClassifier"/>.
/// When the caller supplies a type it is normalized and used as-is; otherwise the text is
/// scanned for per-type keyword phrases and the type with the most distinct hits wins.
/// </summary>
public sealed class DocumentClassifier : IDocumentClassifier
{
    /// <summary>Type reported when no keyword matches or the supplied type normalizes to nothing.</summary>
    private const string UnknownType = "unknown";

    private static readonly HashSet<string> KnownTypes = new(StringComparer.OrdinalIgnoreCase)
    {
        "cv", "job", "article", "contract", "invoice", "product", "documentation", "unknown"
    };

    /// <summary>
    /// Keyword phrases per document type. This is an ordered array on purpose: when two types
    /// score the same, the one listed first wins, so tie-breaking does not depend on the
    /// (undefined) enumeration order of a dictionary.
    /// </summary>
    private static readonly (string Type, string[] Terms)[] KeywordProfiles =
    {
        ("cv", new[] { "curriculum vitae", "resume", "work experience", "professional experience", "education", "skills", "technologies", "linkedin", "github" }),
        ("job", new[] { "job description", "requirements", "responsibilities", "qualifications", "apply", "we are looking", "salary", "benefits", "remote", "hybrid" }),
        ("contract", new[] { "agreement", "contract", "party", "parties", "liability", "termination", "confidentiality", "governing law" }),
        ("invoice", new[] { "invoice", "vat", "subtotal", "total", "amount due", "due date", "billing" }),
        ("documentation", new[] { "api", "endpoint", "configuration", "install", "usage", "parameters", "response", "request" }),
        ("product", new[] { "features", "pricing", "sku", "product", "specification", "warranty" }),
    };

    /// <summary>Matches every character that is not allowed in a normalized type name.</summary>
    private static readonly Regex InvalidTypeChars = new("[^a-z0-9_-]", RegexOptions.Compiled);

    /// <summary>
    /// Classifies <paramref name="text"/>.
    /// </summary>
    /// <param name="text">The document text; a null value is treated as empty.</param>
    /// <param name="providedType">
    /// Optional caller-supplied type. When non-blank it is normalized and returned with
    /// confidence 1.0 if it is a known type other than "unknown", otherwise 0.6.
    /// </param>
    /// <param name="providedTitle">Optional caller-supplied title; used (trimmed) when non-blank.</param>
    /// <param name="ct">Accepted for interface compatibility; not observed because the work is synchronous.</param>
    /// <returns>An already-completed task holding the classification.</returns>
    public Task<DocumentClassification> ClassifyAsync(string text, string? providedType, string? providedTitle, CancellationToken ct)
    {
        var content = text ?? string.Empty;

        if (!string.IsNullOrWhiteSpace(providedType))
        {
            var normalized = NormalizeType(providedType);
            var isTrustedType = KnownTypes.Contains(normalized) && normalized != UnknownType;
            return Task.FromResult(new DocumentClassification
            {
                DocumentType = normalized,
                Confidence = isTrustedType ? 1.0 : 0.6,
                Title = BuildTitle(providedTitle, content, normalized)
            });
        }

        var lower = content.ToLowerInvariant();
        var bestType = UnknownType;
        var bestHits = 0;
        foreach (var (type, terms) in KeywordProfiles)
        {
            var hits = CountHits(lower, terms);

            // Strictly greater: on a tie the earlier profile keeps the win.
            if (hits > bestHits)
            {
                bestType = type;
                bestHits = hits;
            }
        }

        // No hit at all -> low fixed confidence; otherwise grow with the hit count, capped at 0.95.
        var confidence = bestHits <= 0 ? 0.25 : Math.Min(0.95, 0.45 + bestHits * 0.08);

        return Task.FromResult(new DocumentClassification
        {
            DocumentType = bestType,
            Confidence = confidence,
            Title = BuildTitle(providedTitle, content, bestType)
        });
    }

    /// <summary>
    /// Counts how many of <paramref name="terms"/> occur in <paramref name="lower"/> as
    /// substrings (ordinal comparison). Each term counts at most once.
    /// </summary>
    private static int CountHits(string lower, string[] terms)
    {
        var hits = 0;
        foreach (var term in terms)
        {
            if (lower.Contains(term, StringComparison.Ordinal))
            {
                hits++;
            }
        }

        return hits;
    }

    /// <summary>
    /// Trims and lower-cases <paramref name="value"/> and replaces every character outside
    /// <c>a-z</c>, <c>0-9</c>, <c>_</c> and <c>-</c> with a dash.
    /// </summary>
    private static string NormalizeType(string value)
    {
        var cleaned = InvalidTypeChars.Replace(value.Trim().ToLowerInvariant(), "-");
        return string.IsNullOrWhiteSpace(cleaned) ? UnknownType : cleaned;
    }

    /// <summary>
    /// Picks a title: the supplied title if non-blank; otherwise the first fragment of the text
    /// (split on '.', line feed and carriage return) longer than 20 characters, cut to at most
    /// 120 characters; otherwise "<paramref name="documentType"/> document".
    /// </summary>
    private static string BuildTitle(string? providedTitle, string text, string documentType)
    {
        if (!string.IsNullOrWhiteSpace(providedTitle))
        {
            return providedTitle.Trim();
        }

        var firstLine = text
            .Split('.', '\n', '\r')
            .Select(fragment => fragment.Trim())
            .FirstOrDefault(fragment => fragment.Length > 20);

        if (!string.IsNullOrWhiteSpace(firstLine))
        {
            return firstLine.Length <= 120 ? firstLine : firstLine[..120];
        }

        return $"{documentType} document";
    }
}
|
||||
@@ -0,0 +1,182 @@
|
||||
using System.Text.Json;
|
||||
using Microsoft.Extensions.Options;
|
||||
using Api.Services.Contracts;
|
||||
using Rag.Models.Requests;
|
||||
using Rag.Models.Responses;
|
||||
using Rag.Models.Settings;
|
||||
using Api.Data.Repositories.Contracts;
|
||||
using Api.Clients.Ai.Contracts;
|
||||
using Rag.Models;
|
||||
using CommonHelpers;
|
||||
|
||||
namespace Api.Services;
|
||||
|
||||
/// <summary>
/// Default <see cref="IRagService"/>. Indexing normalizes the text, classifies it, splits it
/// into chunks, creates an embedding for every chunk through <see cref="IAiClient"/> and
/// stores the document with its chunks through <see cref="IRagRepository"/>. Searching embeds
/// the query and asks the repository for the best matching chunks.
/// </summary>
public sealed class RagService : IRagService
{
    /// <summary>Documents with fewer characters than this (after normalization) are rejected.</summary>
    private const int MinDocumentChars = 40;

    /// <summary>Search queries with fewer characters than this (after normalization) are rejected.</summary>
    private const int MinQueryChars = 10;

    /// <summary>Maximum number of matched chunks reported per document in a search result.</summary>
    private const int MaxMatchedChunksPerDocument = 3;

    private readonly ITextExtractor _textExtractor;
    private readonly ITextChunker _chunker;
    private readonly IDocumentClassifier _classifier;
    private readonly IAiClient _ai;
    private readonly IRagRepository _repository;
    private readonly RagSettings _settings;

    /// <summary>Creates the service from its collaborators and the bound <see cref="RagSettings"/>.</summary>
    public RagService(
        ITextExtractor textExtractor,
        ITextChunker chunker,
        IDocumentClassifier classifier,
        IAiClient ai,
        IRagRepository repository,
        IOptions<RagSettings> options)
    {
        _textExtractor = textExtractor;
        _chunker = chunker;
        _classifier = classifier;
        _ai = ai;
        _repository = repository;
        _settings = options.Value;
    }

    /// <summary>
    /// Indexes the text carried by <paramref name="request"/>. The text is normalized and cut to
    /// the configured maximum length before indexing.
    /// </summary>
    /// <param name="request">The indexing request; a null <c>Text</c> is treated as empty.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>A description of the indexed (or already stored) document.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
    /// <exception cref="InvalidOperationException">The normalized text is shorter than 40 characters.</exception>
    public async Task<IndexDocumentResponse> IndexTextAsync(IndexDocumentRequest request, CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(request);

        var text = _textExtractor.Normalize(request.Text ?? string.Empty);
        if (text.Length < MinDocumentChars)
        {
            throw new InvalidOperationException("Document text is too short.");
        }

        text = TruncateToMaxChars(text);
        return await IndexNormalizedTextAsync(text, request.DocumentType, request.Title, request.SourceUrl, request.Metadata, ct);
    }

    /// <summary>
    /// Indexes an uploaded PDF. The file must be non-empty, within the configured size limit and
    /// have a <c>.pdf</c> file-name extension (only the name is checked, not the content).
    /// </summary>
    /// <param name="file">The uploaded file.</param>
    /// <param name="documentType">Optional caller-supplied document type.</param>
    /// <param name="title">Optional title; when null or blank the file name is used instead.</param>
    /// <param name="sourceUrl">Optional URL the document originates from.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>A description of the indexed (or already stored) document.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="file"/> is null.</exception>
    /// <exception cref="InvalidOperationException">
    /// The file is empty, too large, not named <c>*.pdf</c>, or yields fewer than 40 characters of text.
    /// </exception>
    public async Task<IndexDocumentResponse> IndexPdfAsync(IFormFile file, string? documentType, string? title, string? sourceUrl, CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(file);

        if (file.Length <= 0)
        {
            throw new InvalidOperationException("Uploaded file is empty.");
        }

        if (file.Length > _settings.MaxFileSizeMb * 1024L * 1024L)
        {
            throw new InvalidOperationException($"File is too large. Max size is {_settings.MaxFileSizeMb} MB.");
        }

        if (!string.Equals(Path.GetExtension(file.FileName), ".pdf", StringComparison.OrdinalIgnoreCase))
        {
            throw new InvalidOperationException("Only PDF files are supported by this endpoint.");
        }

        await using var stream = file.OpenReadStream();
        var text = TruncateToMaxChars(await _textExtractor.ExtractPdfAsync(stream, ct));
        if (text.Length < MinDocumentChars)
        {
            throw new InvalidOperationException("Could not extract enough text from the PDF.");
        }

        // A blank title carries no information, so fall back to the file name for it as well as for null.
        var effectiveTitle = string.IsNullOrWhiteSpace(title) ? file.FileName : title;
        var metadata = new Dictionary<string, string> { ["fileName"] = file.FileName };
        return await IndexNormalizedTextAsync(text, documentType, effectiveTitle, sourceUrl, metadata, ct);
    }

    /// <summary>
    /// Embeds the normalized query, asks the repository for the best matching chunks and groups
    /// them by document. Each document reports its best chunk score and up to three chunks.
    /// </summary>
    /// <param name="request">The search request; a null <c>QueryText</c> is treated as empty.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>Documents ordered by descending score, at most <c>topK</c> of them.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
    /// <exception cref="InvalidOperationException">The normalized query is shorter than 10 characters.</exception>
    public async Task<SearchResponse> SearchAsync(SearchRequest request, CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(request);

        var query = _textExtractor.Normalize(request.QueryText ?? string.Empty);
        if (query.Length < MinQueryChars)
        {
            throw new InvalidOperationException("Search query is too short.");
        }

        // Math.Max keeps the upper bound valid even if MaxTopK is configured below 1.
        var topK = Math.Clamp(request.TopK ?? _settings.DefaultTopK, 1, Math.Max(1, _settings.MaxTopK));
        var queryEmbedding = await _ai.CreateEmbeddingAsync(query, ct);

        // NOTE: topK is passed to the repository as well; because the returned chunks are then
        // grouped by document, the response can contain fewer than topK documents.
        var candidates = await _repository.SearchChunksAsync(queryEmbedding, request.TargetDocumentTypes, topK, ct);

        var results = candidates
            .GroupBy(candidate => candidate.Document.Id)
            .Select(group =>
            {
                // Order once; the first entry is both the representative and the document score.
                var ranked = group.OrderByDescending(candidate => candidate.Score).ToList();
                var best = ranked[0];
                return new SearchDocumentResult
                {
                    DocumentId = best.Document.Id,
                    DocumentType = best.Document.DocumentType,
                    Title = best.Document.Title,
                    SourceUrl = best.Document.SourceUrl,
                    Score = best.Score,
                    MatchedChunks = ranked
                        .Take(MaxMatchedChunksPerDocument)
                        .Select(candidate => new SearchChunkResult
                        {
                            ChunkId = candidate.Chunk.Id,
                            ChunkIndex = candidate.Chunk.ChunkIndex,
                            Text = candidate.Chunk.Text,
                            Score = candidate.Score
                        })
                        .ToList()
                };
            })
            .OrderByDescending(result => result.Score)
            .Take(topK)
            .ToList();

        return new SearchResponse { Results = results };
    }

    /// <summary>Fetches a stored document by its identifier.</summary>
    /// <param name="documentId">Identifier of the document.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>The document details, or <c>null</c> when the repository returns no document.</returns>
    public async Task<RagDocumentDetailsResponse?> GetDocumentAsync(string documentId, CancellationToken ct)
    {
        var document = await _repository.GetDocumentByIdAsync(documentId, ct);
        if (document is null)
        {
            return null;
        }

        return new RagDocumentDetailsResponse
        {
            Id = document.Id,
            DocumentType = document.DocumentType,
            Title = document.Title,
            SourceUrl = document.SourceUrl,
            Text = document.Text,
            TextHash = document.TextHash,
            CreatedAt = document.CreatedAt
        };
    }

    /// <summary>Cuts <paramref name="text"/> to the configured <c>MaxTextChars</c> when it is longer.</summary>
    private string TruncateToMaxChars(string text) =>
        text.Length > _settings.MaxTextChars ? text[.._settings.MaxTextChars] : text;

    /// <summary>
    /// Shared indexing path. Returns the already stored document when the repository finds one
    /// for the same text hash and source URL; otherwise classifies, chunks, embeds and saves.
    /// </summary>
    /// <param name="text">Normalized, length-checked document text.</param>
    /// <param name="documentType">Optional caller-supplied document type.</param>
    /// <param name="title">Optional caller-supplied title.</param>
    /// <param name="sourceUrl">Optional URL the document originates from.</param>
    /// <param name="metadata">Optional metadata; when null the classifier's metadata is stored instead.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    private async Task<IndexDocumentResponse> IndexNormalizedTextAsync(
        string text,
        string? documentType,
        string? title,
        string? sourceUrl,
        Dictionary<string, string>? metadata,
        CancellationToken ct)
    {
        var textHash = HashHelper.Compute(text);

        var cached = await _repository.GetDocumentByTextHashAsync(textHash, sourceUrl, ct);
        if (cached is not null)
        {
            // Nothing is re-chunked or re-embedded for a cache hit, hence Chunks = 0.
            return new IndexDocumentResponse
            {
                DocumentId = cached.Id,
                TextHash = cached.TextHash,
                DocumentType = cached.DocumentType,
                DocumentTypeConfidence = cached.TypeConfidence,
                Title = cached.Title,
                Chunks = 0,
                Characters = cached.Text.Length,
                Cached = true
            };
        }

        var classification = await _classifier.ClassifyAsync(text, documentType, title, ct);
        var chunks = _chunker.Chunk(text, _settings.ChunkSize, _settings.ChunkOverlap);

        var document = new RagDocumentRecord
        {
            Id = Guid.NewGuid().ToString("N"),
            DocumentType = classification.DocumentType,
            Title = classification.Title,
            SourceUrl = sourceUrl,
            Text = text,
            TextHash = textHash,
            TypeConfidence = classification.Confidence,
            MetadataJson = JsonSerializer.Serialize(metadata ?? classification.Metadata),
            CreatedAt = DateTimeOffset.UtcNow
        };

        // Embeddings are requested one chunk at a time, in chunk order.
        var records = new List<RagChunkRecord>(chunks.Count);
        for (var index = 0; index < chunks.Count; index++)
        {
            ct.ThrowIfCancellationRequested();

            var chunkText = chunks[index];
            records.Add(new RagChunkRecord
            {
                Id = Guid.NewGuid().ToString("N"),
                DocumentId = document.Id,
                ChunkIndex = index,
                Text = chunkText,
                Embedding = await _ai.CreateEmbeddingAsync(chunkText, ct)
            });
        }

        await _repository.SaveDocumentAsync(document, records, ct);

        return new IndexDocumentResponse
        {
            DocumentId = document.Id,
            TextHash = document.TextHash,
            DocumentType = document.DocumentType,
            DocumentTypeConfidence = document.TypeConfidence,
            Title = document.Title,
            Chunks = records.Count,
            Characters = text.Length,
            Cached = false
        };
    }
}
|
||||
@@ -0,0 +1,24 @@
|
||||
using Api.Services.Contracts;
|
||||
|
||||
namespace Api.Services;
|
||||
|
||||
/// <summary>
/// Splits text into fixed-size, optionally overlapping character windows.
/// </summary>
public sealed class TextChunker : ITextChunker
{
    private const int MinChunkSize = 300;
    private const int MaxChunkSize = 3000;

    /// <summary>
    /// Splits <paramref name="text"/> into trimmed chunks of at most <paramref name="chunkSize"/>
    /// characters, each starting <c>chunkSize - overlap</c> characters after the previous one.
    /// </summary>
    /// <param name="text">The text to split; null or blank text yields an empty list.</param>
    /// <param name="chunkSize">Requested window size; clamped to the range 300..3000.</param>
    /// <param name="overlap">
    /// Requested overlap between consecutive windows; clamped to 0..chunkSize/2, which also
    /// guarantees that the window always advances.
    /// </param>
    /// <returns>The non-blank chunks in text order.</returns>
    public IReadOnlyList<string> Chunk(string text, int chunkSize, int overlap)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return [];
        }

        chunkSize = Math.Clamp(chunkSize, MinChunkSize, MaxChunkSize);
        overlap = Math.Clamp(overlap, 0, chunkSize / 2);
        var step = chunkSize - overlap;

        var chunks = new List<string>();
        for (var start = 0; start < text.Length; start += step)
        {
            var length = Math.Min(chunkSize, text.Length - start);
            var chunk = text.Substring(start, length).Trim();
            if (chunk.Length > 0)
            {
                chunks.Add(chunk);
            }

            // This window already reaches the end of the text. Advancing by `step` would only
            // produce a tail that is fully contained in the chunk just emitted, so stop here.
            if (start + length >= text.Length)
            {
                break;
            }
        }

        return chunks;
    }
}
|
||||
@@ -0,0 +1,27 @@
|
||||
using System.Text;
|
||||
using Api.Services.Contracts;
|
||||
using UglyToad.PdfPig;
|
||||
|
||||
namespace Api.Services;
|
||||
|
||||
/// <summary>
/// <see cref="ITextExtractor"/> that reads PDF text with PdfPig and collapses whitespace.
/// </summary>
public sealed class TextExtractor : ITextExtractor
{
    /// <summary>
    /// Reads every page of the PDF in <paramref name="stream"/>, concatenates the page texts
    /// and returns the normalized result.
    /// </summary>
    /// <param name="stream">Stream containing the PDF.</param>
    /// <param name="ct">Checked before each page is read.</param>
    /// <returns>An already-completed task holding the normalized text.</returns>
    /// <remarks>
    /// The work runs synchronously, so failures (including cancellation) are thrown from this
    /// call itself rather than being stored in the returned task.
    /// </remarks>
    public Task<string> ExtractPdfAsync(Stream stream, CancellationToken ct)
    {
        var raw = new StringBuilder();

        using (var pdf = PdfDocument.Open(stream))
        {
            foreach (var page in pdf.GetPages())
            {
                ct.ThrowIfCancellationRequested();

                // Page text followed by an empty line as a page separator.
                raw.AppendLine(page.Text).AppendLine();
            }
        }

        var normalized = Normalize(raw.ToString());
        return Task.FromResult(normalized);
    }

    /// <summary>
    /// Collapses every run of whitespace in <paramref name="value"/> into a single space and
    /// removes leading and trailing whitespace.
    /// </summary>
    /// <param name="value">The text to normalize; null or blank input yields an empty string.</param>
    /// <returns>The normalized text.</returns>
    public string Normalize(string value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return string.Empty;
        }

        // A null separator array makes Split break on all white-space characters.
        var words = value.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries);
        return string.Join(' ', words).Trim();
    }
}
|
||||
Reference in New Issue
Block a user