// Listing metadata: 180 lines, 7.2 KiB, C#
using System.Text.Json;
|
|
using Microsoft.Extensions.Options;
|
|
using Api.Services.Contracts;
|
|
using Api.Settings;
|
|
using Api.Responses;
|
|
using Api.Requests;
|
|
using Api.Services.Contracts.Models;
|
|
|
|
namespace Api.Services;
|
|
|
|
/// <summary>
/// Indexes documents (plain text or PDF uploads) into the RAG store and runs
/// embedding-based searches over the stored chunks.
/// </summary>
public sealed class RagService : IRagService
{
    /// <summary>Minimum number of characters a document must have to be indexed.</summary>
    private const int MinDocumentChars = 40;

    /// <summary>Minimum number of characters a normalized search query must have.</summary>
    private const int MinQueryChars = 10;

    /// <summary>Maximum number of matched chunks reported per document in search results.</summary>
    private const int MaxMatchedChunksPerDocument = 3;

    private readonly ITextExtractor _textExtractor;
    private readonly ITextChunker _chunker;
    private readonly IDocumentClassifier _classifier;
    private readonly IAiClient _ai;
    private readonly IRagRepository _repository;
    private readonly RagSettings _settings;

    /// <summary>
    /// Creates the service from its collaborators and the bound <see cref="RagSettings"/>.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="options"/> is <c>null</c>.</exception>
    public RagService(
        ITextExtractor textExtractor,
        ITextChunker chunker,
        IDocumentClassifier classifier,
        IAiClient ai,
        IRagRepository repository,
        IOptions<RagSettings> options)
    {
        ArgumentNullException.ThrowIfNull(options);

        _textExtractor = textExtractor;
        _chunker = chunker;
        _classifier = classifier;
        _ai = ai;
        _repository = repository;
        _settings = options.Value;
    }

    /// <summary>
    /// Normalizes the text carried by <paramref name="request"/>, caps it at
    /// <c>MaxTextChars</c> and indexes it.
    /// </summary>
    /// <param name="request">Indexing request; a missing <c>Text</c> is treated as empty.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>Summary of the indexed (or already stored) document.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
    /// <exception cref="InvalidOperationException">The normalized text is shorter than 40 characters.</exception>
    public async Task<IndexDocumentResponse> IndexTextAsync(IndexDocumentRequest request, CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(request);

        var text = _textExtractor.Normalize(request.Text ?? string.Empty);
        if (text.Length < MinDocumentChars)
        {
            throw new InvalidOperationException("Document text is too short.");
        }

        text = TruncateToMaxChars(text, _settings.MaxTextChars);

        return await IndexNormalizedTextAsync(text, request.DocumentType, request.Title, request.SourceUrl, request.Metadata, ct);
    }

    /// <summary>
    /// Validates an uploaded PDF, extracts its text, caps it at <c>MaxTextChars</c>
    /// and indexes it. The file name is recorded as <c>fileName</c> metadata and is
    /// used as the title when no title is supplied.
    /// </summary>
    /// <param name="file">Uploaded file; must be non-empty, within the size limit and end in <c>.pdf</c>.</param>
    /// <param name="documentType">Optional document type hint passed to the classifier.</param>
    /// <param name="title">Optional title; blank values fall back to the file name.</param>
    /// <param name="sourceUrl">Optional source URL stored with the document.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>Summary of the indexed (or already stored) document.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="file"/> is <c>null</c>.</exception>
    /// <exception cref="InvalidOperationException">
    /// The file is empty, too large, not a PDF, or yields fewer than 40 characters of text.
    /// </exception>
    public async Task<IndexDocumentResponse> IndexPdfAsync(IFormFile file, string? documentType, string? title, string? sourceUrl, CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(file);

        if (file.Length <= 0)
        {
            throw new InvalidOperationException("Uploaded file is empty.");
        }

        // 1024L keeps the multiplication in 64-bit so large limits cannot overflow.
        if (file.Length > _settings.MaxFileSizeMb * 1024L * 1024L)
        {
            throw new InvalidOperationException($"File is too large. Max size is {_settings.MaxFileSizeMb} MB.");
        }

        if (!string.Equals(Path.GetExtension(file.FileName), ".pdf", StringComparison.OrdinalIgnoreCase))
        {
            throw new InvalidOperationException("Only PDF files are supported by this endpoint.");
        }

        await using var stream = file.OpenReadStream();

        // NOTE(review): unlike IndexTextAsync, the extracted text is not passed through
        // Normalize here — presumably ExtractPdfAsync already normalizes; confirm, since
        // the text hash used for de-duplication depends on it.
        var text = TruncateToMaxChars(await _textExtractor.ExtractPdfAsync(stream, ct), _settings.MaxTextChars);
        if (text.Length < MinDocumentChars)
        {
            throw new InvalidOperationException("Could not extract enough text from the PDF.");
        }

        // Form posts commonly send "" for an omitted field, so treat blank like missing.
        var effectiveTitle = string.IsNullOrWhiteSpace(title) ? file.FileName : title;
        var metadata = new Dictionary<string, string> { ["fileName"] = file.FileName };

        return await IndexNormalizedTextAsync(text, documentType, effectiveTitle, sourceUrl, metadata, ct);
    }

    /// <summary>
    /// Embeds the normalized query, fetches the closest chunks from the repository and
    /// groups them into per-document results ordered by best chunk score.
    /// </summary>
    /// <param name="request">Search request; a missing <c>QueryText</c> is treated as empty.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>
    /// Up to <c>topK</c> documents, each with at most three of its best-scoring chunks.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
    /// <exception cref="InvalidOperationException">The normalized query is shorter than 10 characters.</exception>
    public async Task<SearchResponse> SearchAsync(SearchRequest request, CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(request);

        // Same null guard as IndexTextAsync: a missing query becomes "too short"
        // instead of failing inside Normalize.
        var query = _textExtractor.Normalize(request.QueryText ?? string.Empty);
        if (query.Length < MinQueryChars)
        {
            throw new InvalidOperationException("Search query is too short.");
        }

        // Math.Max keeps the clamp range valid even when MaxTopK is configured below 1.
        var topK = Math.Clamp(request.TopK ?? _settings.DefaultTopK, 1, Math.Max(1, _settings.MaxTopK));
        var queryEmbedding = await _ai.CreateEmbeddingAsync(query, ct);
        var candidates = await _repository.SearchChunksAsync(queryEmbedding, request.TargetDocumentTypes, topK, ct);

        var results = candidates
            .GroupBy(x => x.Document.Id)
            .Select(group =>
            {
                // Sort each document's hits once; the first entry is the best match,
                // so its score is also the group's maximum.
                var ranked = group.OrderByDescending(x => x.Score).ToList();
                var best = ranked[0];

                return new SearchDocumentResult
                {
                    DocumentId = best.Document.Id,
                    DocumentType = best.Document.DocumentType,
                    Title = best.Document.Title,
                    SourceUrl = best.Document.SourceUrl,
                    Score = best.Score,
                    MatchedChunks = ranked
                        .Take(MaxMatchedChunksPerDocument)
                        .Select(x => new SearchChunkResult
                        {
                            ChunkId = x.Chunk.Id,
                            ChunkIndex = x.Chunk.ChunkIndex,
                            Text = x.Chunk.Text,
                            Score = x.Score
                        })
                        .ToList()
                };
            })
            .OrderByDescending(x => x.Score)
            .Take(topK)
            .ToList();

        return new SearchResponse { Results = results };
    }

    /// <summary>
    /// Loads a stored document by id and maps it to <see cref="RagDocumentDetails"/>.
    /// </summary>
    /// <param name="documentId">Identifier of the document to load.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>The document details, or <c>null</c> when the repository returns no document.</returns>
    public async Task<RagDocumentDetails?> GetDocumentAsync(string documentId, CancellationToken ct)
    {
        var document = await _repository.GetDocumentByIdAsync(documentId, ct);
        if (document is null)
        {
            return null;
        }

        return new RagDocumentDetails
        {
            Id = document.Id,
            DocumentType = document.DocumentType,
            Title = document.Title,
            SourceUrl = document.SourceUrl,
            Text = document.Text,
            TextHash = document.TextHash,
            CreatedAt = document.CreatedAt
        };
    }

    /// <summary>
    /// Indexes already prepared text: returns the stored document when one with the same
    /// text hash and source URL exists; otherwise classifies, chunks, embeds and saves it.
    /// </summary>
    /// <param name="text">Text to index.</param>
    /// <param name="documentType">Optional document type hint passed to the classifier.</param>
    /// <param name="title">Optional title hint passed to the classifier.</param>
    /// <param name="sourceUrl">Optional source URL, used in the duplicate lookup and stored with the document.</param>
    /// <param name="metadata">Optional metadata; when <c>null</c> the classifier's metadata is stored instead.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>
    /// Summary of the document; for an already stored document <c>Cached</c> is <c>true</c>
    /// and <c>Chunks</c> is reported as 0.
    /// </returns>
    private async Task<IndexDocumentResponse> IndexNormalizedTextAsync(
        string text,
        string? documentType,
        string? title,
        string? sourceUrl,
        Dictionary<string, string>? metadata,
        CancellationToken ct)
    {
        var textHash = HashHelper.Compute(text);

        var cached = await _repository.GetDocumentByTextHashAsync(textHash, sourceUrl, ct);
        if (cached is not null)
        {
            // Nothing is re-chunked or re-embedded for a known document.
            return new IndexDocumentResponse
            {
                DocumentId = cached.Id,
                TextHash = cached.TextHash,
                DocumentType = cached.DocumentType,
                DocumentTypeConfidence = cached.TypeConfidence,
                Title = cached.Title,
                Chunks = 0,
                Characters = cached.Text.Length,
                Cached = true
            };
        }

        var classification = await _classifier.ClassifyAsync(text, documentType, title, ct);
        var chunks = _chunker.Chunk(text, _settings.ChunkSize, _settings.ChunkOverlap);

        var document = new RagDocumentRecord
        {
            Id = Guid.NewGuid().ToString("N"),
            DocumentType = classification.DocumentType,
            Title = classification.Title,
            SourceUrl = sourceUrl,
            Text = text,
            TextHash = textHash,
            TypeConfidence = classification.Confidence,
            MetadataJson = JsonSerializer.Serialize(metadata ?? classification.Metadata),
            CreatedAt = DateTimeOffset.UtcNow
        };

        // Embeddings are requested one chunk at a time, in chunk order.
        var records = new List<RagChunkRecord>(chunks.Count);
        for (var i = 0; i < chunks.Count; i++)
        {
            ct.ThrowIfCancellationRequested();

            records.Add(new RagChunkRecord
            {
                Id = Guid.NewGuid().ToString("N"),
                DocumentId = document.Id,
                ChunkIndex = i,
                Text = chunks[i],
                Embedding = await _ai.CreateEmbeddingAsync(chunks[i], ct)
            });
        }

        await _repository.SaveDocumentAsync(document, records, ct);

        return new IndexDocumentResponse
        {
            DocumentId = document.Id,
            TextHash = document.TextHash,
            DocumentType = document.DocumentType,
            DocumentTypeConfidence = document.TypeConfidence,
            Title = document.Title,
            Chunks = records.Count,
            Characters = text.Length,
            Cached = false
        };
    }

    /// <summary>
    /// Returns <paramref name="text"/> cut to at most <paramref name="maxChars"/> UTF-16
    /// code units, backing off by one when the cut would separate a surrogate pair.
    /// </summary>
    /// <param name="text">Text to shorten.</param>
    /// <param name="maxChars">Maximum length in UTF-16 code units.</param>
    /// <returns>The original string when it already fits, otherwise its truncated prefix.</returns>
    private static string TruncateToMaxChars(string text, int maxChars)
    {
        if (text.Length <= maxChars)
        {
            return text;
        }

        // A high surrogate left at the end would be an invalid lone code unit that
        // then gets hashed, stored and sent for embedding; drop it with its pair.
        var end = maxChars;
        if (end > 0 && char.IsHighSurrogate(text[end - 1]))
        {
            end--;
        }

        return text[..end];
    }
}
|