using System.Diagnostics;
using System.Net;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Options;
using Microsoft.Playwright;
using PageFetcher.Data;
using PageFetcher.Data.Entities;
using PageFetcher.Models;
using PageFetcherApi.Settings;

namespace PageFetcherApi.Services;

/// <summary>
/// Fetches a web page via Playwright, extracts plain text, persists the result to the database,
/// and returns a <see cref="FetchPageResponse"/>.
/// </summary>
public sealed class PageFetcherService
{
    /// <summary>
    /// Lower bound applied to <see cref="PageFetcherSettings.MaxTextChars"/> when truncating
    /// extracted text; a smaller configured value is raised to this floor.
    /// </summary>
    private const int MinTextChars = 4_000;

    // Matches a complete <script ...>...</script> element, including its content.
    // [\s\S] is used instead of '.' so the match spans line breaks.
    private static readonly Regex ScriptBlockRegex = new(
        @"<script\b[^>]*>[\s\S]*?</script\s*>",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Matches a complete <style ...>...</style> element, including its content.
    private static readonly Regex StyleBlockRegex = new(
        @"<style\b[^>]*>[\s\S]*?</style\s*>",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Matches any remaining single HTML tag.
    private static readonly Regex TagRegex = new("<[^>]+>", RegexOptions.Compiled);

    private readonly PlaywrightBrowserService _browserService;
    private readonly PageFetchDbContext _db;
    private readonly PageFetcherSettings _settings;
    private readonly ILogger<PageFetcherService> _logger;

    /// <summary>
    /// Creates the service.
    /// </summary>
    /// <param name="browserService">Provides the Playwright browser used to open pages.</param>
    /// <param name="db">Database context that fetch records are written to.</param>
    /// <param name="settings">Options wrapper; its <c>Value</c> is read once here.</param>
    /// <param name="logger">Logger for fetch results and failures.</param>
    public PageFetcherService(
        PlaywrightBrowserService browserService,
        PageFetchDbContext db,
        IOptions<PageFetcherSettings> settings,
        ILogger<PageFetcherService> logger)
    {
        _browserService = browserService;
        _db = db;
        _settings = settings.Value;
        _logger = logger;
    }

    /// <summary>
    /// Fetches the page at <see cref="FetchPageRequest.Url"/> using Playwright, saves the fetch record,
    /// and returns the HTML and extracted text.
    /// Returns a failed response (with <see cref="FetchPageResponse.Success"/> = false) rather than throwing
    /// on network or navigation errors.
    /// </summary>
    /// <param name="request">The page to fetch, the wait condition and the calling service name.</param>
    /// <param name="ct">Cancellation token passed to the database save.</param>
    /// <returns>
    /// The fetch result. <see cref="FetchPageResponse.StatusCode"/> is 0 when no HTTP response
    /// was obtained (navigation timeout or failure).
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
    public async Task<FetchPageResponse> FetchAsync(FetchPageRequest request, CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(request);

        var sw = Stopwatch.StartNew();
        string html = string.Empty;
        string text = string.Empty;
        int? statusCode = null;
        bool success = false;
        string? errorMessage = null;
        string finalUrl = request.Url;

        try
        {
            var page = await _browserService.Browser.NewPageAsync();
            await using var _ = page.ConfigureAwait(false);

            // Anything other than "load" / "domcontentloaded" (including null) waits for network idle.
            var waitUntil = request.WaitFor?.ToLowerInvariant() switch
            {
                "load" => WaitUntilState.Load,
                "domcontentloaded" => WaitUntilState.DOMContentLoaded,
                _ => WaitUntilState.NetworkIdle
            };

            IResponse? response;
            try
            {
                response = await page.GotoAsync(request.Url, new PageGotoOptions
                {
                    WaitUntil = waitUntil,
                    Timeout = _settings.TimeoutSeconds * 1_000 // Playwright expects milliseconds
                });
            }
            catch (TimeoutException)
            {
                // A navigation timeout is not treated as a failure: whatever the page has
                // rendered so far is still read below.
                // NOTE(review): the message says "NetworkIdle" regardless of the wait state used.
                _logger.LogWarning("Playwright NetworkIdle timeout for {Url}, using partial content", request.Url);
                response = null;
            }

            statusCode = response?.Status;
            finalUrl = page.Url;
            html = await page.ContentAsync();
            text = ExtractText(html);
            success = true;

            _logger.LogInformation("Fetched {Url} → HTTP {Status} | HTML {HtmlLen} chars | text {TextLen} chars | {DurationMs} ms",
                request.Url, statusCode?.ToString() ?? "timeout", html.Length, text.Length, sw.ElapsedMilliseconds);
        }
        catch (Exception ex)
        {
            // Deliberate best-effort: the failure is recorded and reported through the response.
            errorMessage = ex.Message;
            _logger.LogError(ex, "Failed to fetch {Url}", request.Url);
        }
        finally
        {
            sw.Stop();
        }

        // Persist fetch record (written for failed fetches too).
        var entity = new PageFetchEntity
        {
            Id = Guid.NewGuid().ToString("N"),
            Url = request.Url,
            CallerService = request.CallerService ?? string.Empty,
            HttpStatusCode = statusCode,
            Html = html,
            Text = text,
            DurationMs = sw.ElapsedMilliseconds,
            Success = success,
            ErrorMessage = errorMessage
        };

        _db.PageFetches.Add(entity);
        await _db.SaveChangesAsync(ct);

        return new FetchPageResponse
        {
            Url = finalUrl,
            StatusCode = statusCode ?? 0,
            Html = html,
            Text = text,
            Success = success,
            Error = errorMessage
        };
    }

    /// <summary>
    /// Strips script/style blocks and all HTML tags from raw HTML, decodes HTML entities,
    /// normalises whitespace, and truncates to <see cref="PageFetcherSettings.MaxTextChars"/>
    /// (but never to fewer than <see cref="MinTextChars"/> characters).
    /// </summary>
    /// <param name="html">Raw HTML markup.</param>
    /// <returns>The extracted plain text, or an empty string for null/blank input.</returns>
    private string ExtractText(string html)
    {
        if (string.IsNullOrWhiteSpace(html))
        {
            return string.Empty;
        }

        // Remove script/style elements together with their content first, so that
        // JavaScript and CSS source never end up in the extracted text.
        var text = ScriptBlockRegex.Replace(html, " ");
        text = StyleBlockRegex.Replace(text, " ");
        text = TagRegex.Replace(text, " ");

        // Decode entities only after tag stripping so that escaped markup
        // (e.g. "&lt;b&gt;") is kept as literal text.
        text = WebUtility.HtmlDecode(text);

        // Splitting on a null separator splits on all whitespace; re-joining collapses runs to one space.
        text = string.Join(' ', text.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries)).Trim();

        var max = Math.Max(MinTextChars, _settings.MaxTextChars);
        if (text.Length <= max)
        {
            return text;
        }

        // Do not cut a UTF-16 surrogate pair in half: drop a trailing lone high surrogate.
        var cut = char.IsHighSurrogate(text[max - 1]) ? max - 1 : max;
        return text[..cut];
    }
}