// myAi/Apis/page-fetcher-api/Services/PageFetcherService.cs

using System.Diagnostics;
using System.Net;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Options;
using Microsoft.Playwright;
using PageFetcher.Data;
using PageFetcher.Data.Entities;
using PageFetcher.Models;
using PageFetcher.Models.Settings;

namespace Api.Services;

/// <summary>
/// Fetches a web page via Playwright, extracts plain text, persists the result to the database,
/// and returns a <see cref="FetchPageResponse"/>.
/// </summary>
public sealed class PageFetcherService
{
    /// <summary>
    /// Lower bound applied to <see cref="PageFetcherSettings.MaxTextChars"/> when truncating extracted text.
    /// </summary>
    private const int MinTextChars = 4_000;

    // The patterns never change, so they are built once and shared across calls.
    private static readonly Regex ScriptBlockRegex =
        new("<script[\\s\\S]*?</script>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private static readonly Regex StyleBlockRegex =
        new("<style[\\s\\S]*?</style>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private static readonly Regex TagRegex =
        new("<[^>]+>", RegexOptions.Compiled);

    private readonly PlaywrightBrowserService _browserService;
    private readonly PageFetchDbContext _db;
    private readonly PageFetcherSettings _settings;
    private readonly ILogger<PageFetcherService> _logger;

    /// <summary>
    /// Creates the service from its injected dependencies.
    /// </summary>
    /// <param name="browserService">Provides the Playwright browser used to open pages.</param>
    /// <param name="db">Database context that fetch records are written to.</param>
    /// <param name="settings">Options wrapper; its <c>Value</c> is captured at construction time.</param>
    /// <param name="logger">Logger for fetch diagnostics.</param>
    public PageFetcherService(
        PlaywrightBrowserService browserService,
        PageFetchDbContext db,
        IOptions<PageFetcherSettings> settings,
        ILogger<PageFetcherService> logger)
    {
        _browserService = browserService;
        _db = db;
        _settings = settings.Value;
        _logger = logger;
    }

    /// <summary>
    /// Fetches the page at the URL carried by <paramref name="request"/> using Playwright,
    /// saves a fetch record, and returns the HTML and extracted text.
    /// Returns a failed response (with <see cref="FetchPageResponse.Success"/> = false) rather than throwing
    /// when opening, navigating, or reading the page fails. A navigation timeout is not treated as a failure:
    /// whatever content the page has at that point is returned with a status code of 0.
    /// </summary>
    /// <param name="request">The fetch request (URL, optional wait state, caller metadata).</param>
    /// <param name="ct">Cancellation token passed to the database save.</param>
    /// <returns>The fetch outcome, including the final page URL, HTML, and extracted text.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
    /// <remarks>
    /// Exceptions thrown while saving the fetch record are not caught here and propagate to the caller.
    /// </remarks>
    public async Task<FetchPageResponse> FetchAsync(FetchPageRequest request, CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(request);

        var sw = Stopwatch.StartNew();
        string html = string.Empty;
        string text = string.Empty;
        int? statusCode = null;
        bool success = false;
        string? errorMessage = null;
        string finalUrl = request.Url;

        try
        {
            var page = await _browserService.Browser.NewPageAsync();
            await using var _ = page.ConfigureAwait(false);

            var waitUntil = ResolveWaitUntil(request.WaitFor);

            IResponse? response;
            try
            {
                response = await page.GotoAsync(request.Url, new PageGotoOptions
                {
                    WaitUntil = waitUntil,
                    Timeout = _settings.TimeoutSeconds * 1_000 // option is expressed in milliseconds
                });
            }
            catch (TimeoutException)
            {
                // Best effort: the wait condition was not reached in time, but the page may still
                // hold usable content, so carry on and read whatever has been rendered.
                _logger.LogWarning("Playwright {WaitUntil} timeout for {Url}, using partial content", waitUntil, request.Url);
                response = null;
            }

            statusCode = response?.Status;
            finalUrl = page.Url;
            html = await page.ContentAsync();
            text = ExtractText(html);
            success = true;

            _logger.LogInformation("Fetched {Url} → HTTP {Status} | HTML {HtmlLen} chars | text {TextLen} chars | {DurationMs} ms",
                request.Url, statusCode?.ToString() ?? "timeout", html.Length, text.Length, sw.ElapsedMilliseconds);
        }
        catch (Exception ex)
        {
            errorMessage = ex.Message;
            _logger.LogError(ex, "Failed to fetch {Url}", request.Url);
        }
        finally
        {
            sw.Stop();
        }

        // Persist a fetch record for both successful and failed attempts.
        var entity = new PageFetchEntity
        {
            Id = Guid.NewGuid().ToString("N"),
            Url = request.Url,
            CallerService = request.CallerService ?? string.Empty,
            JobSearchSessionId = request.JobSearchSessionId,
            HttpStatusCode = statusCode,
            Html = html,
            Text = text,
            DurationMs = sw.ElapsedMilliseconds,
            Success = success,
            ErrorMessage = errorMessage
        };

        _db.PageFetches.Add(entity);
        await _db.SaveChangesAsync(ct);

        return new FetchPageResponse
        {
            Url = finalUrl,
            StatusCode = statusCode ?? 0,
            Html = html,
            Text = text,
            Success = success,
            Error = errorMessage
        };
    }

    /// <summary>
    /// Maps the request's wait-state name (case-insensitive) to a Playwright <see cref="WaitUntilState"/>.
    /// <c>null</c> and unrecognised values fall back to <see cref="WaitUntilState.NetworkIdle"/>.
    /// </summary>
    private static WaitUntilState ResolveWaitUntil(string? waitFor) =>
        waitFor?.ToLowerInvariant() switch
        {
            "load" => WaitUntilState.Load,
            "domcontentloaded" => WaitUntilState.DOMContentLoaded,
            _ => WaitUntilState.NetworkIdle
        };

    /// <summary>
    /// Strips script/style blocks and all HTML tags from raw HTML, decodes HTML entities,
    /// collapses every run of whitespace to a single space, and truncates the result.
    /// The length limit is <see cref="PageFetcherSettings.MaxTextChars"/>, but never less than
    /// <see cref="MinTextChars"/>.
    /// </summary>
    /// <param name="html">Raw page HTML.</param>
    /// <returns>The extracted plain text, or an empty string when <paramref name="html"/> is null or blank.</returns>
    private string ExtractText(string html)
    {
        if (string.IsNullOrWhiteSpace(html))
        {
            return string.Empty;
        }

        var text = ScriptBlockRegex.Replace(html, " ");
        text = StyleBlockRegex.Replace(text, " ");
        text = TagRegex.Replace(text, " ");
        text = WebUtility.HtmlDecode(text);

        // Splitting on a null separator splits on all whitespace; re-joining the non-empty tokens
        // leaves no leading, trailing, or repeated whitespace, so no further trimming is needed.
        text = string.Join(' ', text.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries));

        var max = Math.Max(MinTextChars, _settings.MaxTextChars);
        if (text.Length <= max)
        {
            return text;
        }

        // Do not cut a UTF-16 surrogate pair in half: a dangling high surrogate is not valid text.
        if (char.IsHighSurrogate(text[max - 1]))
        {
            max--;
        }

        return text[..max];
    }
}