网站首页 网站源码
website
站点相关全部源代码,隐藏了一些关于服务器的信息
using System.Text.RegularExpressions;
using AngleSharp;
using Dpz.Core.EnumLibrary;
using Dpz.Core.Infrastructure;
using Dpz.Core.Public.ViewModel;
using Dpz.Core.Public.ViewModel.V4;
using Dpz.Core.Service.ObjectStorage.Services;
using Dpz.Core.Service.V4.Services;
using Dpz.Core.Web.Jobs.Services;
using Hangfire;
using Markdig;
using Microsoft.Toolkit.Parsers.Rss;
using MongoDB.Bson;
using ReverseMarkdown;

namespace Dpz.Core.Web.Jobs.Hangfire;

/// <summary>
/// Hangfire job that scrapes the cnBeta home page for article links, downloads every
/// article that is not stored yet, re-hosts its images through the object storage
/// service, converts the body to Markdown and publishes it through the article service.
/// Progress is reported through <see cref="IPushMessage"/>.
/// </summary>
public class CnBetaTaskActivator : JobActivator
{
    /// <summary>
    /// Name of the <see cref="HttpClient"/> requested from the factory for every outgoing request.
    /// </summary>
    private const string HttpClientName = "edge";

    /// <summary>
    /// Maximum number of attempts made to download a single image.
    /// </summary>
    private const int MaxImageDownloadAttempts = 3;

    /// <summary>
    /// Matches hrefs that point at a cnBeta article page.
    /// </summary>
    private static readonly Regex ArticleLinkRegex = new("\\/articles\\/.+/\\d+\\.htm", RegexOptions.Compiled);

    /// <summary>
    /// Matches opening and closing &lt;a&gt; and &lt;span&gt; tags (the tags only, not their content).
    /// The word boundary keeps other tags that merely start with "a" (abbr, audio, ...) intact,
    /// and <c>[^&gt;]*</c> also covers tags whose attributes span several lines.
    /// </summary>
    private static readonly Regex AnchorAndSpanTagRegex = new(@"<\/?(a|span)\b[^>]*>", RegexOptions.Compiled);

    /// <summary>
    /// Logger.
    /// </summary>
    private readonly ILogger<CnBetaTaskActivator> _logger;

    /// <summary>
    /// Article service.
    /// </summary>
    private readonly IArticleService _articleService;

    private readonly IObjectStorageOperation _objectStorageService;
    private readonly IHttpClientFactory _httpClientFactory;
    private readonly IPushMessage _pushMessage;

    public CnBetaTaskActivator(
        ILogger<CnBetaTaskActivator> logger,
        IArticleService articleService,
        IObjectStorageOperation objectStorageService,
        IHttpClientFactory httpClientFactory,
        IPushMessage pushMessage)
    {
        _logger = logger;
        _articleService = articleService;
        _objectStorageService = objectStorageService;
        _httpClientFactory = httpClientFactory;
        _pushMessage = pushMessage;
    }

    /// <summary>
    /// Pushes a single progress message to the cnBeta message channel.
    /// </summary>
    /// <param name="message">Text shown to the listener.</param>
    /// <param name="progress">Progress value attached to the message.</param>
    /// <param name="type">Message category; defaults to <see cref="MessageType.Success"/>.</param>
    private async Task PushProgressMessage(string message, decimal progress, MessageType type = MessageType.Success)
    {
        var progressMessage = new ProgressMessage
        {
            ProgressValues = new[] { progress },
            Message = message,
            Type = type
        };
        await _pushMessage.PushCnBetaMessageAsync(progressMessage);
    }

    /// <summary>
    /// Job entry point: collects the new article links and publishes them in parallel.
    /// Any exception is logged and ends the job without the final "over" message.
    /// </summary>
    [ProlongExpirationTime]
    public async Task Start()
    {
        _logger.LogInformation("CnBeta任务开始");
        try
        {
            var source = await GetHomeSourceAsync();
            _logger.LogInformation("获取到{Count}条数据", source.Count);

            // Carry the position along with each url instead of searching the list for it
            // inside every parallel callback.
            var indexedSource = source.Select((feedUrl, index) => (feedUrl, index));
            await Parallel.ForEachAsync(indexedSource, async (entry, _) =>
            {
                var progress = (entry.index + 1m) / source.Count;
                await PublishArticle(entry.feedUrl, progress);
            });
        }
        catch (Exception exception)
        {
            _logger.LogError(exception, "任务异常结束");
            return;
        }

        await UpdateViewTopAsync();
        await PushProgressMessage("CnBeta任务结束", 1m, MessageType.Over);
        _logger.LogInformation("CnBeta任务结束");
    }

    /// <summary>
    /// Downloads the cnBeta home page and extracts the article links that the article
    /// service reports as not stored yet.
    /// </summary>
    /// <returns>Article urls to publish; empty when the page cannot be fetched or has no new links.</returns>
    private async Task<List<string>> GetHomeSourceAsync()
    {
        string html;
        try
        {
            var httpClient = _httpClientFactory.CreateClient(HttpClientName);
            using var request = new HttpRequestMessage(HttpMethod.Get, "https://www.cnbeta.com.tw");
            using var response = await httpClient.SendAsync(request);
            if (!response.IsSuccessStatusCode)
            {
                // An error page would only yield zero (or bogus) links, so stop here and say why.
                _logger.LogError("get home source fail,response status code:{StatusCode}", response.StatusCode);
                return new List<string>();
            }

            html = await response.Content.ReadAsStringAsync();
        }
        catch (Exception e)
        {
            _logger.LogError(e, "get home source fail");
            return new List<string>();
        }

        if (string.IsNullOrEmpty(html))
        {
            return new List<string>();
        }

        var context = BrowsingContext.New(Configuration.Default);
        var document = await context.OpenAsync(x => x.Content(html));

        var feedUrls = document.GetElementsByTagName("a")
            .Select(x => x.GetAttribute("href"))
            .Where(href => !string.IsNullOrEmpty(href) && ArticleLinkRegex.IsMatch(href))
            // Protocol-relative links ("//host/path") get an explicit https scheme.
            .Select(href => href!.StartsWith("//", StringComparison.Ordinal) ? "https:" + href : href)
            .Distinct()
            .ToList();

        if (feedUrls.Count == 0)
        {
            return feedUrls;
        }

        var noExists = await _articleService.NoExistsByFromAsync(feedUrls);
        return feedUrls.IntersectBy(noExists, x => x).ToList();
    }

    /// <summary>
    /// Reads the cnBeta RSS feed and keeps the entries whose feed url is hosted on cnbeta.com.
    /// </summary>
    /// <returns>The matching feed entries; empty when the feed cannot be read.</returns>
    private async Task<List<RssSchema>> RssSource()
    {
        var httpClient = _httpClientFactory.CreateClient(HttpClientName);
        using var request = new HttpRequestMessage(HttpMethod.Get, "https://www.cnbeta.com/backend.php");
        using var response = await httpClient.SendAsync(request);
        if (!response.IsSuccessStatusCode)
        {
            return new List<RssSchema>();
        }

        var content = await response.Content.ReadAsStringAsync();
        if (string.IsNullOrEmpty(content))
        {
            return new List<RssSchema>();
        }

        var parser = new RssParser();
        return parser.Parse(content)
            // TryCreate also rejects a missing (null) feed url instead of throwing.
            .Where(x => Uri.TryCreate(x.FeedUrl, UriKind.Absolute, out var uri) && IsCnBetaHost(uri.Host))
            .ToList();
    }

    /// <summary>
    /// Tells whether <paramref name="host"/> is cnbeta.com itself or one of its sub-domains.
    /// </summary>
    private static bool IsCnBetaHost(string host)
    {
        // Host names are identifiers, so compare ordinally; the leading dot keeps
        // look-alike domains such as "notcnbeta.com" out.
        return host.Equals("cnbeta.com", StringComparison.OrdinalIgnoreCase)
               || host.EndsWith(".cnbeta.com", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Builds the fixed cnBeta author profile carrying the given display name.
    /// </summary>
    /// <param name="userName">Display name of the author.</param>
    /// <returns>The author profile used when creating the article.</returns>
    private VmUserInfo GetUserInfo(string userName)
    {
        return new()
        {
            Id = "cnBeta",
            Avatar = "https://cdn.dpangzi.com/images/cnbeta.png",
            Sex = Sex.Man,
            Sign = "cnBeta.COM - 简明IT新闻,网友媒体与言论平台",
            Name = userName
        };
    }

    /// <summary>
    /// Downloads one article and, when its content could be extracted, stores it.
    /// </summary>
    /// <param name="feedUrl">Url of the article page.</param>
    /// <param name="progress">Progress value reported with every message about this article.</param>
    private async Task PublishArticle(string feedUrl, decimal progress)
    {
        var article = new VmCreateArticleV4
        {
            From = feedUrl,
            Tags = new() { "cnBeta" },
        };

        var author = await GetArticleContent(article, feedUrl, progress);
        if (author != null)
        {
            await _articleService.CreateArticleAsync(article, author);
        }
    }

    /// <summary>
    /// Converts scraped html to Markdown and renders that Markdown back to html.
    /// </summary>
    /// <param name="html">Html fragment of the article body.</param>
    /// <returns>The Markdown text and the html rendered from it.</returns>
    private (string markdown, string html) HtmlToMarkdown(string html)
    {
        var config = new Config
        {
            UnknownTags = Config.UnknownTagsOption.Bypass,
            GithubFlavored = true,
            RemoveComments = true,
            SmartHrefHandling = true
        };
        var converter = new Converter(config);
        var markdown = converter.Convert(html);

        var htmlResult = Markdown.ToHtml(markdown);

        return (markdown, html: htmlResult);
    }

    /// <summary>
    /// Downloads an article page, fills <paramref name="article"/> with its title, summary,
    /// publish time and body, and replaces every image by a copy uploaded to object storage.
    /// </summary>
    /// <param name="article">Article model to populate.</param>
    /// <param name="url">Url of the article page.</param>
    /// <param name="progress">Progress value reported with every message about this article.</param>
    /// <returns>
    /// The author profile, or <c>null</c> when the page could not be fetched, has no body,
    /// or was published more than seven days ago.
    /// </returns>
    private async Task<VmUserInfo?> GetArticleContent(VmCreateArticleV4 article, string url,
        decimal progress)
    {
        string html;
        try
        {
            // Fetch the html source of the article page.
            var httpClient = _httpClientFactory.CreateClient(HttpClientName);
            using var request = new HttpRequestMessage(HttpMethod.Get, url);
            using var response = await httpClient.SendAsync(request);
            if (!response.IsSuccessStatusCode)
            {
                _logger.LogError("get article fail,response status code:{StatusCode}", response.StatusCode);
                return null;
            }

            html = await response.Content.ReadAsStringAsync();
        }
        catch (Exception e)
        {
            await PushProgressMessage($"获取正文内容失败,{e.Message}", progress, MessageType.Error);
            _logger.LogError(e, "remote {Url} article content fail", url);
            return null;
        }

        if (string.IsNullOrEmpty(html))
        {
            return null;
        }

        // Parse the page, extract the article, then re-host all of its images.
        var context = BrowsingContext.New(Configuration.Default);
        var htmlDoc = await context.OpenAsync(x => x.Content(html));

        var title = htmlDoc.QuerySelector("header.title h1")?.TextContent;
        article.Title = title;
        await PushProgressMessage($"获取《{title}》正文内容", progress);

        article.Introduction = AnchorAndSpanTagRegex.Replace(
            htmlDoc.QuerySelector("div.article-summary p")?.InnerHtml ?? "", "");

        article.PublishTime =
            DateTime.TryParse(htmlDoc.QuerySelector(".meta span")?.TextContent, out var publishDate)
                ? publishDate
                : DateTime.Now;

        var minDateTime = DateTime.Now.AddDays(-7);
        if (article.PublishTime < minDateTime)
        {
            // The article is older than the cut-off, i.e. published EARLIER than minDateTime.
            _logger.LogInformation("发布时间早于{MinDateTime},跳过", minDateTime);
            return null;
        }

        var articleBody = htmlDoc.GetElementById("artibody");
        if (articleBody == null)
        {
            return null;
        }

        // Drop advertisement, related-article and topic blocks. Each collection is copied
        // first so that elements are not removed from a collection that is being enumerated.
        foreach (var className in new[] { "article-global", "article-relation", "article-topic" })
        {
            foreach (var element in articleBody.GetElementsByClassName(className).ToList())
            {
                element.Remove();
            }
        }

        // Snapshot the images and their sources up front: only the downloads run in
        // parallel, every change to the document happens afterwards on a single thread.
        var images = articleBody.GetElementsByTagName("img").ToList();
        var sources = images.Select(x => x.GetAttribute("src")).ToArray();
        var uploadedUrls = new string?[images.Count];

        await Parallel.ForEachAsync(Enumerable.Range(0, images.Count), async (index, _) =>
        {
            await PushProgressMessage($"正在获取该文章第{index + 1}张图片,并上传到数据库。", progress);
            var src = sources[index];
            if (string.IsNullOrEmpty(src))
            {
                return;
            }

            // Every callback writes to its own slot, so no synchronisation is needed.
            uploadedUrls[index] = await DownloadImageToUpyunAsync(src, progress);
        });

        for (var index = 0; index < images.Count; index++)
        {
            var image = images[index];
            if (string.IsNullOrEmpty(sources[index]))
            {
                image.Remove();
                await PushProgressMessage($"该文章第{index + 1}张图片无明确资源地址,已剔除!", progress,
                    MessageType.Info);
                continue;
            }

            var imageUrl = uploadedUrls[index];
            if (string.IsNullOrEmpty(imageUrl))
            {
                // The image could not be re-hosted; drop it from the article.
                image.Remove();
                continue;
            }

            await PushProgressMessage($"image url :{imageUrl}", progress, MessageType.Info);
            image.SetAttribute("src", imageUrl);
        }

        // Remove cnBeta's embedded iframes and videos.
        foreach (var tagName in new[] { "iframe", "video" })
        {
            foreach (var element in articleBody.GetElementsByTagName(tagName).ToList())
            {
                element.Remove();
            }
        }

        await PushProgressMessage("图片替换完成,正在发布文章", progress);

        var articleContent = AnchorAndSpanTagRegex.Replace(articleBody.InnerHtml, "");
        var (markdown, htmlContent) = HtmlToMarkdown(articleContent);
        article.Markdown = markdown;
        article.Content = htmlContent;

        // Author: text of the ".article-author" element without its label, "ugmbbc" when absent.
        var author = (htmlDoc.QuerySelector(".article-author")?.TextContent ?? "ugmbbc").Replace("责任编辑:", "");
        return GetUserInfo(author);
    }

    /// <summary>
    /// Downloads an image and uploads it to object storage, making up to
    /// <see cref="MaxImageDownloadAttempts"/> attempts.
    /// </summary>
    /// <param name="url">Image address; absolute http(s) or protocol-relative ("//host/path").</param>
    /// <param name="progress">Progress value reported with failure messages.</param>
    /// <returns>The access url of the uploaded copy, or <c>null</c> when the image could not be transferred.</returns>
    private async Task<string?> DownloadImageToUpyunAsync(string url, decimal progress)
    {
        if (string.IsNullOrEmpty(url))
        {
            _logger.LogError("url is empty");
            return null;
        }

        // Protocol-relative sources carry no scheme; give them https, as is done for article links.
        var absoluteUrl = url.StartsWith("//", StringComparison.Ordinal) ? "https:" + url : url;

        // Only absolute http(s) addresses can be requested, and Uri.AbsolutePath (used below)
        // throws for relative URIs, so anything else is rejected before any request is made.
        if (!Uri.TryCreate(absoluteUrl, UriKind.Absolute, out var uri)
            || (uri.Scheme != Uri.UriSchemeHttp && uri.Scheme != Uri.UriSchemeHttps))
        {
            _logger.LogError("{Url}未能成功构建Uri", url);
            return null;
        }

        var httpClient = _httpClientFactory.CreateClient(HttpClientName);
        for (var attempt = 1; attempt <= MaxImageDownloadAttempts; attempt++)
        {
            var tryNumber = MaxImageDownloadAttempts - attempt;
            try
            {
                // A request message cannot be sent twice, so each attempt builds its own.
                using var request = new HttpRequestMessage(HttpMethod.Get, uri);
                using var response = await httpClient.SendAsync(request);
                if (!response.IsSuccessStatusCode)
                {
                    if (tryNumber > 0)
                    {
                        continue;
                    }

                    _logger.LogError("download image fail,response status code:{StatusCode}", response.StatusCode);
                    return null;
                }

                await using var stream = await response.Content.ReadAsStreamAsync();
                var extension = Path.GetExtension(uri.AbsolutePath);
                var date = DateTime.Now;
#if DEBUG
                var path = new[]
                {
                    "Test", "images", "CnBeta", date.Year.ToString(), date.Month.ToString(), date.Day.ToString()
                };
#else
                var path = new[] { "images", date.Year.ToString(), date.Month.ToString(), date.Day.ToString() };
#endif
                var result =
                    await _objectStorageService.UploadAsync(stream, path, $"{ObjectId.GenerateNewId()}{extension}");
                return result.AccessUrl;
            }
            catch (Exception e)
            {
                if (tryNumber > 0)
                {
                    continue;
                }

                var message =
                    $"downloading images from the {uri} failed,number of attempts remaining :{tryNumber}";
                // This reports a failure, so it must not go out with the default Success type.
                await PushProgressMessage($"{message},{e.Message}", progress, MessageType.Error);
                _logger.LogError(e, "{Message}", message);
                return null;
            }
        }

        return null;
    }

    /// <summary>
    /// Logs the activation request and delegates to the base activator.
    /// </summary>
    public override object ActivateJob(Type jobType)
    {
        _logger.LogInformation("invoke activate job");
        return base.ActivateJob(jobType);
    }

    /// <summary>
    /// Job entry point that asks the article service to delete old cnBeta articles.
    /// </summary>
    [ProlongExpirationTime]
    public async Task DeleteAsync()
    {
        await _articleService.DeleteOldCnBetaAsync(6, 50);
    }

    /// <summary>
    /// Requests the top articles from the article service. The result is not used here:
    /// the code that wrote it to a cache was disabled, only the service call remains.
    /// </summary>
    private async Task UpdateViewTopAsync()
    {
        _ = await _articleService.GetTopArticlesAsync();
    }
}
⚠⚠    以下内容为AI分析的结果,请根据实际情况进行判断。

上述代码是一个 C# 类 CnBetaTaskActivator,它是一个 Hangfire 任务激活器,主要用于从 cnBeta 网站抓取新闻文章并将其发布到某个系统中。以下是代码的主要功能和结构的详细解释:

1. 类的构造函数

  • 依赖注入:构造函数接受多个服务的实例,包括日志记录器、文章服务、对象存储服务、HTTP 客户端工厂和推送消息服务。这些服务通过依赖注入的方式传入,以便在类中使用。

2. 主要方法

  • Start():这是任务的入口点,负责启动抓取和发布文章的过程。

    • 它首先记录任务开始的信息。
    • 然后调用 GetHomeSourceAsync() 方法获取 cnBeta 网站的文章链接。
    • 使用 Parallel.ForEachAsync 并行处理每个文章链接,调用 PublishArticle 方法发布文章。
    • 最后更新视图并推送任务结束的消息。
  • GetHomeSourceAsync():该方法负责从 cnBeta 网站获取主页的 HTML 内容,并解析出所有符合特定模式的文章链接。

    • 使用 HttpClient 发送 GET 请求获取 HTML。
    • 使用 AngleSharp 库解析 HTML,提取出 href 匹配正则表达式 \/articles\/.+/\d+\.htm 的所有链接,并为以 // 开头的链接补上 https: 前缀。
    • 过滤掉已经存在的文章链接,返回新的文章链接列表。
  • PublishArticle():该方法负责创建文章对象并获取文章内容。

    • 它调用 GetArticleContent() 方法获取文章的详细内容和作者信息,并将其发布到文章服务中。
  • GetArticleContent():该方法获取指定文章的 HTML 内容,并解析出标题、简介、发布时间和正文。

    • 它还会处理文章中的图片,下载并上传到对象存储服务,并替换文章中的图片链接。
    • 使用 HtmlToMarkdown() 方法将 HTML 内容转换为 Markdown 格式。
  • HtmlToMarkdown():将 HTML 内容转换为 Markdown 格式,并返回转换后的 Markdown 和 HTML。

  • DownloadImageToUpyunAsync():该方法负责下载文章中的图片并上传到对象存储服务。

    • 它会尝试多次下载图片,直到成功或达到最大重试次数。
  • DeleteAsync():删除旧的 cnBeta 文章,保持数据库的整洁。

  • UpdateViewTopAsync():调用文章服务获取热门文章列表;将结果写入缓存的代码目前已被注释掉,因此该方法本身并不直接更新缓存。

3. 异常处理

  • 代码中有多个地方使用了异常处理,确保在抓取和处理过程中,如果发生错误,能够记录错误信息并推送相应的进度消息。

4. 日志记录

  • 使用 ILogger 记录任务的开始、结束、异常和其他重要信息,便于后续的调试和监控。

5. 进度推送

  • 使用 PushProgressMessage() 方法在任务执行过程中推送进度消息,提供给用户反馈。

总结

整体来看,CnBetaTaskActivator 类是一个用于自动化抓取 cnBeta 网站新闻文章的任务处理器,涉及到 HTTP 请求、HTML 解析、数据存储和消息推送等多个方面的功能。它通过并行处理提高了效率,并且通过日志和进度消息提供了良好的可监控性。

loading