src/Dpz.Core.Web.Jobs/Hangfire/ItHomeActivator.cs 网站源码

using System.Text.RegularExpressions;
using AngleSharp;
using Dpz.Core.EnumLibrary;
using Dpz.Core.Infrastructure;
using Dpz.Core.Public.ViewModel;
using Dpz.Core.Public.ViewModel.V4;
using Dpz.Core.Service.Network;
using Dpz.Core.Service.ObjectStorage.Services;
using Dpz.Core.Service.V4.Services;
using Dpz.Core.Web.Jobs.Services;
using JetBrains.Annotations;
using MediatR;
using Microsoft.Toolkit.Parsers.Rss;
using IConfiguration = Microsoft.Extensions.Configuration.IConfiguration;

namespace Dpz.Core.Web.Jobs.Hangfire;

/// <summary>
/// Crawl task for ithome.com: reads the site's RSS feed, downloads each article page that
/// is not yet known to <c>ArticleService</c>, cleans the article body, re-hosts its images
/// and fills in the article view model.
/// </summary>
[UsedImplicitly]
public sealed class ItHomeActivator(
    IArticleService articleService,
    IObjectStorageOperation objectStorageService,
    IHttpClientFactory httpClientFactory,
    IPushMessage pushMessage,
    ILoggerFactory logger,
    IMediator mediator,
    AnalyzeService analyzeService,
    IConfiguration configuration
)
    : TaskService(
        articleService,
        objectStorageService,
        httpClientFactory,
        pushMessage,
        logger,
        mediator,
        ["ItHome"],
        analyzeService,
        configuration
    )
{
    /// <summary>Name of the named <see cref="HttpClient"/> used for every request.</summary>
    private const string EdgeClientName = "edge";

    /// <summary>Address of the RSS feed that lists the latest articles.</summary>
    private const string ItHomeRssUrl = "https://www.ithome.com/rss/";

    /// <summary>Host name used to recognise links that point back to the site itself.</summary>
    private const string ItHomeHost = "www.ithome.com";

    /// <summary>
    /// Matches opening and closing <c>a</c> and <c>span</c> tags only. The look-ahead after
    /// the tag name keeps other tags that merely start with "a" (abbr, article, audio, ...)
    /// from being stripped.
    /// </summary>
    private static readonly Regex AnchorAndSpanTags = new(
        @"<\/?(?:a|span)(?=[\s\/>])[^>]*>",
        RegexOptions.Compiled
    );

    /// <summary>
    /// Downloads the RSS feed and returns the article URLs that still have to be processed.
    /// </summary>
    /// <returns>
    /// Feed URLs whose title does not contain "手游" and that
    /// <c>ArticleService.NoExistsByFromAsync</c> reports as not existing yet; an empty
    /// collection when the feed could not be downloaded.
    /// </returns>
    protected override async Task<IReadOnlyCollection<string>> GetTaskUrlsAsync()
    {
        var content = await ApplicationTools.RetryAsync(
            async () =>
            {
                var httpClient = HttpClientFactory.CreateClient(EdgeClientName);
                using var request = new HttpRequestMessage(HttpMethod.Get, ItHomeRssUrl);
                using var response = await httpClient.SendAsync(request);
                if (!response.IsSuccessStatusCode)
                {
                    return "";
                }

                return await response.Content.ReadAsStringAsync();
            },
            TimeSpan.FromSeconds(1)
        );
        if (string.IsNullOrEmpty(content))
        {
            return [];
        }

        var feedUrls = new RssParser()
            .Parse(content)
            // Mobile-game posts are deliberately excluded. A missing title is not a match.
            .Where(x => x.Title?.Contains("手游") != true)
            .Select(x => x.FeedUrl)
            // Items without a link cannot be fetched later, so drop them here.
            .Where(x => !string.IsNullOrEmpty(x))
            .ToList();

        var noExists = await ArticleService.NoExistsByFromAsync(feedUrls);
        return feedUrls.IntersectBy(noExists, x => x).ToList();
    }

    /// <summary>
    /// Builds the fixed author profile used for every article from this source.
    /// </summary>
    /// <param name="userName">Display name to put on the profile.</param>
    /// <returns>A profile with fixed id, avatar, sex and signature and the given name.</returns>
    protected override VmUserInfo GetUserInfo(string userName)
    {
        return new()
        {
            Id = "itHome",
            Avatar = "https://cdn.dpangzi.com/images/ItHome.svg",
            Sex = Sex.Man,
            // NOTE(review): the "cnBeta.COM - " prefix looks like a copy-paste leftover from
            // another source; kept unchanged because it is runtime data - confirm and fix.
            Sign = "cnBeta.COM - IT之家，青岛软媒旗下的前沿科技门户网站。",
            Name = userName,
        };
    }

    /// <summary>
    /// Downloads one article page, extracts and cleans its content and writes the result
    /// into <paramref name="article"/>.
    /// </summary>
    /// <param name="article">View model that receives title, introduction, publish time, Markdown and HTML.</param>
    /// <param name="url">Address of the article page.</param>
    /// <returns>
    /// The author profile, or <c>null</c> when the article is skipped: download failed, the
    /// page looks like an advertisement, the body element is missing, or the article is
    /// more than seven days old.
    /// </returns>
    protected override async Task<VmUserInfo?> GetArticleContentAsync(
        VmCreateArticleV4 article,
        string url
    )
    {
        var html = await ApplicationTools.RetryAsync(
            async () =>
            {
                var httpClient = HttpClientFactory.CreateClient(EdgeClientName);
                using var request = new HttpRequestMessage(HttpMethod.Get, url);
                using var response = await httpClient.SendAsync(request);
                if (!response.IsSuccessStatusCode)
                {
                    Logger.LogError(
                        "get article fail,response status code:{StatusCode}",
                        response.StatusCode
                    );
                    return null;
                }

                return await response.Content.ReadAsStringAsync();
            },
            TimeSpan.FromSeconds(1)
        );
        if (string.IsNullOrEmpty(html))
        {
            return null;
        }

        // Parse the page so the body, metadata and images can be located with selectors.
        var context = BrowsingContext.New(Configuration.Default);
        var htmlDoc = await context.OpenAsync(x => x.Content(html));

        #region Filter rules

        VmUserInfo? AdContent(string reason)
        {
            Logger.LogInformation("文章：{Url}{Reason}，跳过", url, reason);
            return null;
        }

        var ad1 = htmlDoc.QuerySelector("#paragraph > div.tagging1");
        var ad2 = htmlDoc.QuerySelector("#paragraph > dir");
        if (ad1 != null || ad2 != null)
        {
            return AdContent("疑似广告");
        }

        #endregion

        // Article body
        var articleBody = htmlDoc.QuerySelector("#paragraph");
        if (articleBody == null)
        {
            return null;
        }

        RemoveUnrelatedContent(articleBody);

        // Title
        var title = htmlDoc.QuerySelector("#dt > div.fl.content > h1")?.TextContent;
        article.Title = title;
        await PushProgressMessage($"获取《{title}》正文内容");

        // The introduction is the first paragraph that survived the clean-up above.
        article.Introduction = htmlDoc.QuerySelector("#paragraph > p")?.TextContent;

        // Publish time; falls back to the current time when the page value cannot be parsed.
        article.PublishTime = DateTime.TryParse(
            htmlDoc.QuerySelector("#pubtime_baidu")?.TextContent,
            out var publishDate
        )
            ? publishDate
            : DateTime.Now;

        var minDateTime = DateTime.Now.AddDays(-7);
        if (article.PublishTime < minDateTime)
        {
            // The article is older than the cut-off, so the message says "earlier than".
            Logger.LogInformation("发布时间早于{MinDateTime},跳过", minDateTime);
            return null;
        }

        await ReplaceImagesAsync(articleBody);
        await PushProgressMessage("图片替换完成，正在发布文章");

        var articleContent = AnchorAndSpanTags.Replace(articleBody.InnerHtml, "");
        var (markdown, htmlContent) = HtmlToMarkdown(articleContent);
        article.Markdown = markdown;
        article.Content = htmlContent;

        // Author information; the site name is used when the page names no author.
        var author = htmlDoc.QuerySelector("#author_baidu > strong")?.TextContent ?? "ItHome";
        return GetUserInfo(author);
    }

    /// <summary>
    /// Removes content that does not belong to the article text: the first direct
    /// <c>div</c> child, all <c>iframe</c> and <c>video</c> elements, self-referencing
    /// "IT之家" links and every paragraph that carries a <c>class</c> attribute.
    /// </summary>
    private static void RemoveUnrelatedContent(AngleSharp.Dom.IElement articleBody)
    {
        articleBody.QuerySelector("#paragraph > div")?.Remove();

        // Each collection is copied to an array first so that removing nodes cannot
        // disturb the enumeration.
        foreach (var frame in articleBody.GetElementsByTagName("iframe").ToArray())
        {
            frame.Remove();
        }

        foreach (var video in articleBody.GetElementsByTagName("video").ToArray())
        {
            video.Remove();
        }

        foreach (var anchor in articleBody.GetElementsByTagName("a").ToArray())
        {
            if (IsItHomeSelfLink(anchor))
            {
                anchor.Remove();
            }
        }

        foreach (var paragraph in articleBody.GetElementsByTagName("p").ToArray())
        {
            if (!string.IsNullOrEmpty(paragraph.GetAttribute("class")))
            {
                paragraph.Remove();
            }
        }
    }

    /// <summary>
    /// Tells whether an anchor is the site's own "IT之家" link: its <c>href</c> is an
    /// absolute URI on <see cref="ItHomeHost"/> and its trimmed text is exactly "IT之家".
    /// </summary>
    private static bool IsItHomeSelfLink(AngleSharp.Dom.IElement anchor)
    {
        var href = anchor.GetAttribute("href");

        // The anchor's own href is inspected (not the page URL), and only absolute URIs
        // are accepted because Uri.Host throws for relative ones.
        return !string.IsNullOrEmpty(href)
            && Uri.TryCreate(href, UriKind.Absolute, out var uri)
            && uri.Host.Equals(ItHomeHost, StringComparison.OrdinalIgnoreCase)
            && anchor.TextContent.Trim() == "IT之家";
    }

    /// <summary>
    /// Uploads every image of the article and points its <c>src</c> at the uploaded copy.
    /// Images without a <c>data-original</c> attribute, or whose upload yields no URL, are
    /// removed from the body.
    /// </summary>
    private async Task ReplaceImagesAsync(AngleSharp.Dom.IElement articleBody)
    {
        // Snapshot the images with their 1-based position and source before any work
        // starts, so numbering stays stable and the parallel phase never reads the DOM.
        var jobs = articleBody
            .GetElementsByTagName("img")
            .Select(
                (image, index) =>
                    (
                        Image: image,
                        Number: index + 1,
                        Source: image.GetAttribute("data-original")
                    )
            )
            .ToArray();
        var uploadedUrls = new string?[jobs.Length];

        // Only the uploads run concurrently; each iteration writes its own array slot.
        await Parallel.ForEachAsync(
            Enumerable.Range(0, jobs.Length),
            async (i, _) =>
            {
                await PushProgressMessage(
                    $"正在获取该文章第{jobs[i].Number}张图片，并上传到数据库。"
                );

                var source = jobs[i].Source;
                if (!string.IsNullOrEmpty(source))
                {
                    uploadedUrls[i] = await DownloadImageToUpyunAsync(source);
                }
            }
        );

        // DOM changes are applied one at a time after all uploads have finished.
        for (var i = 0; i < jobs.Length; i++)
        {
            var (image, number, source) = jobs[i];
            if (string.IsNullOrEmpty(source))
            {
                image.Remove();
                await PushProgressMessage(
                    $"该文章第{number}张图片无明确资源地址，已剔除！",
                    MessageType.Info
                );
                continue;
            }

            var imageUrl = uploadedUrls[i];
            if (string.IsNullOrEmpty(imageUrl))
            {
                image.Remove();
                continue;
            }

            await PushProgressMessage($"image url :{imageUrl}", MessageType.Info);
            image.SetAttribute("src", imageUrl);
        }
    }
}

这段代码是一个 C# 类 ItHomeActivator 的实现，主要用于从 IT 之家网站抓取 RSS 源中的文章，并处理这些文章的内容。它使用了多个库和服务，包括 AngleSharp 用于 HTML 解析，MediatR 用于消息传递，HttpClient 用于网络请求等。以下是代码的主要功能和结构的详细解释：

主要功能

获取 RSS 源:
- GetTaskUrlsAsync 方法从 IT 之家网站的 RSS 源 (https://www.ithome.com/rss/) 获取内容。
- 使用 HttpClient 发送 GET 请求，并解析返回的 RSS 内容。
- 过滤掉标题中包含“手游”的文章，并检查哪些文章尚未存在于数据库中。
- 返回符合条件的文章 URL 列表。
获取用户信息:
- GetUserInfo 方法根据提供的用户名返回一个 VmUserInfo 对象，包含用户的 ID、头像、性别、签名和名称。
获取文章内容:
- GetArticleContentAsync 方法根据文章的 URL 获取文章的 HTML 内容。
- 解析 HTML，提取文章的标题、简介、发布时间和正文内容。
- 若页面中出现疑似广告的标记（#paragraph 下的 div.tagging1 或 dir 元素），则直接跳过整篇文章；否则从正文中移除不相关的内容，例如 iframe、video、指向 IT 之家自身的“IT之家”链接，以及带有 class 属性的段落。
- 处理文章中的图片，下载并上传到对象存储服务（如 Upyun），并替换 HTML 中的图片链接。
- 将文章内容转换为 Markdown 格式，并返回文章的作者信息。

代码结构

构造函数:
- 接受多个服务和配置参数，初始化基类 TaskService。
方法:
- GetTaskUrlsAsync: 获取 RSS 源并返回文章 URL 列表。
- GetUserInfo: 返回用户信息。
- GetArticleContentAsync: 获取文章内容并处理。
HTML 解析:
- 使用 AngleSharp 库解析 HTML 文档，提取所需的信息。
- 通过 CSS 选择器查找特定的 HTML 元素。
并行处理:
- 使用 Parallel.ForEachAsync 并行处理文章中的图片，下载并上传到对象存储。
内容清理:
- 使用正则表达式清理正文 HTML，去除 a 和 span 标签本身（保留标签内的文字）。

总结

整体而言，这段代码实现了一个自动化的文章抓取和处理服务：从 IT 之家的 RSS 源获取最新文章，提取并清理内容，然后把结果填充到文章模型中；后续的保存与发布应由基类 TaskService 完成（其实现不在本文件中）。它结合了网络请求、HTML 解析、并行处理和对象存储等多个技术，展示了现代 C# 开发中的一些常见模式和实践。