网站首页 网站源码
using System.Text.RegularExpressions;
using AngleSharp;
using Dpz.Core.EnumLibrary;
using Dpz.Core.Infrastructure;
using Dpz.Core.Public.ViewModel;
using Dpz.Core.Public.ViewModel.V4;
using Dpz.Core.Service.Network;
using Dpz.Core.Service.ObjectStorage.Services;
using Dpz.Core.Service.V4.Services;
using Dpz.Core.Web.Jobs.Services;
using JetBrains.Annotations;
using MediatR;
using Microsoft.Toolkit.Parsers.Rss;
using IConfiguration = Microsoft.Extensions.Configuration.IConfiguration;
namespace Dpz.Core.Web.Jobs.Hangfire;
/// <summary>
/// Scheduled crawler for IT Home (ithome.com): reads the site's RSS feed, picks the
/// articles that have not been imported yet, and converts each article page into a
/// <see cref="VmCreateArticleV4"/> (title, introduction, publish time, markdown/html body).
/// </summary>
/// <remarks>
/// All constructor dependencies are forwarded unchanged to <see cref="TaskService"/>;
/// the only value supplied here is the tag list <c>["ItHome"]</c>.
/// Members such as <c>HttpClientFactory</c>, <c>ArticleService</c>, <c>Logger</c>,
/// <c>PushProgressMessage</c>, <c>DownloadImageToUpyunAsync</c> and <c>HtmlToMarkdown</c>
/// are not declared in this file — presumably inherited from <see cref="TaskService"/>.
/// </remarks>
[UsedImplicitly]
public sealed class ItHomeActivator(
    IArticleService articleService,
    IObjectStorageOperation objectStorageService,
    IHttpClientFactory httpClientFactory,
    IPushMessage pushMessage,
    ILoggerFactory logger,
    IMediator mediator,
    AnalyzeService analyzeService,
    IConfiguration configuration
)
    : TaskService(
        articleService,
        objectStorageService,
        httpClientFactory,
        pushMessage,
        logger,
        mediator,
        ["ItHome"],
        analyzeService,
        configuration
    )
{
    /// <summary>
    /// Downloads the IT Home RSS feed and returns the article URLs that still need importing.
    /// </summary>
    /// <returns>
    /// The feed URLs (excluding mobile-game posts) that
    /// <c>ArticleService.NoExistsByFromAsync</c> reports as not existing yet;
    /// an empty collection when the feed could not be downloaded.
    /// </returns>
    protected override async Task<IReadOnlyCollection<string>> GetTaskUrlsAsync()
    {
        var content = await ApplicationTools.RetryAsync(
            async () =>
            {
                var httpClient = HttpClientFactory.CreateClient("edge");
                // Request and response own unmanaged resources; dispose them once the body is read.
                using var request = new HttpRequestMessage(
                    HttpMethod.Get,
                    "https://www.ithome.com/rss/"
                );
                using var response = await httpClient.SendAsync(request);
                if (!response.IsSuccessStatusCode)
                {
                    return "";
                }

                return await response.Content.ReadAsStringAsync();
            },
            TimeSpan.FromSeconds(1)
        );
        if (string.IsNullOrEmpty(content))
        {
            return [];
        }

        var parser = new RssParser();
        var feedUrls = parser
            .Parse(content)
            // Mobile-game posts are unwanted, so they are filtered out by title.
            .Where(x => !x.Title.Contains("手游"))
            .Select(x => x.FeedUrl)
            .ToList();

        // Keep only the URLs the article service reports as missing, in feed order.
        var noExists = await ArticleService.NoExistsByFromAsync(feedUrls);
        feedUrls = feedUrls.IntersectBy(noExists, x => x).ToList();
        return feedUrls;
    }

    /// <summary>
    /// Builds the fixed IT Home author profile, using <paramref name="userName"/> as display name.
    /// </summary>
    /// <param name="userName">Author name shown on the imported article.</param>
    /// <returns>A new <see cref="VmUserInfo"/> with the constant IT Home id, avatar and signature.</returns>
    protected override VmUserInfo GetUserInfo(string userName)
    {
        return new()
        {
            Id = "itHome",
            Avatar = "https://cdn.dpangzi.com/images/ItHome.svg",
            Sex = Sex.Man,
            // NOTE(review): the "cnBeta.COM - " prefix looks copied from another crawler;
            // left unchanged because the value may already be persisted — confirm and correct.
            Sign = "cnBeta.COM - IT之家,青岛软媒旗下的前沿科技门户网站。",
            Name = userName,
        };
    }

    /// <summary>
    /// Downloads one article page, strips unrelated markup, re-hosts its images and fills
    /// <paramref name="article"/> with title, introduction, publish time, markdown and html.
    /// </summary>
    /// <param name="article">Article model that is populated in place.</param>
    /// <param name="url">Absolute URL of the article page.</param>
    /// <returns>
    /// The author profile for the article, or <c>null</c> when the article must be skipped:
    /// download failed, the page looks like an advertisement, the body element is missing,
    /// or the article was published more than seven days ago.
    /// </returns>
    protected override async Task<VmUserInfo?> GetArticleContentAsync(
        VmCreateArticleV4 article,
        string url
    )
    {
        var html = await ApplicationTools.RetryAsync(
            async () =>
            {
                var httpClient = HttpClientFactory.CreateClient("edge");
                using var request = new HttpRequestMessage(HttpMethod.Get, url);
                using var response = await httpClient.SendAsync(request);
                if (!response.IsSuccessStatusCode)
                {
                    Logger.LogError(
                        "get article fail,response status code:{StatusCode}",
                        response.StatusCode
                    );
                    return null;
                }

                return await response.Content.ReadAsStringAsync();
            },
            TimeSpan.FromSeconds(1)
        );
        if (string.IsNullOrEmpty(html))
        {
            return null;
        }

        // Parse the page so the body can be cleaned and its images replaced.
        var context = BrowsingContext.New(Configuration.Default);
        var htmlDoc = await context.OpenAsync(x => x.Content(html));

        #region Filter rules

        // Logs why the article is skipped and yields the "skip" result.
        VmUserInfo? AdContent(string reason)
        {
            Logger.LogInformation("文章:{Url}{Reason},跳过", url, reason);
            return null;
        }

        var ad1 = htmlDoc.QuerySelector("#paragraph > div.tagging1");
        var ad2 = htmlDoc.QuerySelector("#paragraph > dir");
        if (ad1 != null || ad2 != null)
        {
            return AdContent("疑似广告");
        }

        #endregion

        // Article body.
        var articleBody = htmlDoc.QuerySelector("#paragraph");
        if (articleBody == null)
        {
            return null;
        }

        #region Remove unrelated content from the body

        var remove1 = articleBody.QuerySelector("#paragraph > div");
        remove1?.Remove();

        // GetElementsByTagName returns a live collection (DOM standard), so every pass
        // below works on a ToList() snapshot; removing nodes while enumerating the live
        // collection can skip elements.
        foreach (var frame in articleBody.GetElementsByTagName("iframe").ToList())
        {
            frame.Remove();
        }

        foreach (var video in articleBody.GetElementsByTagName("video").ToList())
        {
            video.Remove();
        }

        foreach (var anchor in articleBody.GetElementsByTagName("a").ToList())
        {
            var href = anchor.GetAttribute("href");
            // Drop self-promotional "IT之家" links. The link target (href) is what has to be
            // inspected, not the article url; UriKind.Absolute is required because Uri.Host
            // throws InvalidOperationException for relative URIs.
            if (
                !string.IsNullOrEmpty(href)
                && Uri.TryCreate(href, UriKind.Absolute, out var uri)
                && uri.Host.Equals("www.ithome.com", StringComparison.OrdinalIgnoreCase)
                && anchor.TextContent.Trim() == "IT之家"
            )
            {
                anchor.Remove();
            }
        }

        // Paragraphs carrying a class attribute are treated as non-article content.
        foreach (var paragraph in articleBody.GetElementsByTagName("p").ToList())
        {
            var other = paragraph.GetAttribute("class");
            if (!string.IsNullOrEmpty(other))
            {
                paragraph.Remove();
            }
        }

        #endregion

        // Title.
        var title = htmlDoc.QuerySelector("#dt > div.fl.content > h1")?.TextContent;
        article.Title = title;
        await PushProgressMessage($"获取《{title}》正文内容");

        // Introduction: the first remaining paragraph of the body.
        article.Introduction = htmlDoc.QuerySelector("#paragraph > p")?.TextContent;

        // Publish time; falls back to the current time when the page value cannot be parsed.
        article.PublishTime = DateTime.TryParse(
            htmlDoc.QuerySelector("#pubtime_baidu")?.TextContent,
            out var publishDate
        )
            ? publishDate
            : DateTime.Now;

        var minDateTime = DateTime.Now.AddDays(-7);
        if (article.PublishTime < minDateTime)
        {
            // The branch fires for articles OLDER than the cut-off, so the message says "earlier than".
            Logger.LogInformation("发布时间早于{MinDateTime},跳过", minDateTime);
            return null;
        }

        // Images: upload in parallel, then touch the DOM sequentially. The snapshot gives
        // every image a stable 1-based number even after other images are removed.
        var images = articleBody.GetElementsByTagName("img").ToList();
        var sources = images.Select(x => x.Attributes["data-original"]?.Value).ToList();
        var uploadedUrls = new string?[images.Count];
        await Parallel.ForEachAsync(
            Enumerable.Range(0, images.Count),
            async (index, _) =>
            {
                await PushProgressMessage(
                    $"正在获取该文章第{index + 1}张图片,并上传到数据库。"
                );
                var src = sources[index];
                if (!string.IsNullOrEmpty(src))
                {
                    // Each iteration writes only its own slot, so no locking is needed.
                    uploadedUrls[index] = await DownloadImageToUpyunAsync(src);
                }
            }
        );

        for (var index = 0; index < images.Count; index++)
        {
            var image = images[index];
            if (string.IsNullOrEmpty(sources[index]))
            {
                image.Remove();
                await PushProgressMessage(
                    $"该文章第{index + 1}张图片无明确资源地址,已剔除!",
                    MessageType.Info
                );
                continue;
            }

            var imageUrl = uploadedUrls[index];
            if (string.IsNullOrEmpty(imageUrl))
            {
                // Upload failed: drop the image rather than keep a link to the source site.
                image.Remove();
                continue;
            }

            await PushProgressMessage($"image url :{imageUrl}", MessageType.Info);
            image.SetAttribute("src", imageUrl);
        }

        await PushProgressMessage("图片替换完成,正在发布文章");

        // Strip <a> and <span> tags but keep their inner text. The \b keeps other tags
        // whose names merely start with "a" (e.g. <abbr>, <article>) intact.
        var articleContent = Regex.Replace(
            articleBody.InnerHtml,
            @"<\/?(?:a|span)\b[^>]*>",
            ""
        );
        var (markdown, htmlContent) = HtmlToMarkdown(articleContent);
        article.Markdown = markdown;
        article.Content = htmlContent;

        // Author shown on the page; falls back to "ItHome" when the element is missing.
        var author = htmlDoc.QuerySelector("#author_baidu > strong")?.TextContent ?? "ItHome";
        var userInfo = GetUserInfo(author);
        return userInfo;
    }
}
这段代码是一个 C# 类 ItHomeActivator 的实现,主要用于从 IT 之家网站抓取 RSS 源中的文章,并处理这些文章的内容。它使用了多个库和服务,包括 AngleSharp(用于 HTML 解析)、MediatR(用于消息传递)、HttpClient(用于网络请求)等。以下是代码的主要功能和结构的详细解释:
获取 RSS 源:
- GetTaskUrlsAsync 方法从 IT 之家网站的 RSS 源 (https://www.ithome.com/rss/) 获取内容。
- 使用 HttpClient 发送 GET 请求,并解析返回的 RSS 内容。
获取用户信息:
- GetUserInfo 方法根据提供的用户名返回一个 VmUserInfo 对象,包含用户的 ID、头像、性别、签名和名称。
获取文章内容:
- GetArticleContentAsync 方法根据文章的 URL 获取文章的 HTML 内容。
构造函数:
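- 通过主构造函数接收所需的各项服务,并将它们传递给基类 TaskService。
方法:
- GetTaskUrlsAsync: 获取 RSS 源并返回文章 URL 列表。
- GetUserInfo: 返回用户信息。
- GetArticleContentAsync: 获取文章内容并处理。
HTML 解析:
- 使用 AngleSharp 库解析 HTML 文档,提取所需的信息。
并行处理:
- 使用 Parallel.ForEachAsync 并行处理文章中的图片,下载并上传到对象存储。
内容清理:
- 移除 iframe、video 等不相关元素以及带 class 的段落,并使用正则表达式去除 a 和 span 标签。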
整体而言,这段代码实现了一个自动化的文章抓取和处理服务,能够从 IT 之家网站获取最新的技术文章,提取和清理内容,并将其存储到数据库中。它结合了网络请求、HTML 解析、并行处理和数据存储等多个技术,展示了现代 C# 开发中的一些常见模式和实践。