网站首页 网站源码
using System.Text.RegularExpressions;
using AngleSharp;
using Dpz.Core.EnumLibrary;
using Dpz.Core.Infrastructure;
using Dpz.Core.Public.ViewModel;
using Dpz.Core.Public.ViewModel.V4;
using Dpz.Core.Service.ObjectStorage.Services;
using Dpz.Core.Service.V4.Services;
using Dpz.Core.Web.Jobs.Services;
using Hangfire;
using Markdig;
using Microsoft.Toolkit.Parsers.Rss;
using MongoDB.Bson;
using ReverseMarkdown;
namespace Dpz.Core.Web.Jobs.Hangfire;
public class CnBetaTaskActivator : JobActivator
{
/// <summary>
/// Logger used for the job's diagnostic output.
/// </summary>
private readonly ILogger<CnBetaTaskActivator> _logger;

/// <summary>
/// Article service used to filter, create, delete and query articles.
/// </summary>
private readonly IArticleService _articleService;

/// <summary>
/// Object storage that downloaded article images are uploaded to.
/// </summary>
private readonly IObjectStorageOperation _objectStorageService;

/// <summary>
/// Factory for the named HTTP client used for every outgoing request.
/// </summary>
private readonly IHttpClientFactory _httpClientFactory;

/// <summary>
/// Channel used to push progress messages for this job.
/// </summary>
private readonly IPushMessage _pushMessage;

/// <summary>
/// Creates the job with the services it depends on.
/// </summary>
/// <param name="logger">Logger for diagnostic output.</param>
/// <param name="articleService">Article service.</param>
/// <param name="objectStorageService">Object storage for article images.</param>
/// <param name="httpClientFactory">Factory for HTTP clients.</param>
/// <param name="pushMessage">Progress message channel.</param>
public CnBetaTaskActivator(
    ILogger<CnBetaTaskActivator> logger,
    IArticleService articleService,
    IObjectStorageOperation objectStorageService,
    IHttpClientFactory httpClientFactory,
    IPushMessage pushMessage)
{
    (_logger, _articleService, _objectStorageService, _httpClientFactory, _pushMessage) =
        (logger, articleService, objectStorageService, httpClientFactory, pushMessage);
}
/// <summary>
/// Builds a <see cref="ProgressMessage"/> from the arguments and pushes it
/// through the CnBeta message channel.
/// </summary>
/// <param name="message">Text of the message.</param>
/// <param name="progress">Progress value attached to the message.</param>
/// <param name="type">Message type; defaults to <see cref="MessageType.Success"/>.</param>
private async Task PushProgressMessage(string message, decimal progress, MessageType type = MessageType.Success)
{
    decimal[] values = { progress };
    await _pushMessage.PushCnBetaMessageAsync(new ProgressMessage
    {
        Type = type,
        Message = message,
        ProgressValues = values
    });
}
/// <summary>
/// Job entry point: collects the article URLs from the cnBeta home page,
/// publishes each of them in parallel, then refreshes the top articles and
/// pushes the final "over" message.
/// </summary>
/// <remarks>
/// If collecting or publishing throws, the exception is logged and the method
/// returns early without refreshing the top articles or pushing the final message.
/// </remarks>
[ProlongExpirationTime]
public async Task Start()
{
    _logger.LogInformation("CnBeta任务开始");
    try
    {
        var source = await GetHomeSourceAsync();
        _logger.LogInformation("获取到{Count}条数据", source.Count);

        // Capture each URL's 1-based position once, instead of rescanning the
        // list with IndexOf inside every parallel iteration.
        var total = source.Count;
        var numbered = source.Select((feedUrl, index) => (feedUrl, position: index + 1));
        await Parallel.ForEachAsync(numbered, async (entry, _) =>
        {
            var progress = (decimal)entry.position / total;
            await PublishArticle(entry.feedUrl, progress);
        });
    }
    catch (Exception exception)
    {
        _logger.LogError(exception, "任务异常结束");
        return;
    }

    await UpdateViewTopAsync();
    await PushProgressMessage("CnBeta任务结束", 1m, MessageType.Over);
    _logger.LogInformation("CnBeta任务结束");
}
/// <summary>
/// Downloads the cnBeta home page, extracts every link that looks like an
/// article URL and keeps only those the article service reports as not yet stored.
/// </summary>
/// <returns>
/// The article URLs still to be imported; an empty list when the page cannot
/// be downloaded, returns a non-success status, or is empty.
/// </returns>
private async Task<List<string>> GetHomeSourceAsync()
{
    var httpClient = _httpClientFactory.CreateClient("edge");
    string html;
    try
    {
        // Dispose the request/response once the body has been read.
        using var request = new HttpRequestMessage(HttpMethod.Get, "https://www.cnbeta.com.tw");
        using var response = await httpClient.SendAsync(request);
        if (!response.IsSuccessStatusCode)
        {
            // Same handling as the other fetch methods: do not parse an error page.
            _logger.LogError("get home source fail,response status code:{StatusCode}", response.StatusCode);
            return new List<string>();
        }

        html = await response.Content.ReadAsStringAsync();
    }
    catch (Exception e)
    {
        _logger.LogError(e, "get home source fail");
        return new List<string>();
    }

    if (string.IsNullOrEmpty(html))
    {
        return new List<string>();
    }

    var context = BrowsingContext.New(Configuration.Default);
    var document = await context.OpenAsync(x => x.Content(html));

    var feedUrls = document.GetElementsByTagName("a")
        .Select(x => x.GetAttribute("href"))
        .OfType<string>() // drops anchors without an href
        .Where(href => href.Length > 0 && Regex.IsMatch(href, "\\/articles\\/.+/\\d+\\.htm"))
        // Protocol-relative links ("//host/path") are turned into https URLs.
        .Select(href => href.StartsWith("//", StringComparison.Ordinal) ? "https:" + href : href)
        .ToList();

    // Keep only the URLs the article service says are not stored yet.
    var noExists = await _articleService.NoExistsByFromAsync(feedUrls);
    feedUrls = feedUrls.IntersectBy(noExists, x => x).ToList();
    return feedUrls;
}
/// <summary>
/// Fetches the cnBeta RSS feed and returns the entries whose feed URL points
/// at cnbeta.com or one of its sub-domains.
/// </summary>
/// <returns>
/// The matching RSS entries; an empty list when the request does not succeed
/// or the response body is empty.
/// </returns>
private async Task<List<RssSchema>> RssSource()
{
    var httpClient = _httpClientFactory.CreateClient("edge");
    using var request = new HttpRequestMessage(HttpMethod.Get, "https://www.cnbeta.com/backend.php");
    using var response = await httpClient.SendAsync(request);
    if (!response.IsSuccessStatusCode)
    {
        return new List<RssSchema>();
    }

    var content = await response.Content.ReadAsStringAsync();
    if (string.IsNullOrEmpty(content))
    {
        return new List<RssSchema>();
    }

    // Host names are identifiers, so compare them ordinally, and require a
    // domain boundary so that e.g. "notcnbeta.com" is not accepted.
    static bool IsCnBetaHost(string host)
    {
        const string domain = "cnbeta.com";
        return host.Equals(domain, StringComparison.OrdinalIgnoreCase)
               || host.EndsWith("." + domain, StringComparison.OrdinalIgnoreCase);
    }

    var parser = new RssParser();
    return parser.Parse(content)
        // TryCreate rejects null and malformed feed URLs without throwing.
        .Where(x => Uri.TryCreate(x.FeedUrl, UriKind.Absolute, out var uri) && IsCnBetaHost(uri.Host))
        .ToList();
}
/// <summary>
/// Builds the author record attached to imported cnBeta articles. Everything
/// except the display name is a fixed cnBeta value.
/// </summary>
/// <param name="userName">Display name of the author.</param>
/// <returns>The user info carrying <paramref name="userName"/> as its name.</returns>
private VmUserInfo GetUserInfo(string userName)
{
    var userInfo = new VmUserInfo
    {
        Name = userName,
        Id = "cnBeta",
        Sex = Sex.Man,
        Avatar = "https://cdn.dpangzi.com/images/cnbeta.png",
        Sign = "cnBeta.COM - 简明IT新闻,网友媒体与言论平台"
    };
    return userInfo;
}
/// <summary>
/// Imports a single article: fills a new article model from the page at
/// <paramref name="feedUrl"/> and stores it when an author could be resolved.
/// </summary>
/// <param name="feedUrl">URL of the article page; also stored as the article's origin.</param>
/// <param name="progress">Progress value forwarded with the pushed messages.</param>
/// <returns>A task that completes when the article has been handled.</returns>
private async Task PublishArticle(string feedUrl, decimal progress)
{
    var article = new VmCreateArticleV4
    {
        Tags = new() { "cnBeta" },
        From = feedUrl,
    };

    var author = await GetArticleContent(article, feedUrl, progress);
    if (author == null)
    {
        // No author means the content step gave up on this article; nothing to store.
        return;
    }

    await _articleService.CreateArticleAsync(article, author);
}
/// <summary>
/// Converts an HTML fragment to Markdown and renders that Markdown back to HTML.
/// </summary>
/// <param name="html">The HTML fragment to convert.</param>
/// <returns>The Markdown text together with the HTML rendered from it.</returns>
private (string markdown, string html) HtmlToMarkdown(string html)
{
    var converter = new Converter(new Config
    {
        SmartHrefHandling = true,
        RemoveComments = true,
        GithubFlavored = true,
        UnknownTags = Config.UnknownTagsOption.Bypass
    });

    var markdownText = converter.Convert(html);
    return (markdownText, Markdown.ToHtml(markdownText));
}
/// <summary>
/// Downloads an article page, fills <paramref name="article"/> with its title,
/// introduction, publish time and body, and replaces every body image with a
/// copy uploaded to object storage.
/// </summary>
/// <param name="article">Article model that receives the extracted values.</param>
/// <param name="url">URL of the article page.</param>
/// <param name="progress">Progress value forwarded with the pushed messages.</param>
/// <returns>
/// The author of the article, or <c>null</c> when the page could not be
/// downloaded, is empty, was published more than seven days ago, or has no
/// <c>artibody</c> element.
/// </returns>
private async Task<VmUserInfo?> GetArticleContent(VmCreateArticleV4 article, string url,
    decimal progress)
{
    string html;
    try
    {
        // Download the page source; request and response are disposed once the body is read.
        var httpClient = _httpClientFactory.CreateClient("edge");
        using var request = new HttpRequestMessage(HttpMethod.Get, url);
        using var response = await httpClient.SendAsync(request);
        if (!response.IsSuccessStatusCode)
        {
            _logger.LogError("get article fail,response status code:{StatusCode}", response.StatusCode);
            return null;
        }

        html = await response.Content.ReadAsStringAsync();
    }
    catch (Exception e)
    {
        await PushProgressMessage($"获取正文内容失败,{e.Message}", progress, MessageType.Error);
        _logger.LogError(e, "remote {Url} article content fail", url);
        return null;
    }

    if (string.IsNullOrEmpty(html))
    {
        return null;
    }

    // Parse the page and extract the article metadata.
    var context = BrowsingContext.New(Configuration.Default);
    var htmlDoc = await context.OpenAsync(x => x.Content(html));
    var title = htmlDoc.QuerySelector("header.title h1")?.TextContent;
    article.Title = title;
    await PushProgressMessage($"获取《{title}》正文内容", progress);

    // NOTE(review): "<\/?a.*?>" also matches other tags starting with "a" (e.g. <abbr>) - confirm intended.
    article.Introduction =
        Regex.Replace(htmlDoc.QuerySelector("div.article-summary p")?.InnerHtml ?? "",
            @"(<\/?a.*?>)|(<\/?span.*?>)", "");
    article.PublishTime =
        DateTime.TryParse(htmlDoc.QuerySelector(".meta span")?.TextContent, out var publishDate)
            ? publishDate
            : DateTime.Now;

    // Articles published more than seven days ago are skipped.
    var minDateTime = DateTime.Now.AddDays(-7);
    if (article.PublishTime < minDateTime)
    {
        _logger.LogInformation("发布时间早于{MinDateTime},跳过", minDateTime);
        return null;
    }

    var articleBody = htmlDoc.GetElementById("artibody");
    if (articleBody == null)
    {
        return null;
    }

    // Strip ad, related-article and topic blocks. Each collection is copied
    // first so that removing nodes cannot disturb the enumeration.
    foreach (var className in new[] { "article-global", "article-relation", "article-topic" })
    {
        foreach (var element in articleBody.GetElementsByClassName(className).ToList())
        {
            element.Remove();
        }
    }

    // Snapshot the images and their sources up front. Only the downloads run
    // in parallel; the DOM itself is touched from a single flow of control.
    var images = articleBody.GetElementsByTagName("img").ToList();
    var sources = images.Select(image => image.GetAttribute("src")).ToList();
    var uploadedUrls = new string?[images.Count];
    await Parallel.ForEachAsync(Enumerable.Range(0, images.Count), async (index, _) =>
    {
        await PushProgressMessage($"正在获取该文章第{index + 1}张图片,并上传到数据库。", progress);
        var src = sources[index];
        if (string.IsNullOrEmpty(src))
        {
            await PushProgressMessage($"该文章第{index + 1}张图片无明确资源地址,已剔除!", progress,
                MessageType.Info);
            return;
        }

        var imageUrl = await DownloadImageToUpyunAsync(src, progress);
        if (!string.IsNullOrEmpty(imageUrl))
        {
            await PushProgressMessage($"image url :{imageUrl}", progress, MessageType.Info);
            // Every iteration writes only its own slot, so no locking is needed.
            uploadedUrls[index] = imageUrl;
        }
    });

    // Apply the results: point each image at its uploaded copy, or drop it
    // when it had no source or the upload failed.
    for (var index = 0; index < images.Count; index++)
    {
        var uploadedUrl = uploadedUrls[index];
        if (uploadedUrl != null)
        {
            images[index].SetAttribute("src", uploadedUrl);
        }
        else
        {
            images[index].Remove();
        }
    }

    // Remove cnBeta's iframe and video elements.
    foreach (var tagName in new[] { "iframe", "video" })
    {
        foreach (var element in articleBody.GetElementsByTagName(tagName).ToList())
        {
            element.Remove();
        }
    }

    await PushProgressMessage("图片替换完成,正在发布文章", progress);
    var articleContent = Regex.Replace(articleBody.InnerHtml, @"(<\/?a.*?>)|(<\/?span.*?>)", "");
    var (markdown, htmlContent) = HtmlToMarkdown(articleContent);
    article.Markdown = markdown;
    article.Content = htmlContent;

    // Resolve the author; falls back to "ugmbbc" when the page names none.
    var author = (htmlDoc.QuerySelector(".article-author")?.TextContent ?? "ugmbbc").Replace("责任编辑:", "");
    return GetUserInfo(author);
}
/// <summary>
/// Downloads the image at <paramref name="url"/> and uploads it to object
/// storage, making up to three download attempts.
/// </summary>
/// <param name="url">Image address; absolute http(s) or protocol-relative ("//host/path").</param>
/// <param name="progress">Progress value forwarded with the pushed messages.</param>
/// <returns>
/// The access URL of the uploaded copy, or <c>null</c> when the address is
/// unusable or every attempt failed.
/// </returns>
private async Task<string?> DownloadImageToUpyunAsync(string url, decimal progress)
{
    if (string.IsNullOrEmpty(url))
    {
        _logger.LogError("url is empty");
        return null;
    }

    // Protocol-relative addresses get the same "https:" prefix used for article
    // links. Anything that is still not an absolute http(s) URL cannot be
    // fetched, so reject it here instead of failing on every retry.
    var absoluteUrl = url.StartsWith("//", StringComparison.Ordinal) ? "https:" + url : url;
    if (!Uri.TryCreate(absoluteUrl, UriKind.Absolute, out var uri)
        || (uri.Scheme != Uri.UriSchemeHttp && uri.Scheme != Uri.UriSchemeHttps))
    {
        _logger.LogError("{Url}未能成功构建Uri", url);
        return null;
    }

    var httpClient = _httpClientFactory.CreateClient("edge");
    var tryNumber = 3;
    do
    {
        tryNumber--;
        try
        {
            // A request message cannot be sent twice, so build one per attempt.
            using var request = new HttpRequestMessage(HttpMethod.Get, uri);
            using var response = await httpClient.SendAsync(request);
            if (!response.IsSuccessStatusCode)
            {
                if (tryNumber > 0) continue;
                _logger.LogError("download image fail,response status code:{StatusCode}", response.StatusCode);
                return null;
            }

            await using var stream = await response.Content.ReadAsStreamAsync();
            var extension = Path.GetExtension(uri.AbsolutePath);
            var date = DateTime.Now;
#if DEBUG
            var path = new[]
            {
                "Test", "images", "CnBeta", date.Year.ToString(), date.Month.ToString(), date.Day.ToString()
            };
            var result =
                await _objectStorageService.UploadAsync(stream, path, $"{ObjectId.GenerateNewId()}{extension}");
#else
            var result =
                await _objectStorageService.UploadAsync(stream,
                    new[] { "images", date.Year.ToString(), date.Month.ToString(), date.Day.ToString() },
                    $"{ObjectId.GenerateNewId()}{extension}");
#endif
            return result.AccessUrl;
        }
        catch (Exception e)
        {
            if (tryNumber > 0) continue;
            var message =
                $"downloading images from the {uri} failed,number of attempts remaining :{tryNumber}";
            // This reports a failure, so send it as an error rather than the default success type.
            await PushProgressMessage($"{message},{e.Message}", progress, MessageType.Error);
            _logger.LogError(e, "{Message}", message);
            return null;
        }
    } while (tryNumber > 0);

    return null;
}
/// <summary>
/// Logs the activation request and then lets the base activator create the job instance.
/// </summary>
/// <param name="jobType">Type of the job to activate.</param>
/// <returns>The instance produced by the base implementation.</returns>
public override object ActivateJob(Type jobType)
{
    _logger.LogInformation("invoke activate job");
    var job = base.ActivateJob(jobType);
    return job;
}
/// <summary>
/// Asks the article service to delete old cnBeta articles.
/// </summary>
/// <remarks>
/// NOTE(review): the meaning of the arguments 6 and 50 is defined by
/// <c>DeleteOldCnBetaAsync</c> and is not visible here - confirm against the service.
/// </remarks>
[ProlongExpirationTime]
public async Task DeleteAsync() =>
    await _articleService.DeleteOldCnBetaAsync(6, 50);
/// <summary>
/// Requests the top articles from the article service once the crawl has finished.
/// </summary>
/// <remarks>
/// The result is discarded: the cache write that used to consume it is disabled (see below).
/// </remarks>
private async Task UpdateViewTopAsync()
{
    _ = await _articleService.GetTopArticlesAsync();
    // Disabled: await _cachingProvider.SetAsync(CacheKey.TopArticleKey, articleMinis, TimeSpan.FromHours(12));
}
}
上述代码是一个 C# 类 CnBetaTaskActivator,它是一个 Hangfire 任务激活器,主要用于从 cnBeta 网站抓取新闻文章并将其发布到某个系统中。以下是代码的主要功能和结构的详细解释:
Start():这是任务的入口点,负责启动抓取和发布文章的过程。
- 调用 GetHomeSourceAsync() 方法获取 cnBeta 网站的文章链接。
- 使用 Parallel.ForEachAsync 并行处理每个文章链接,调用 PublishArticle 方法发布文章。
GetHomeSourceAsync():该方法负责从 cnBeta 网站获取主页的 HTML 内容,并解析出所有符合特定模式的文章链接。
- 使用 HttpClient 发送 GET 请求获取 HTML。
- 筛选出匹配 /articles/.+/\d+\.htm 正则表达式的链接。
PublishArticle():该方法负责创建文章对象并获取文章内容。
- 调用 GetArticleContent() 方法获取文章的详细内容和作者信息,并将其发布到文章服务中。
GetArticleContent():该方法获取指定文章的 HTML 内容,并解析出标题、简介、发布时间和正文。
- 调用 HtmlToMarkdown() 方法将 HTML 内容转换为 Markdown 格式。
HtmlToMarkdown():将 HTML 内容转换为 Markdown 格式,并返回转换后的 Markdown 和 HTML。
DownloadImageToUpyunAsync():该方法负责下载文章中的图片并上传到对象存储服务。
DeleteAsync():删除旧的 cnBeta 文章,保持数据库的整洁。
UpdateViewTopAsync():获取热门文章列表;将结果写入缓存的代码目前已被注释,因此该方法暂时不会更新缓存。
- 使用 ILogger 记录任务的开始、结束、异常和其他重要信息,便于后续的调试和监控。
- 通过 PushProgressMessage() 方法在任务执行过程中推送进度消息,提供给用户反馈。
整体来看,CnBetaTaskActivator 类是一个用于自动化抓取 cnBeta 网站新闻文章的任务处理器,涉及到 HTTP 请求、HTML 解析、数据存储和消息推送等多个方面的功能。它通过并行处理提高了效率,并且通过日志和进度消息提供了良好的可监控性。