DotnetSpider5 爬博客园新闻
只要是写爬虫，就必须爬一下博客园。不知道为什么，反正大家都这样，就跟 Hello World 一样吧。
DotnetSpider 是非常优秀的爬虫框架，无论扩展性、易用性还是可读性都很出色。我已经跳进作者的坑 4 次了：DotnetSpider 现在的版本是 5，我是从版本 2 开始用的，最近打算跳入新坑。
版本5的文档 https://github.com/dotnetcore/DotnetSpider/wiki
爬博客园其实作者已经提供了 Sample，不过比较简单。
我这边为了跳新坑，重新改了一下，并对接了 MySQL。
/// <summary>
/// Spider that crawls news.cnblogs.com, extracts the title and body of each
/// news article page and stores them in a MySQL table named <c>article</c>.
/// </summary>
public class CnblogsSpider : Spider
{
    /// <summary>
    /// Builds the spider host with Serilog logging and a breadth-first,
    /// hash-set de-duplicating scheduler, then runs it to completion.
    /// </summary>
    public static async Task RunAsync()
    {
        var builder = Builder.CreateDefaultBuilder<CnblogsSpider>();
        builder.UseSerilog();
        builder.UseQueueDistinctBfsScheduler<HashSetDuplicateRemover>();
        await builder.Build().RunAsync();
    }

    /// <summary>
    /// Creates the spider; all arguments are forwarded unchanged to the base <c>Spider</c>.
    /// </summary>
    public CnblogsSpider(IOptions<SpiderOptions> options,
        SpiderServices services,
        ILogger<Spider> logger) : base(
        options, services, logger)
    {
    }

    /// <summary>
    /// Seeds the crawl with one article page and the first list page, then
    /// registers the parser followed by the MySQL storage as data flows.
    /// </summary>
    protected override async Task InitializeAsync(CancellationToken stoppingToken)
    {
        await AddRequestsAsync(new Request("https://news.cnblogs.com/n/666228/"));
        await AddRequestsAsync(new Request("https://news.cnblogs.com/n/page/1/"));
        AddDataFlow(new ListNewsParser());
        AddDataFlow(new MysqlNewStorage());
    }

    /// <summary>
    /// Returns a fresh GUID as the spider id and "cnblogs" as its name.
    /// </summary>
    protected override (string Id, string Name) GetIdAndName()
    {
        return (Guid.NewGuid().ToString(), "cnblogs");
    }

    /// <summary>
    /// Storage data flow that persists parsed <see cref="Article"/> items to MySQL.
    /// </summary>
    protected class MysqlNewStorage : StorageBase
    {
        /// <summary>
        /// Opens a connection using the "Default" connection string from the application configuration.
        /// </summary>
        private static MySqlConnection CreateConnection()
        {
            return new MySqlConnection(AppConfig.Configuration.GetConnectionString("Default"));
        }

        /// <summary>
        /// Creates the <c>article</c> table when it does not exist yet.
        /// </summary>
        public override async Task InitAsync()
        {
            await using var conn = CreateConnection();
            // NOTE(review): sContent is limited to 2000 characters; longer article bodies
            // may be rejected or truncated depending on the server's sql_mode - confirm.
            await conn.ExecuteAsync(@"
create table if not exists article
(
id int auto_increment
primary key,
title varchar(500) not null,
sContent varchar(2000) null
);
");
        }

        /// <summary>
        /// Inserts the <see cref="Article"/> carried by <paramref name="context"/> unless a row
        /// with the same title already exists. Contexts without an article are ignored.
        /// </summary>
        /// <param name="context">Data context produced by the parser for one response.</param>
        protected override async Task StoreAsync(DataContext context)
        {
            var typeName = typeof(Article).FullName;
            object stored = context.GetData(typeName);
            if (!(stored is Article article))
            {
                return;
            }

            await using var conn = CreateConnection();

            // The title comes from a scraped page, so it must be passed as a parameter
            // rather than spliced into the SQL text.
            var existing = await conn.ExecuteScalarAsync<int>(
                "SELECT count(id) FROM article WHERE title = @Title",
                new { article.Title });
            if (existing > 0)
            {
                return;
            }

            await conn.ExecuteAsync(
                "INSERT IGNORE INTO article (title, sContent) VALUES (@Title,@SContent);",
                article);
        }
    }

    /// <summary>
    /// Parser data flow that follows links on news.cnblogs.com pages and extracts
    /// an <see cref="Article"/> from pages containing a <c>news_main</c> element.
    /// </summary>
    protected class ListNewsParser : DataParser
    {
        /// <summary>
        /// Restricts the parser to news.cnblogs.com requests and follows every link
        /// found under the document root.
        /// </summary>
        public ListNewsParser()
        {
            // Dots are escaped so that only the literal host name matches.
            AddRequiredValidator(request =>
                Regex.IsMatch(request.RequestUri.ToString(), @"news\.cnblogs\.com"));
            AddFollowRequestQuerier(Selectors.XPath("."));
        }

        /// <summary>
        /// Extracts the article title and content from the page and adds them to
        /// <paramref name="context"/> under the full type name of <see cref="Article"/>.
        /// Pages without a <c>news_main</c> element or without a title add nothing.
        /// </summary>
        /// <param name="context">Data context holding the selectable response.</param>
        protected override Task Parse(DataContext context)
        {
            var newsMain = context.Selectable.Select(Selectors.XPath(".//div[@id='news_main']"));
            if (newsMain == null)
            {
                return Task.CompletedTask;
            }

            // Either node may be missing, so trim null-safely instead of dereferencing.
            var title = newsMain.Select(Selectors.XPath(".//div[@id='news_title']"))?.Value?.Trim();
            var content = newsMain.Select(Selectors.XPath(".//div[@id='news_content']"))?.Value?.Trim();

            // The title column is declared NOT NULL, so an article without a title cannot be stored.
            if (string.IsNullOrWhiteSpace(title))
            {
                return Task.CompletedTask;
            }

            context.AddData(typeof(Article).FullName,
                new Article
                {
                    Title = title,
                    SContent = content,
                });
            return Task.CompletedTask;
        }
    }

    /// <summary>
    /// A news article as stored in the <c>article</c> table.
    /// </summary>
    public class Article
    {
        /// <summary>Article title (stored in column <c>title</c>).</summary>
        public string Title { get; set; }

        /// <summary>Article body text (stored in column <c>sContent</c>).</summary>
        public string SContent { get; set; }
    }
}


源码(https://files.cnblogs.com/files/leoxjy/ConsoleDotnetSpider5Sample.zip)

浙公网安备 33010602011771号