HtmlAgilityPack Sample
从 HTML Table 中提取内部数据,并找出其中的重复值。
// Sample: read the rows of an HTML table with HtmlAgilityPack and report
// every content value that occurs more than once, together with the id of
// the first row that carries it.
HtmlAgilityPack.HtmlWeb hw = new HtmlAgilityPack.HtmlWeb();
// Load a local file (it was previously captured with a System.Net.Http.HttpClient POST).
HtmlAgilityPack.HtmlDocument doc = hw.Load(dir + "2019-12-03.html");
// Start from the document root.
HtmlAgilityPack.HtmlNode rootNode = doc.DocumentNode;
// Select the <tbody> directly under the element whose id is "DDetail2".
string xpath = "//*[@id=\"DDetail2\"]/tbody";
HtmlAgilityPack.HtmlNode node = rootNode.SelectSingleNode(xpath);
if (node == null)
{
    // SelectSingleNode returns null when nothing matches. The parser does not
    // add a <tbody> on its own, so the XPath only matches when the saved
    // markup literally contains one.
    Console.Error.WriteLine($"No node matched XPath: {xpath}");
}
else
{
    // Remove every text node below the tbody so that ChildNodes no longer
    // contains the whitespace text nodes that sit between the <tr> elements.
    // ToArray() snapshots the sequence because nodes are removed while iterating.
    foreach (var textNode in node.Descendants("#Text").ToArray())
    {
        textNode.Remove();
    }

    if (node.ChildNodes.Count > 1)
    {
        List<dailyDetail> li = new List<dailyDetail>();
        // Count - 1: the last row is the empty "new entry" row and is skipped.
        for (int i = 0; i < node.ChildNodes.Count - 1; i++)
        {
            // Child lookups use XPath relative to the current row.
            var row = node.ChildNodes[i];
            var id = row.SelectSingleNode("td[1]/input[2]");
            var text = row.SelectSingleNode("td[2]/input");
            if (id == null || text == null)
            {
                // Row does not have the expected inputs (e.g. a comment node
                // or a differently shaped row); skip it instead of crashing.
                continue;
            }

            li.Add(new dailyDetail()
            {
                // Attributes["value"] is null when the attribute is absent.
                dailyDetailId = id.Attributes["value"]?.Value,
                dailyContent = text.Attributes["value"]?.Value
            });
        }

        // Find the duplicated content values. Elements inside each group keep
        // their source order, so First() is the earliest row with that content.
        var query = li
            .Where(dd => dd.dailyContent != null)
            .GroupBy(dd => dd.dailyContent)
            .Where(g => g.Count() > 1)
            .Select(g => new { dailyContent = g.Key, firstId = g.First().dailyDetailId })
            .ToList();

        foreach (var item in query)
        {
            Console.WriteLine($"重复值:{item.dailyContent}");
            Console.WriteLine($"首个Id:{item.firstId}");
        }
    }
}

浙公网安备 33010602011771号