.NET 爬虫
最近经常听说或者接触到关于网络爬虫的问题,只是一直在看别人写的代码,而没有真正动手实践过。
昨天做了一下尝试,其中采用了网络上流行的扩展类库 Html Agility Pack:http://html-agility-pack.net/?z=codeplex
遇到的问题是:部分网站禁止爬虫,或者有规则验证,无法通过模拟 HTTP 请求获取 HTML。
本测试案例通过模拟 HTTP 请求获取 HTML,再通过 Html Agility Pack 分析节点,获取对应节点的值。本案例采用的是赶集网的数据。
代码如下:
/// <summary>
/// Parses a second-hand housing listing page and inserts one row per listing
/// into the <c>room</c> table.
/// </summary>
/// <param name="html">Raw HTML of the listing page.</param>
/// <remarks>
/// Each listing is a <c>div</c> with class <c>f-list-item ershoufang-list</c>.
/// A field whose node is missing falls back to its default value instead of
/// discarding the whole listing. Nothing is sent to the database when the page
/// contains no listings.
/// </remarks>
private static void ClearnHtml(string html)
{
    var htmlDoc = new HtmlAgilityPack.HtmlDocument();
    htmlDoc.LoadHtml(html);

    // SelectNodes returns null (not an empty collection) when nothing matches.
    HtmlAgilityPack.HtmlNodeCollection listings =
        htmlDoc.DocumentNode.SelectNodes("*//div[@class='f-list-item ershoufang-list']");
    if (listings == null)
    {
        return;
    }

    var rooms = new List<Room>();
    foreach (HtmlAgilityPack.HtmlNode item in listings)
    {
        if (item == null)
        {
            continue;
        }

        rooms.Add(new Room
        {
            title = ReadNodeText(item, "*//a[@class='js-title value title-font']", "0"),
            Type = ReadNodeText(item, "*//span[@class='first js-huxing']", "1"),
            buju = ReadNodeText(item, "*//dd[@class='dd-item size']/span[3]", "0"),
            mianji = ReadNodeText(item, "*//dd[@class='dd-item size']/span[5]", "0"),
            Direction = ReadNodeText(item, "*//dd[@class='dd-item size']/span[7]", "0"),
            Floor = ReadNodeText(item, "*//dd[@class='dd-item size']/span[9]", "0"),
            zhuangxiu = ReadNodeText(item, "*//span[@class='last']", "0"),
            area = ReadNodeText(item, "*//span[@class='area']", "0"),
            // The feature block is additionally trimmed of surrounding whitespace.
            feature = ReadNodeText(item, "*//dd[@class='dd-item feature']", "0").Trim(),
            Price = ReadNodeText(item, "*//div[@class='price']/span[1]", "0"),
        });
    }

    if (rooms.Count == 0)
    {
        return;
    }

    // Scraped text is untrusted: bind it as parameters rather than splicing it
    // into the SQL text, so quotes in a listing can neither break nor hijack
    // the statement.
    const string insertSql =
        "insert into room(title,Type,buju,mianji,Direction,Floor,zhuangxiu,area,feature,Price)" +
        "values(@title,@Type,@buju,@mianji,@Direction,@Floor,@zhuangxiu,@area,@feature,@Price);";

    // Project to anonymous objects so parameter binding does not depend on how
    // Room declares its members.
    var rows = rooms
        .Select(r => new
        {
            r.title,
            r.Type,
            r.buju,
            r.mianji,
            r.Direction,
            r.Floor,
            r.zhuangxiu,
            r.area,
            r.feature,
            r.Price,
        })
        .ToList();

    // NOTE(review): credentials are hard-coded here; they should come from configuration.
    using (var connection = new MySqlConnection("Server=127.0.0.1;Database=personal;Uid=ken;Pwd=123456;"))
    {
        // Passing a sequence makes Dapper run the statement once per element.
        connection.Execute(insertSql, rows);
    }
}

/// <summary>
/// Returns the inner text of the first node matching <paramref name="xpath"/>
/// under <paramref name="item"/>, with newlines and spaces removed, or
/// <paramref name="fallback"/> when no node matches.
/// </summary>
private static string ReadNodeText(HtmlAgilityPack.HtmlNode item, string xpath, string fallback)
{
    // SelectSingleNode yields null on no match, so the fallback is reachable.
    HtmlAgilityPack.HtmlNode node = item.SelectSingleNode(xpath);
    if (node == null)
    {
        return fallback;
    }

    return node.InnerText.Replace("\n", "").Replace(" ", "");
}

浙公网安备 33010602011771号