爬虫
爬虫:该实例从挂号网按“医院 -> 科室 -> 医生列表 -> 医生详情”的顺序逐层抓取,
获取医生的详细信息。
// Crawler sample: starting from one hospital page on guahao.com, walk
// hospital -> departments -> doctor list pages -> doctor detail page and print
// each doctor's details to the console. Requires a reference to HtmlAgilityPack.
//
// HtmlAgilityPack's SelectNodes/SelectSingleNode return null when nothing matches,
// so every lookup below is null-checked; an entry with missing markup is skipped
// (or printed with an empty field) instead of aborting the whole crawl.

// Strips line breaks and <span> wrappers from a node's inner HTML; a missing node yields "".
Func<HtmlNode, string> cleanHtml = n =>
    n == null
        ? string.Empty
        : n.InnerHtml.Replace("\r\n", "").Replace("<span>", "").Replace("</span>", "");

// Reads an attribute value; a missing node or attribute yields "".
Func<HtmlNode, string, string> attr = (n, key) =>
    (n == null || n.Attributes[key] == null) ? string.Empty : n.Attributes[key].Value;

// WebClient is IDisposable: one client per crawl level, all released when the crawl ends.
using (WebClient rootpage = new WebClient())
using (WebClient dpt = new WebClient())
using (WebClient dordetail = new WebClient())
{
    rootpage.Encoding = Encoding.UTF8;
    rootpage.BaseAddress = "http://www.guahao.com/hospital/";
    dpt.Encoding = Encoding.UTF8;
    dordetail.Encoding = Encoding.UTF8;

    string url = "125336070937502"; // Shanghai Zhongshan Hospital
    string html = rootpage.DownloadString(url); // download the hospital page

    HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
    document.LoadHtml(html);
    HtmlNode rootnode = document.DocumentNode;

    // Department entries are the <span data-exp="1"> elements of the hospital page.
    HtmlNodeCollection list = rootnode.SelectNodes("//span[@data-exp='1']");

    // Document objects are reused across iterations; LoadHtml replaces their content.
    HtmlAgilityPack.HtmlDocument deptducoment = new HtmlAgilityPack.HtmlDocument();
    HtmlAgilityPack.HtmlDocument dorducoment = new HtmlAgilityPack.HtmlDocument();

    // Iterate over departments.
    for (int i = 0; list != null && i < list.Count; i++)
    {
        // Read the link by name rather than by attribute position.
        string departmenturl = attr(list[i].SelectSingleNode("a"), "href");
        if (string.IsNullOrEmpty(departmenturl))
        {
            continue;
        }

        // The "shiftcase" variant of the department URL is the doctor listing.
        departmenturl = departmenturl.Replace("department", "department/shiftcase");

        // First listing page of the department.
        deptducoment.LoadHtml(dpt.DownloadString(departmenturl));
        HtmlNode dptroot = deptducoment.DocumentNode;
        HtmlNodeCollection dorlist = dptroot.SelectNodes("//div[@class='doc-info']");

        // Total page count is read from the first <span>'s <label> inside div.other-info.
        // NOTE(review): when that element is absent we fall back to a single page — confirm
        // this matches how the site renders one-page departments.
        int pagecount = 1;
        HtmlNode pager = dptroot.SelectSingleNode("//div[@class='other-info']");
        HtmlNodeCollection pagerSpans = pager == null ? null : pager.SelectNodes("span");
        HtmlNode pageLabel = (pagerSpans == null || pagerSpans.Count == 0)
            ? null
            : pagerSpans[0].SelectSingleNode("label");
        int parsedCount;
        if (pageLabel != null && int.TryParse(pageLabel.InnerHtml, out parsedCount))
        {
            pagecount = parsedCount;
        }

        // Page loop.
        for (int j = 1; j <= pagecount; j++)
        {
            // Page 1 is already loaded above. Every later page must be downloaded BEFORE
            // its doctors are processed; loading after processing would crawl page 1 twice
            // and never process the last page.
            if (j > 1)
            {
                deptducoment.LoadHtml(dpt.DownloadString(departmenturl + "?pageNo=" + j));
                dptroot = deptducoment.DocumentNode;
                dorlist = dptroot.SelectNodes("//div[@class='doc-info']");
            }

            // Doctor loop for the current page.
            for (int k = 0; dorlist != null && k < dorlist.Count; k++)
            {
                // Doctor detail page URL: the <a> inside the first child <div>.
                HtmlNodeCollection infoDivs = dorlist[k].SelectNodes("div");
                HtmlNode detailLink = (infoDivs == null || infoDivs.Count == 0)
                    ? null
                    : infoDivs[0].SelectSingleNode("a");
                string dordetailurl = attr(detailLink, "href");
                if (string.IsNullOrEmpty(dordetailurl))
                {
                    continue;
                }

                dorducoment.LoadHtml(dordetail.DownloadString(dordetailurl));
                HtmlNode dor = dorducoment.DocumentNode;

                // Name, department and hospital all sit on the same anchor as data-* attributes,
                // so look it up once.
                HtmlNode infoPara = dor.SelectSingleNode("//p[@class='doc-fav-num js-info']");
                HtmlNodeCollection infoAnchors = infoPara == null ? null : infoPara.SelectNodes("a");
                if (infoAnchors == null || infoAnchors.Count == 0)
                {
                    continue;
                }

                HtmlNode info = infoAnchors[0];
                string name = attr(info, "data-name");
                string dept = attr(info, "data-dept");
                string hos = attr(info, "data-hos");

                // The portrait is the <img> whose alt text equals the doctor's name. A name
                // containing an apostrophe would break the quoted XPath literal, so skip the
                // lookup in that case.
                string img = name.IndexOf('\'') >= 0
                    ? string.Empty
                    : attr(dor.SelectSingleNode("//img[@alt='" + name + "']"), "src");

                string skill = cleanHtml(dor.SelectSingleNode("//div[@class='skill-msg']"));

                // Prefer the expanded biography; fall back to the short one when the expanded
                // span is missing or empty.
                string word = cleanHtml(dor.SelectSingleNode("//span[@class='more-content']"));
                if (string.IsNullOrEmpty(word))
                {
                    word = cleanHtml(dor.SelectSingleNode("//div[@class='word-msg']"));
                }

                Console.Write("医院:" + hos + " 科室:" + dept + " 姓名:" + name + " 擅长:" + skill + " 简介:" + word + " 图片地址:" + img);
                Console.WriteLine(" ");
            }
        }
    }
}

// Keep the console window open until a key is pressed.
Console.Read();
需要引用 HtmlAgilityPack.dll。
思路是:先请求页面,然后把返回的 HTML 加载为一个 HtmlAgilityPack.HtmlDocument 对象,并从该对象获得根节点:HtmlNode rootnode = document.DocumentNode;
在这个对象里可以用 XPath 的方式进行操作,例如:HtmlNodeCollection list = rootnode.SelectNodes("//span[@data-exp='1']");//搜索data-exp=1的span标签 科室页面
获得需要的节点(html标签)后,可以通过 Attributes 获得该标签的各种数据,如下一层的 url 等,再请求下一层。依次循环,最后获得需要的数据。然后可以组装为 Excel 导出,
或按其他要求保存数据。
浙公网安备 33010602011771号