HtmlAgilityPack抓取搜房网数据简单示例

HtmlAgilityPack是一个开源的解析HTML元素的类库,最大的特点是可以通过XPath来解析HMTL,如果您以前用C#操作过XML,那么使用起HtmlAgilityPack也会得心应手。目前最新版本为1.4.6。

 

程序示例如下:

 

 

代码如下:

using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace WinformHtmlAgilityPack
{
    public partial class Form1 : Form
    {

        public Form1()
        {
            InitializeComponent();
        }
        private void Form1_Load(object sender, EventArgs e)
        {
            Thread thread = new Thread(DownloadData);
            thread.Start();
        }
        private void DownloadData()
        {
            GZipWebClient c = new GZipWebClient();
            List<HouseModel> houseList = new List<HouseModel>();
            int pageCount = 50;
            int index = 1;
            for (int m = 1; m <= pageCount; m++)
            {
                byte[] data = c.DownloadData(new Uri("http://newhouse.fang.com/house/s/b9" + m + "/"));
                string strx = Encoding.Default.GetString(data);

                HtmlWeb htmlWeb = new HtmlWeb();
                HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
                htmlDoc.LoadHtml(strx);


                //HtmlNode node = htmlDoc.GetElementbyId("sjina_C01_37");
                //HtmlNode node = htmlDoc.DocumentNode.SelectSingleNode("//div[@class='nl_con clearfix']");
                //HtmlNode list = node.SelectSingleNode("//ul");

                //根据xpath ul下的li   li[2]代表索引   得到 名称  
                //HtmlNode nodeli_name = htmlDoc.DocumentNode.SelectSingleNode("//*[@id='bx1']/div/div[1]/div[1]/div/div/ul/li[1]/div/div[2]/div[1]/div[1]/a");
                //根据xpath ul下的li   li[2]代表索引   得到 价格
                //HtmlNode nodeli_price = htmlDoc.DocumentNode.SelectSingleNode("//*[@id='bx1']/div/div[1]/div[1]/div/div/ul/li[3]/div/div[2]/div[4]/span");
                for (int i = 1; i <= 20; i++)
                {
                    //通过Xpath选中html的指定元素

                    HouseModel houseModel = new HouseModel();
                    HtmlNode nodeli_name = htmlDoc.DocumentNode.SelectSingleNode("//*[@id='bx1']/div/div[1]/div[1]/div/div/ul/li[" + i + "]/div/div[2]/div[1]/div[1]/a");
                    if (nodeli_name != null)
                    {
                        houseModel.楼盘名称 = NoHTML(nodeli_name.InnerHtml);
                    }
                    HtmlNode nodeli_price = htmlDoc.DocumentNode.SelectSingleNode("//*[@id='bx1']/div/div[1]/div[1]/div/div/ul/li[" + i + "]/div/div[2]/div[4]/span");
                    if (nodeli_price != null)
                    {
                        houseModel.价格 = NoHTML(nodeli_price.InnerHtml);
                    }
                    HtmlNode nodeli_unit = htmlDoc.DocumentNode.SelectSingleNode("//*[@id='bx1']/div/div[1]/div[1]/div/div/ul/li[" + i + "]/div/div[2]/div[4]/em");
                    if (nodeli_unit != null)
                    {
                        houseModel.单位 = NoHTML(nodeli_unit.InnerHtml);
                    }

                    HtmlNode nodeli_address = htmlDoc.DocumentNode.SelectSingleNode("//*[@id='bx1']/div/div[1]/div[1]/div/div/ul/li[" + i + "]/div/div[2]/div[2]/div[1]/a");
                    if (nodeli_address != null)
                    {
                        string district = nodeli_address.InnerHtml.Split(']')[0];
                        houseModel.所在区域 = NoHTML(district).Substring(1);
                        houseModel.地址 = NoHTML(nodeli_address.InnerHtml);
                    }
                    houseModel.序号 = index;
                    houseList.Add(houseModel);
                    this.label1.Invoke(new MethodInvoker(delegate
                    {
                        this.label1.Text = "已经抓取:"+houseList.Count.ToString()+" 条.....";

                    }));
                    index++;

                    this.dataGridView1.Invoke(new MethodInvoker(delegate
                    {
                        this.dataGridView1.DataSource = null;
                        this.dataGridView1.DataSource = houseList;
                        this.dataGridView1.Columns[5].Width = 500;

                        this.dataGridView1.FirstDisplayedScrollingRowIndex = this.dataGridView1.Rows.Count - 1; 
                    }));

                    Thread.Sleep(100);
                }
            }
            this.label1.Invoke(new MethodInvoker(delegate
            {
                this.label1.Text = "已经抓取完毕。";

            }));
        }
        public class GZipWebClient : WebClient
        {
            protected override WebRequest GetWebRequest(Uri address)
            {
                HttpWebRequest webrequest = (HttpWebRequest)base.GetWebRequest(address);
                webrequest.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
                return webrequest;
            }
        }
        public class HouseModel
        {
            public int 序号 { get; set; }
            public string 楼盘名称 { get; set; }
            public string 所在区域 { get; set; }
            public string 价格 { get; set; }
            public string 单位 { get; set; }
            public string 地址 { get; set; }
        }


        private void btn_exportexcel_Click(object sender, EventArgs e)
        {
            DataGridViewToExcel(this.dataGridView1);
        }
        #region
        private void DataGridViewToExcel(DataGridView dgv)
        {
            SaveFileDialog dlg = new SaveFileDialog();
            dlg.Filter = "Execl files (*.xls)|*.xls";
            dlg.FilterIndex = 0;
            dlg.RestoreDirectory = true;
            dlg.CreatePrompt = true;
            dlg.Title = "保存为Excel文件";

            if (dlg.ShowDialog() == DialogResult.OK)
            {
                Stream myStream;
                myStream = dlg.OpenFile();
                StreamWriter sw = new StreamWriter(myStream, System.Text.Encoding.GetEncoding(-0));
                string columnTitle = "";
                try
                {
                    //写入列标题   
                    for (int i = 0; i < dgv.ColumnCount; i++)
                    {
                        if (i > 0)
                        {
                            columnTitle += "\t";
                        }
                        columnTitle += dgv.Columns[i].HeaderText;
                    }
                    sw.WriteLine(columnTitle);

                    //写入列内容   
                    for (int j = 0; j < dgv.Rows.Count; j++)
                    {
                        string columnValue = "";
                        for (int k = 0; k < dgv.Columns.Count; k++)
                        {
                            if (k > 0)
                            {
                                columnValue += "\t";
                            }
                            if (dgv.Rows[j].Cells[k].Value == null)
                                columnValue += "";
                            else
                                columnValue += dgv.Rows[j].Cells[k].Value.ToString().Trim();
                        }
                        sw.WriteLine(columnValue);
                    }
                    sw.Close();
                    myStream.Close();
                }
                catch (Exception e)
                {
                    MessageBox.Show(e.ToString());
                }
                finally
                {
                    sw.Close();
                    myStream.Close();
                }
            }
        }
        #endregion
        public static string NoHTML(string Htmlstring)
        {
            //删除脚本   
            Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase);
            //删除HTML   
            Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

            Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", "   ", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

            Htmlstring.Replace("<", "");
            Htmlstring.Replace(">", "");
            Htmlstring.Replace("\r\n", "");
            return Htmlstring;
        } 
    }
}

 源码地址:https://files.cnblogs.com/files/sunyj/WinformHtmlAgilityPack.rar

posted @ 2015-12-07 09:32  青春岁月,无怨无悔  阅读(504)  评论(0编辑  收藏  举报