C#: using HtmlAgilityPack and ScrapySharp to read a URL and find text

https://github.com/exaphaser/ScrapySharp

https://github.com/zzzprojects/html-agility-pack

https://github.com/atifaziz/Fizzler

https://archive.codeplex.com/?p=fizzlerex

https://github.com/aspnet/blazor

https://github.com/SteveSanderson/Blazor

https://www.mathjax.org/#samples 数学公式

 https://github.com/Ivony/Jumony

https://github.com/GeReV/NSoup

https://github.com/robinvanderknaap/MvcJqGrid

http://www.defenseinnovationmarketplace.mil/strategy.html

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.IO;
using System.Net;
using System.Collections;
using ScrapySharp;
using ScrapySharp.Network;
using ScrapySharp.Core;
using HtmlAgilityPack;


namespace HtmlAgilityPackDemo
{

    /// <summary>
    /// HTML解析利器HtmlAgilityPack
    /// geovindu
    /// 涂聚文
    /// 20180305
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>
        /// Creates the form; all set-up is delegated to InitializeComponent, which lives in
        /// another part of this partial class.
        /// </summary>
        public Form1()
        {
            this.InitializeComponent();
        }
        /// <summary>
        /// Load handler: pre-fills textBox1 with "ln", the value button1_Click later passes
        /// to ParsePageByArea.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void Form1_Load(object sender, EventArgs e)
        {
            const string initialCode = "ln";
            textBox1.Text = initialCode;
        }
        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and returns its whole body as text.
        /// </summary>
        /// <param name="url">Address handed to <see cref="WebClient.OpenRead(string)"/>.</param>
        /// <returns>The response body decoded with <see cref="Encoding.Default"/>.</returns>
        public static string GetWebClient(string url)
        {
            // The using blocks release the client, the response stream and the reader even
            // when the download or the read throws; the original only closed the stream on
            // the success path.
            using (WebClient client = new WebClient())
            using (Stream stream = client.OpenRead(url))
            // Mind the encoding: Encoding.Default is the machine's default encoding, so the
            // decoded text depends on where this runs.
            using (StreamReader reader = new StreamReader(stream, Encoding.Default))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Downloads the history index page of one area and extracts every city link on it.
        /// </summary>
        /// <param name="cityCode">Area code inserted into the page URL (the form passes the text-box value, initially "ln").</param>
        /// <param name="listcity">Receives one <see cref="CityList"/> per link found; empty, never null, when nothing matched.</param>
        /// <returns>One "name:code" entry per city, each preceded by "\r\n"; an empty string when nothing matched.</returns>
        public string ParsePageByArea(String cityCode, out List<CityList> listcity)
        {
            StringBuilder stp = new StringBuilder();
            listcity = new List<CityList>();

            // Build the URL from the link pattern and the area code
            String url = String.Format("http://www.tianqihoubao.com/lishi/{0}.htm", cityCode);

            // Download the page source and load it into a document object
            var doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(GetWebClient(url));

            // Container located by absolute XPath
            var res = doc.DocumentNode.SelectSingleNode(@"/html[1]/body[1]/div[1]/div[6]/div[1]/div[1]/div[3]");
            if (res == null)
            {
                return "";
            }

            // SelectNodes returns null, not an empty collection, when nothing matches,
            // so every lookup below is null-checked before use.
            var groups = res.SelectNodes(@"dl");
            if (groups == null)
            {
                return "";
            }

            foreach (var group in groups)
            {
                var dd = group.SelectSingleNode(@"dd");
                var links = dd == null ? null : dd.SelectNodes("a");
                if (links == null)
                {
                    continue;
                }

                foreach (var link in links)
                {
                    // The city code is the path segment just before the extension in href
                    var parts = link.GetAttributeValue("href", "").Trim().Split('/', '.');
                    if (parts.Length < 2)
                    {
                        // Missing href or no "name.ext" shape: nothing to extract
                        continue;
                    }

                    var name = link.InnerText.Trim();
                    var code = parts[parts.Length - 2];

                    stp.Append("\r\n" + string.Format("{0}:{1}", name, code));
                    listcity.Add(new CityList { CityName = name, CityCode = code });
                }
            }

            return stp.ToString();
        }
        /// <summary>
        /// Downloads one city's monthly history page and turns each four-cell table row into a
        /// <see cref="WeatherList"/> record.
        /// Example page: http://www.tianqihoubao.com/lishi/dalian/month/201802.html
        /// </summary>
        /// <param name="cityCode">City code inserted into the page URL.</param>
        /// <param name="year">Year written into the URL.</param>
        /// <param name="month">Month written into the URL as two digits.</param>
        /// <param name="wea">Receives the parsed rows; empty, never null, when nothing matched.</param>
        /// <returns>The rows as "date:climate,temperature,wind" entries, concatenated without a separator.</returns>
        public string ParsePageByCityMonth(String cityCode, Int32 year, Int32 month, out List<WeatherList> wea)
        {
            StringBuilder stp = new StringBuilder();
            wea = new List<WeatherList>();

            // Build the URL from the city code and the year/month
            String url = String.Format("http://www.tianqihoubao.com/lishi/{0}/month/{1}{2:D2}.html", cityCode, year, month);

            // Download the page source and load it into a document object
            var doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(GetWebClient(url));

            // Table located by absolute XPath
            var res = doc.DocumentNode.SelectSingleNode(@"/html[1]/body[1]/div[2]/div[6]/div[1]/div[1]/table[1]");
            if (res == null)
            {
                return "";
            }

            // SelectNodes returns null when nothing matches
            var rows = res.SelectNodes(@"tr");
            if (rows == null)
            {
                return "";
            }

            // The first row is the table header; skip it while enumerating instead of
            // removing it from the parsed collection.
            foreach (var row in rows.Skip(1))
            {
                // Expected cells: date - climate - temperature - wind
                var cells = row.SelectNodes(@"td");
                if (cells == null || cells.Count != 4)
                {
                    continue;
                }

                var date1 = NormalizeCellText(cells[0].InnerText);
                var tq = NormalizeCellText(cells[1].InnerText);
                var qw = NormalizeCellText(cells[2].InnerText);
                var fx = NormalizeCellText(cells[3].InnerText);

                // NOTE(review): entries are appended with no separator, unlike
                // ParsePageByArea which prefixes "\r\n" — kept as-is for callers.
                stp.Append(string.Format("{0}:{1},{2},{3}", date1, tq, qw, fx));

                WeatherList wt = new WeatherList();
                wt.Climate = tq;
                // DateTime.Parse uses the current culture and throws FormatException
                // when the cell text is not a recognisable date.
                wt.Date = DateTime.Parse(date1);
                wt.Temperature = qw;
                wt.WindDirection = fx;
                wea.Add(wt);
            }

            return stp.ToString();
        }

        /// <summary>
        /// Removes CR/LF pairs and spaces from a table cell's text, then trims what is left.
        /// </summary>
        private static string NormalizeCellText(string text)
        {
            return text.Replace("\r\n", "").Replace(" ", "").Trim();
        }
        /// <summary>
        /// Downloads a page with ScrapySharp's ScrapingBrowser and returns its title text
        /// followed by the inner HTML of any div whose id is "endText".
        /// Example page: http://www.dusystem.com/geovindu.html
        /// </summary>
        /// <param name="url">Absolute address of the page to download.</param>
        /// <returns>The concatenated text; an empty string when neither query matches.</returns>
        public string getHtmlTitle(string url)
        {
            StringBuilder titl = new StringBuilder();

            var browser = new ScrapingBrowser();
            var doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(browser.DownloadString(new Uri(url)));
            var html = doc.DocumentNode;

            // "//title" searches the whole document. The original relative "title" only
            // looked at direct children of the document node, and SelectNodes returns null
            // on no match, which made the loop throw.
            var titles = html.SelectNodes("//title");
            if (titles != null)
            {
                foreach (var htmlNode in titles)
                {
                    titl.Append(htmlNode.InnerText);
                }
            }

            // The original called Elements("div#endText"), which compares tag names and so
            // cannot match a CSS-style selector. This XPath is the equivalent of that selector.
            // NOTE(review): the original applied it to the children of p nodes — confirm
            // that any div#endText in the page is what is wanted.
            var contents = html.SelectNodes("//div[@id='endText']");
            if (contents != null)
            {
                foreach (var htmlNode in contents)
                {
                    titl.Append(htmlNode.InnerHtml);
                }
            }

            return titl.ToString();
        }
        /// <summary>
        /// Click handler: looks up the cities for the code typed in textBox1, shows the raw
        /// result in richTextBox1 and binds the city list to comboBox1.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            List<CityList> cities;
            string areaCode = textBox1.Text.Trim();
            richTextBox1.Text = ParsePageByArea(areaCode, out cities);

            // Show the city name, keep the city code as the selected value
            comboBox1.DataSource = cities;
            comboBox1.DisplayMember = "CityName";
            comboBox1.ValueMember = "CityCode";
        }
        /// <summary>
        /// Click handler: loads the previous calendar month's weather for the city selected in
        /// comboBox1, shows the raw text in richTextBox2 and binds the rows to dataGridView1.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void button2_Click(object sender, EventArgs e)
        {
            // SelectedValue is null until a city list has been bound and an item is selected
            object selectedCity = this.comboBox1.SelectedValue;
            if (selectedCity == null)
            {
                return;
            }

            // AddMonths(-1) rolls January back to December of the previous year; the original
            // "Month - 1" produced month 0 with the current year.
            DateTime previousMonth = DateTime.Now.AddMonths(-1);

            List<WeatherList> list;
            this.richTextBox2.Text = ParsePageByCityMonth(selectedCity.ToString(), previousMonth.Year, previousMonth.Month, out list);
            this.dataGridView1.DataSource = list;
        }

    }
    /// <summary>
    /// One city entry scraped from an area page: display name plus the code used in URLs.
    /// </summary>
    public class CityList
    {
        /// <summary>
        /// City name as shown in the link text.
        /// </summary>
        public string CityName
        {
            get;
            set;
        }

        /// <summary>
        /// City code taken from the link's href.
        /// </summary>
        public string CityCode
        {
            get;
            set;
        }
    }

    /// <summary>
    /// One day's weather record: climate, temperature, wind direction and date.
    /// </summary>
    public class WeatherList
    {
        /// <summary>
        /// Climate (weather condition) text.
        /// </summary>
        public string Climate
        {
            get;
            set;
        }

        /// <summary>
        /// Temperature text.
        /// </summary>
        public string Temperature
        {
            get;
            set;
        }

        /// <summary>
        /// Wind direction text.
        /// </summary>
        public string WindDirection
        {
            get;
            set;
        }

        /// <summary>
        /// Date the record applies to.
        /// </summary>
        public DateTime Date
        {
            get;
            set;
        }
    }
   
}

  

  /// <summary>
  /// Demo click handler: downloads one fixed monthly weather page, strips script and style
  /// elements, then exercises a series of HtmlAgilityPack node members and writes the
  /// results to the console.
  /// </summary>
  /// <remarks>
  /// NOTE(review): in this listing the method sits after the namespace's closing brace; it
  /// presumably belongs inside Form1 (it calls GetWebClient) — confirm before compiling.
  /// NOTE(review): most lookups below are dereferenced without a null check, so the method
  /// throws as soon as the page does not have the shape the author expected.
  /// NOTE(review): several trailing comments quote sample output about a ul/li structure
  /// that this code does not query; they look carried over from another example.
  /// </remarks>
  /// <param name="sender">Event source (not used).</param>
  /// <param name="e">Event data (not used).</param>
  private void button3_Click(object sender, EventArgs e)
        {
            // year and mont are computed but not used anywhere below
            int year = DateTime.Now.Year;
            int mont = DateTime.Now.Month - 1;
            string url = "http://www.tianqihoubao.com/lishi/dalian/month/201802.html";
            var docText = GetWebClient(url);
            HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();           
            
            document.LoadHtml(docText);

           // document.OptionOutputAsXml = true;

            // First div in the document, or null when there is none (not used below)
            var divname = document.DocumentNode.Descendants("div").FirstOrDefault();

            // NOTE(review): Single() throws unless exactly one node matched
            var body = document.DocumentNode.SelectNodes("//body").Single();

            // Not used below; same Single() caveat as above
            var ta = document.DocumentNode.SelectNodes("//table").Single();

            // ToArray() snapshots the matches so nodes can be removed while looping
            foreach (var script in document.DocumentNode.Descendants("script").ToArray())
                script.Remove();
            foreach (var style in document.DocumentNode.Descendants("style").ToArray())
                style.Remove();

           // foreach (var comment in document.DocumentNode.SelectNodes("//comment()").ToArray())
            //    comment.Remove();// code added later

            //document.DocumentNode.SelectSingleNode("//div[@id='myTrips']").SelectNodes(".//li");
            // Inner HTML of every row of the table with class 'b' (per the original note, the weather table)
            List<string> paragraphs = document.DocumentNode.SelectNodes("//table[@class='b']//tr").Select(paragraphNode => paragraphNode.InnerHtml).ToList();

            // value attribute of the first input found inside a td (not used below)
            string name = document.DocumentNode.SelectSingleNode("//td/input").Attributes["value"].Value;


           // List<string> paragraphs = document.DocumentNode.SelectNodes("//table[contains(@class, 'b')]//tr").Select(paragraphNode => paragraphNode.InnerHtml).ToList();////b: is class name
            //XPath: /html[1]/body[1]/form[1]/div[2]/div[6]/div[1]/div[1]/table[1]/tr[1]
            HtmlNode tablenode = document.DocumentNode.SelectSingleNode("//table[@class='b']//tr");     // 'b' is the class name; looks a node up by XPath, much like XmlNode

            

            // "//*" with SelectSingleNode yields the first element of the document
            HtmlNode node = document.DocumentNode.SelectSingleNode("//*");



            IEnumerable<HtmlNode> nodeList = node.Ancestors();  // all ancestor nodes of this element
            foreach (HtmlNode item in nodeList)
            {
                Console.Write(item.Name + " ");   // original author's sample output: div div body html #document
            }

            HtmlAttributeCollection attrs = node.Attributes;
            foreach (var item in attrs)
            {
                Console.WriteLine(item.Name + " : " + item.Value);    // original author's sample output: class :user_match clear
            }

            HtmlNodeCollection CNodes = node.ChildNodes;    // all child nodes
            foreach (HtmlNode item in CNodes)
            {
                Console.WriteLine(item.Name + "-" + item.InnerText);  // remember that text nodes count as children too
            }

            HtmlAttributeCollection attrs1 = node.ClosingAttributes;    // attributes written on the closing tag, e.g. </ul class="">
            Console.WriteLine(attrs1.Count);    // original author's sample output: 0

            HtmlNode node1 = node.FirstChild;   // original note: the first child turned out to be a "\n" text node, the first li only came second
            Console.WriteLine(node1.NodeType);  // original author's sample output: Text
            HtmlNode node3 = node.LastChild;    // original note: the last child was likewise a "\n" text node
            Console.WriteLine(node3.NodeType);  // original author's sample output: Text

            // NOTE(review): SelectSingleNode returns null when node has no child div; node2 is dereferenced unchecked
            HtmlNode node2 = node.SelectSingleNode("child::div[1]");     // first child div of node (the original comment said "li")
            Console.WriteLine(node2.XPath);     // XPath expression generated for the node

            Console.WriteLine(node.HasAttributes);          // whether the node has attributes
            Console.WriteLine(node.HasChildNodes);          // whether the node has child nodes
            Console.WriteLine(node.HasClosingAttributes);   // whether the closing tag carries attributes

            Console.WriteLine(node.Line);           // line of the page source where the node's opening tag starts
            Console.WriteLine(node.LinePosition);   // column where the node's opening tag starts
            Console.WriteLine(node.NodeType);       // type of this node
            Console.WriteLine(node.OriginalName);   // original author's sample output: ul
            HtmlNode node4 = node.SelectSingleNode("child::div[1]");
            Console.WriteLine(node4.InnerText);
            HtmlNode node5 = node4.NextSibling.NextSibling;     // next sibling element; stepped twice because, per the original note, a newline text node sits in between
            Console.WriteLine(node5.InnerText);
            HtmlNode node6 = node5.PreviousSibling.PreviousSibling;     // stepped back twice for the same reason
            Console.WriteLine(node6.InnerText);
            HtmlNode node7 = node6.ParentNode;      // parent node
            Console.WriteLine(node7.Name);          // original author's sample output: ul
            string str = node.OuterHtml;
            Console.WriteLine(str);     // the node's complete outer HTML
            Console.WriteLine(node.StreamPosition); // position of this node in the stream, relative to the start of the page source

            // Document that owns the node (not used below)
            HtmlAgilityPack.HtmlDocument doc1 = node.OwnerDocument;

            // NOTE(review): "//div" is an absolute path, so this matches every div in the document, not only those under body
            foreach (HtmlAgilityPack.HtmlNode div in body.SelectNodes("//div"))
            {
                var classValue = div.Attributes["class"] == null ? null : div.Attributes["class"].Value;

                if (classValue == "first")
                {
                    //write innerText into a table at place [i][column1]
                }
                else if (classValue == "second")
                {
                    //write innerText into the same table in [i][column2]
                }
            }

            // NOTE(review): Single(predicate) throws unless exactly one div has class "first"; the result is not used
            string innerText1 = document.DocumentNode.SelectSingleNode("//body").SelectNodes("//div").Single(n => n.Attributes.Any(a => a.Name == "class" && a.Value == "first")).InnerText;
        }

  

 

posted @ 2018-03-05 15:03  ®Geovin Du Dream Park™  阅读(712)  评论(3编辑  收藏  举报