posts - 7,  comments - 24,  trackbacks - 0
		/// <summary>
		/// Downloads the resource at <paramref name="url"/> with an HTTP GET request
		/// (sent through the <c>proxy</c> member) and returns the response body as text.
		/// The downloaded text is also written to the debug output.
		/// </summary>
		/// <param name="url">Address to fetch.</param>
		/// <returns>The full response body.</returns>
		/// <exception cref="System.Net.WebException">The request failed or the server returned an error status.</exception>
		public string HttpGetText(string url)
		{
			HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
			request.Method = "GET";
			request.ContentType = @"application/x-www-form-urlencoded";
			request.Accept = @"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
			// Only the header VALUE goes here; the header name is supplied by the enum.
			request.Headers.Add(HttpRequestHeader.AcceptLanguage, @"zh-CN,zh;q=0.8");
			request.UserAgent = @"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0";
			request.Proxy = proxy;

			string text;
			// Dispose response, stream and reader even when reading throws, so the
			// underlying connection is always released.
			using (WebResponse response = request.GetResponse())
			using (Stream stream = response.GetResponseStream())
			using (var reader = new StreamReader(stream))
			{
				text = reader.ReadToEnd();
			}

			string separator = new string('=', 20);
			System.Diagnostics.Debug.WriteLine(separator);
			System.Diagnostics.Debug.WriteLine(text);
			System.Diagnostics.Debug.WriteLine(separator);
			return text;
		}

/// <summary>
/// Container for the proxy entries collected from the proxy list page.
/// </summary>
public class IPs
{
	/// <summary>
	/// A single proxy entry.
	/// </summary>
	public class proxy
	{
		/// <summary>Proxy IP address as text.</summary>
		public string ip;

		/// <summary>Proxy port number.</summary>
		public int port;

		/// <summary>Location text shown for the proxy.</summary>
		public string address;

		/// <summary>Speed value.</summary>
		public int speed;

		/// <summary>Lifetime in minutes.</summary>
		public int life;

		/// <summary>Check time of the proxy entry.</summary>
		public DateTime check_time;
	}

	/// <summary>All collected proxy entries; starts out empty.</summary>
	public List<proxy> items = new List<proxy>();
}
		/// <summary>
		/// Downloads the proxy list page, cuts out the <c>ip_list</c> table, extracts
		/// every proxy row with a regular expression, and adds each successfully
		/// parsed row to <c>listView1</c> and <c>IPaddress.items</c>.
		/// Rows whose numeric or date fields cannot be converted are logged and skipped.
		/// </summary>
		/// <param name="sender">Event source (not used).</param>
		/// <param name="e">Event data (not used).</param>
		private void button1_Click(object sender, EventArgs e)
		{
			const string tableOpen = "<table id=\"ip_list\">";
			const string tableClose = "</table>";

			string html;
			try
			{
				html = HttpGetText("http://www.xicidaili.com/nt");
			}
			catch (System.Net.WebException ex)
			{
				// A network failure must not escape the UI event handler.
				LogAdd("获取IP代理列表失败 " + ex.Message);
				return;
			}

			// Ordinal search: this is markup, not linguistic text. The closing tag is
			// searched from the opening tag onward so an earlier table cannot be picked up.
			int start = html.IndexOf(tableOpen, StringComparison.Ordinal);
			int end = start < 0 ? -1 : html.IndexOf(tableClose, start, StringComparison.Ordinal);
			if (end < 0)
			{
				// Page layout changed or an error page was returned; nothing to parse.
				LogAdd("未找到IP代理列表表格");
				return;
			}

			string ip_list = html.Substring(start, end - start + tableClose.Length);
			var find = new Regex(@"<tr.*?>\s*?<td.*?>.*?</td>\s*?<td.*?>(?<ip>.*?)</td>\s*?<td.*?>(?<port>.*?)</td>\s*?<td.*?>\s*?<a.*?>(?<address>.*?)</a>\s*?</td>.*?width:(?<speed>.*?)%.*?<td>(?<life>.*?)</td>.*?<td>(?<check_time>.*?)</td>.*?</tr>", RegexOptions.IgnoreCase | RegexOptions.Singleline);

			listView1.BeginUpdate();
			try
			{
				foreach (System.Text.RegularExpressions.Match item in find.Matches(ip_list))
				{
					try
					{
						// Convert everything first so a bad row leaves no half-built entry behind.
						var ip = new IPs.proxy
						{
							ip = item.Groups["ip"].Value,
							port = Convert.ToInt32(item.Groups["port"].Value),
							address = item.Groups["address"].Value,
							speed = Convert.ToInt32(item.Groups["speed"].Value),
							life = conv(item.Groups["life"].Value),
							check_time = Convert.ToDateTime(item.Groups["check_time"].Value)
						};

						var lvi = new ListViewItem(ip.ip);
						lvi.SubItems.Add(item.Groups["port"].Value);
						lvi.SubItems.Add(ip.address);
						lvi.SubItems.Add(item.Groups["speed"].Value);
						lvi.SubItems.Add(item.Groups["life"].Value);
						lvi.SubItems.Add(item.Groups["check_time"].Value);

						listView1.Items.Add(lvi);
						IPaddress.items.Add(ip);
					}
					catch (Exception ex) when (ex is FormatException || ex is OverflowException)
					{
						// Only conversion failures are expected here; anything else is a real bug.
						LogAdd("转换IP地址信息出错 " + item.Value);
					}
				}
			}
			finally
			{
				// Always re-enable list redrawing, even if an unexpected exception escapes.
				listView1.EndUpdate();
			}

			// Converts a lifetime such as "3天", "5小时" or "10分钟" into minutes.
			// Text without a recognized unit is treated as a plain number of minutes.
			int conv(string life)
			{
				int minutesPerUnit = 1;
				if (life.Contains("天"))
				{
					minutesPerUnit = 60 * 24;
					life = life.Replace("天", "");
				}
				else if (life.Contains("分钟"))
				{
					minutesPerUnit = 1;
					life = life.Replace("分钟", "");
				}
				else if (life.Contains("小时"))
				{
					minutesPerUnit = 60;
					life = life.Replace("小时", "");
				}
				return Convert.ToInt32(life) * minutesPerUnit;
			}
		}

  

关键代码就是获取指定网页里的IP代理信息,然后用正则表达式提取出来

本来想着直接将 HTML 转换为 XML 再解析,谁知该网页写得不规范,转换不成功。

只有用正则来查找了,效果不错~

代码运行环境: vs2017

当然老版本也可以,将局部函数代码放到外部即可。

效果图:

 

关键代码部分:

// Download the proxy list page as text.
var html= HttpGetText("http://www.xicidaili.com/nt");
			// Locate the opening tag of the ip_list table and the first closing table tag.
			// NOTE(review): IndexOf returns -1 when the text is not found, and the second
			// search starts at the beginning of the page rather than at i1 — confirm the
			// page always contains this table before any other.
			int i1= html.IndexOf("<table id=\"ip_list\">");
			int i2= html.IndexOf("</table>");
			// Cut out the table markup, including the closing tag.
			string ip_list = html.Substring(i1, i2 - i1+ "</table>".Length);
			// One match per table row; the named groups capture ip, port, address,
			// speed (the number before '%' after "width:"), life and check_time.
			var find = new Regex(@"<tr.*?>\s*?<td.*?>.*?</td>\s*?<td.*?>(?<ip>.*?)</td>\s*?<td.*?>(?<port>.*?)</td>\s*?<td.*?>\s*?<a.*?>(?<address>.*?)</a>\s*?</td>.*?width:(?<speed>.*?)%.*?<td>(?<life>.*?)</td>.*?<td>(?<check_time>.*?)</td>.*?</tr>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
			var ips= find.Matches(ip_list);

推荐一个正则表达式的教程网址及学习工具:

http://deerchao.net/tutorials/regex/regex.htm#charclass

 

我本人也记不住正则表达式,需要用的时候现查。

posted on 2017-04-20 13:58 fxyc87 阅读(...) 评论(...) 编辑 收藏