使用到了以下技术点:
2)使用 WebClient 获取网页源码;
3)使用正则表达式解析网页中需要的数据;
4)使用线程池加快网页数据采集;
5)……
以前写过几次类似的,但是找不到了,又重新写了一个。
代码比较粗糙,求拍砖。
using System;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Windows.Forms;
namespace SpiderMan
{
/// <summary>
/// Main window of the crawler. A button click downloads a fixed range of listing
/// pages on thread-pool threads, extracts the table rows with a regular
/// expression and appends them to <c>textBox1</c>.
/// </summary>
public partial class Form1 : Form
{
    /// <summary>
    /// Number of queued page downloads that have not finished yet.
    /// Only touched through <see cref="Interlocked"/>.
    /// </summary>
    private static int threadCount = 0;

    /// <summary>
    /// Pattern for one table row; the named groups are id, prov, city, city2,
    /// py, qh, yb, dj and bw. Built once and shared by every page parse instead
    /// of being re-created for each downloaded page.
    /// </summary>
    private static readonly Regex RowRegex = new Regex(
        @"<tr[^<]*<td[^>]*>(?<id>\d*?)</td>[^>]*>(?<prov>\w*)</td>[^>]*>[^>]*>(?<city>\w*)</a></td>[^>]*>(?<city2>\w*)</td>[^>]*>(?<py>\w*)</td>[^>]*>(?<qh>\w*)</td>[^>]*>(?<yb>\w*)</td>[^>]*>(?<dj>[\d\.]*)</td>[^>]*>(?<bw>[\d\.]*)</td>[^>]*>",
        RegexOptions.Compiled);

    /// <summary>
    /// Shared source of the politeness delay. Random is not thread-safe, so every
    /// access is made while holding a lock on this instance.
    /// </summary>
    private static readonly Random DelayRandom = new Random();

    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Queues one download per page, then keeps the UI responsive until every
    /// queued download has finished.
    /// </summary>
    /// <param name="sender">Control that raised the click; disabled while the crawl runs.</param>
    /// <param name="e">Unused event data.</param>
    private void button1_Click(object sender, EventArgs e)
    {
        // The wait loop below pumps window messages, so without this guard a
        // second click would start a nested crawl sharing the same counter.
        var trigger = sender as Control;
        if (trigger != null)
        {
            trigger.Enabled = false;
        }

        try
        {
            string urlPattern = "http://www.3464.com/data/zhongguochengshijingweidu/?PageNo={0}";
            int pageFirstIndex = 1;
            int pageLastIndex = 125;

            for (int pageIndex = pageFirstIndex; pageIndex <= pageLastIndex; pageIndex++)
            {
                string url = string.Format(urlPattern, pageIndex);
                Log("开始读取url:" + url);

                // Count the work item before it can run so the counter never
                // drops below the number of outstanding downloads.
                Interlocked.Increment(ref threadCount);
                ThreadPool.QueueUserWorkItem(DownloadAndParse, url);
            }

            // Workers marshal their log output to this thread with Invoke, so the
            // UI thread has to keep pumping messages while it waits for them.
            while (Interlocked.CompareExchange(ref threadCount, 0, 0) > 0)
            {
                Application.DoEvents();
                Loading();

                // Yield briefly instead of spinning a CPU core at 100%.
                Thread.Sleep(10);
            }

            Log("数据采集结束");
        }
        finally
        {
            if (trigger != null)
            {
                trigger.Enabled = true;
            }
        }
    }

    /// <summary>
    /// Thread-pool callback: downloads one page and parses it.
    /// </summary>
    /// <param name="state">The page URL, passed as a string.</param>
    private void DownloadAndParse(object state)
    {
        var url = (string)state;
        try
        {
            string html = GetHttpSource(url);
            ParseHtml(html);
        }
        catch (WebException ex)
        {
            // A failed page must not take the whole process down; report it and move on.
            Log("读取失败:" + url + " " + ex.Message);
        }
        finally
        {
            // Always release the slot, otherwise the wait loop in
            // button1_Click would never terminate after a failure.
            Interlocked.Decrement(ref threadCount);
        }
    }

    /// <summary>
    /// Extracts every table row from the page and logs it as one comma-separated line.
    /// </summary>
    /// <param name="html">Full page source.</param>
    private void ParseHtml(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            Log("解析错误:页面内容为空");
            return;
        }

        // The data table is the text between the "编号" header cell and the next "</table>".
        int beginPos = html.IndexOf("编号", StringComparison.Ordinal);
        int endPos = beginPos < 0
            ? -1
            : html.IndexOf("</table>", beginPos, StringComparison.Ordinal);
        if (endPos < 0)
        {
            // Unexpected page layout (error page, changed markup): skip it instead of throwing.
            Log("解析错误:未找到数据表格");
            return;
        }

        string partHtml = html.Substring(beginPos, endPos - beginPos);

        // Collect all rows of the page and log them in a single call: this costs
        // one cross-thread Invoke per page instead of one per row.
        var rows = new StringBuilder();
        foreach (Match m in RowRegex.Matches(partHtml))
        {
            if (rows.Length > 0)
            {
                rows.Append(Environment.NewLine);
            }

            rows.AppendFormat(
                "{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}",
                m.Groups["id"].Value,
                m.Groups["prov"].Value,
                m.Groups["city"].Value,
                m.Groups["city2"].Value,
                m.Groups["py"].Value,
                m.Groups["qh"].Value,
                m.Groups["yb"].Value,
                m.Groups["dj"].Value,
                m.Groups["bw"].Value);
        }

        if (rows.Length > 0)
        {
            Log(rows.ToString());
        }
    }

    /// <summary>
    /// Simple progress indicator: rewrites the window title with a run of dots
    /// whose length depends on the current title length.
    /// </summary>
    private void Loading()
    {
        if (this.InvokeRequired)
        {
            this.Invoke(new MethodInvoker(Loading));
            return;
        }

        const int maxLength = 100;

        // Clamp at zero: a title shorter than 3 or longer than 100 characters
        // would otherwise yield a negative dot count on this or the next call.
        int residue = Math.Max(0, maxLength - this.Text.Length);
        this.Text = "采集中" + new string('.', residue);
    }

    /// <summary>
    /// Simple console-style output: appends a line to <c>textBox1</c>, switching
    /// to the UI thread when called from another thread.
    /// </summary>
    /// <param name="msg">Text to append; a line break is added after it.</param>
    private void Log(string msg)
    {
        if (this.textBox1.InvokeRequired)
        {
            this.Invoke(new MethodInvoker(() => Log(msg)));
            return;
        }

        this.textBox1.AppendText(msg);
        this.textBox1.AppendText(System.Environment.NewLine);
    }

    /// <summary>
    /// Downloads the source of a web page after a short random delay.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>The page source decoded with <see cref="Encoding.Default"/>.</returns>
    /// <exception cref="WebException">The download failed.</exception>
    private string GetHttpSource(string url)
    {
        // Be gentle with other people's sites: wait 100-499 ms before each request.
        // A single shared generator is used because a new Random per call can
        // produce identical delays on concurrent threads when the runtime seeds
        // it from the system clock.
        int delay;
        lock (DelayRandom)
        {
            delay = DelayRandom.Next(100, 500);
        }
        Thread.Sleep(delay);

        // NOTE(review): Encoding.Default depends on the machine's code page;
        // confirm it matches the encoding the site actually serves.
        using (var wc = new WebClient { Encoding = Encoding.Default })
        {
            wc.Headers.Add("Content-Type", "application/x-www-form-urlencoded");
            return wc.DownloadString(url);
        }
    }
}
}
浙公网安备 33010602011771号