项目总结之数据抓取程序
需求:公司希望能够在自己的主页上搜索并查询其他公司主页中的数据,因此需要用到数据抓取功能。
实现原理:由于是基于WEB页面的抓取,可以在代码内部发送HTTP请求,获取返回的HTML内容,然后从HTML内容中解析出本公司需要的数据信息。解析可以通过很多种方式实现,其中最常用的是正则表达式。
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
/// <summary>
/// Fetches the HTML of a remote page by sending an HTTP POST request.
/// </summary>
public class GetHtmlByPost
{
    /// <summary>
    /// Posts <paramref name="postData"/> to <c>http://{DefaultIP}{DefaultUrl}</c>
    /// and returns the response body as text.
    /// </summary>
    /// <param name="DefaultIP">Host part, appended directly after "http://".</param>
    /// <param name="DefaultUrl">Remainder of the URL, appended directly after the host.</param>
    /// <param name="postData">
    /// Request body, sent as-is with content type application/x-www-form-urlencoded
    /// (for example "Pfx=...&amp;Carrier=...&amp;Shipment=...").
    /// </param>
    /// <returns>
    /// The response body decoded as GB2312, or an empty string when any step fails
    /// (invalid URL, null body, network error, unavailable encoding, ...).
    /// </returns>
    public static string GetHtml(string DefaultIP, string DefaultUrl, string postData)
    {
        try
        {
            // The same GB2312 encoding is used for the request body and the response text.
            Encoding gb2312 = Encoding.GetEncoding("GB2312");
            string strUrl = "http://" + DefaultIP + DefaultUrl;
            byte[] data = gb2312.GetBytes(postData);

            // Prepare the HTTP request.
            System.Net.HttpWebRequest myRequest = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(strUrl);
            myRequest.Method = "POST";
            myRequest.ContentType = "application/x-www-form-urlencoded";
            myRequest.ContentLength = data.Length;

            // Send the body; the using block closes the stream even if Write throws.
            using (System.IO.Stream requestStream = myRequest.GetRequestStream())
            {
                requestStream.Write(data, 0, data.Length);
            }

            // Read the whole response; every resource is disposed on both the
            // success path and the exception path.
            using (System.Net.HttpWebResponse myResponse = (System.Net.HttpWebResponse)myRequest.GetResponse())
            using (System.IO.Stream receiveStream = myResponse.GetResponseStream())
            using (System.IO.StreamReader readStream = new System.IO.StreamReader(receiveStream, gb2312))
            {
                return readStream.ReadToEnd();
            }
        }
        catch (Exception)
        {
            // Deliberate best-effort contract: callers receive "" on any failure.
            return "";
        }
    }
}
{
/// <summary>
/// Posts <paramref name="postData"/> to <c>http://{DefaultIP}{DefaultUrl}</c>
/// and returns the response body as text.
/// </summary>
/// <param name="DefaultIP">Host part, appended directly after "http://".</param>
/// <param name="DefaultUrl">Remainder of the URL, appended directly after the host.</param>
/// <param name="postData">
/// Request body, sent as-is with content type application/x-www-form-urlencoded
/// (for example "Pfx=...&amp;Carrier=...&amp;Shipment=...").
/// </param>
/// <returns>
/// The response body decoded as GB2312, or an empty string when any step fails.
/// </returns>
public static string GetHtml(string DefaultIP, string DefaultUrl, string postData)
{
    try
    {
        // One GB2312 encoding instance serves both the outgoing body and the incoming text.
        Encoding gb2312 = Encoding.GetEncoding("GB2312");
        string strUrl = "http://" + DefaultIP + DefaultUrl;
        byte[] data = gb2312.GetBytes(postData);

        // Prepare the HTTP request.
        System.Net.HttpWebRequest myRequest = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(strUrl);
        myRequest.Method = "POST";
        myRequest.ContentType = "application/x-www-form-urlencoded";
        myRequest.ContentLength = data.Length;

        // Send the body; the stream is closed even when Write throws.
        using (System.IO.Stream requestStream = myRequest.GetRequestStream())
        {
            requestStream.Write(data, 0, data.Length);
        }

        // Read the full response; the response, its stream and the reader are
        // disposed whether or not reading succeeds.
        using (System.Net.HttpWebResponse myResponse = (System.Net.HttpWebResponse)myRequest.GetResponse())
        using (System.IO.Stream receiveStream = myResponse.GetResponseStream())
        using (System.IO.StreamReader readStream = new System.IO.StreamReader(receiveStream, gb2312))
        {
            return readStream.ReadToEnd();
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: callers receive "" on any failure.
        return "";
    }
}
}
解析HTML代码时,经常需要获取<td></td>之间的数据,这里有一个通用的正则表达式:@"(?is)(?<=>)[^<]+(?=<)"。
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
// Collect every run of text located between a '>' and the next '<', such as the
// content of <td>...</td>. Pattern: (?<=>) requires a preceding '>', [^<]+ takes
// one or more characters other than '<', (?=<) requires a following '<'.
// The literal below is a placeholder for the HTML text to parse.
MatchCollection strMatchCollection = Regex.Matches("解析的字符串", @"(?is)(?<=>)[^<]+(?=<)");
// Node 1 (the indexer throws if there are fewer matches than expected).
value1 = strMatchCollection[0].Value;
// Node 2 goes into value2, so it no longer overwrites node 1.
value2 = strMatchCollection[1].Value;
// Match each run of text between a '>' and the following '<' (for example the
// content of a <td> cell). The literal is a placeholder for the HTML to parse.
MatchCollection strMatchCollection = Regex.Matches("解析的字符串", @"(?is)(?<=>)[^<]+(?=<)");
// Node 1
value1 = strMatchCollection[0].Value;
// Node 2 is stored separately in value2 instead of overwriting value1.
value2 = strMatchCollection[1].Value;
value1,value2即td节点数据值。