这两天写了一个采集新蛋网手机信息的小程序,网页内容通过 WebClient 类下载。使用时调用 Gather(startIndex, endIndex) 方法即可(页码范围包含 startIndex,不包含 endIndex)。希望能有帮助。
代码如下:
/*
* Created By ChinaAgan 2012-1-18
*
*/
using System;
using System.Collections.Generic;
using System.Text;
using System.Collections;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using CnBlogCollector.Properties;
namespace CnBlogCollector
{
/// <summary>
/// Scrapes phone listings (name, old price, current price) from Newegg China
/// sub-category pages and writes one text report per page.
/// </summary>
public class Collector
{
    #region Fields
    // URL template of a listing page; {0} is replaced by the page index.
    private const string ListPageUrlFormat = "http://www.newegg.com.cn/SubCategory/1043-{0}.htm";

    // The patterns never change, so they are built once instead of once per page.
    private static readonly Regex NameRegex = new Regex(
        @"<p\s+class=""info""><a\s+href=(?<url>.+?)\s+title=""(?<title>.+?)"">(?<content>.+?)</a>",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private static readonly Regex OldPriceRegex = new Regex(
        @"<p\s+class=""bypast""><span>¥(?<OldPrice>.+?)</span></p>",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private static readonly Regex NewPriceRegex = new Regex(
        @"<p\s+class=""current""><strong\s+class=""price""><span>¥</span>(?<NewPrice>\d+?\..+?)</strong></p>",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private WebClient wc = new WebClient();
    #endregion

    #region Directory creation
    /// <summary>
    /// Ensures that the given directory exists, creating it when necessary.
    /// </summary>
    /// <param name="path">Relative or absolute directory path.</param>
    /// <returns>The absolute form of <paramref name="path"/>.</returns>
    public string CreateFolderIfNot(string path)
    {
        string fullPath = Path.GetFullPath(path);
        // CreateDirectory does nothing when the directory already exists,
        // so no separate existence check is needed.
        Directory.CreateDirectory(fullPath);
        return fullPath;
    }
    #endregion

    #region Page scraping
    /// <summary>
    /// Downloads the listing pages with indexes from <paramref name="startIndex"/>
    /// (inclusive) to <paramref name="endIndex"/> (exclusive), extracts the product
    /// names and prices, and writes them to "&lt;index&gt;.txt" inside the folder
    /// configured in <c>Settings.Default.OutPath</c>. An existing file is overwritten.
    /// </summary>
    /// <param name="startIndex">First page index to download.</param>
    /// <param name="endIndex">Page index at which to stop (not downloaded).</param>
    public void Gather(int startIndex, int endIndex)
    {
        // NOTE(review): the proxy address and credentials are hard-coded; they
        // should come from configuration rather than from source code.
        WebProxy webProxy = new WebProxy("proxy.cn1.global.***.com:8080");
        webProxy.Credentials = new NetworkCredential("user", "password");
        wc.Proxy = webProxy;

        // The pages are decoded as GB2312 and the report is written in the same encoding.
        Encoding gb2312 = Encoding.GetEncoding("GB2312");

        for (int i = startIndex; i < endIndex; i++)
        {
            string url = string.Format(ListPageUrlFormat, i);
            // Line breaks are stripped so that '.' in the patterns can span what
            // were originally separate lines of markup.
            string html = gb2312.GetString(wc.DownloadData(url)).Replace("\r\n", "");

            List<string> names = ExtractGroup(NameRegex, html, "content");
            List<string> oldPrices = ExtractGroup(OldPriceRegex, html, "OldPrice");
            List<string> newPrices = ExtractGroup(NewPriceRegex, html, "NewPrice");

            // TODO: post-process the current price and HTML entities such as &#32;.
            string report = BuildReport(names, oldPrices, newPrices);

            // Path.Combine keeps the file inside the output folder even when the
            // configured path has no trailing directory separator.
            string filePath = Path.Combine(CreateFolderIfNot(Settings.Default.OutPath), i + ".txt");
            File.WriteAllText(filePath, report, gb2312);
        }
    }

    /// <summary>
    /// Collects the value of one named group from every match of a pattern.
    /// </summary>
    private static List<string> ExtractGroup(Regex regex, string input, string groupName)
    {
        List<string> values = new List<string>();
        foreach (Match match in regex.Matches(input))
        {
            values.Add(match.Groups[groupName].Value);
        }
        return values;
    }

    /// <summary>
    /// Formats one report line per product name.
    /// </summary>
    private static string BuildReport(List<string> names, List<string> oldPrices, List<string> newPrices)
    {
        StringBuilder report = new StringBuilder();
        for (int index = 0; index < names.Count; index++)
        {
            // Prices are paired with names by position. A page can yield fewer
            // price matches than names, so a missing entry becomes an empty
            // string instead of an ArgumentOutOfRangeException.
            // NOTE(review): positional pairing can still misalign prices when an
            // item in the middle of the page has no old price.
            string oldPrice = index < oldPrices.Count ? oldPrices[index] : String.Empty;
            string newPrice = index < newPrices.Count ? newPrices[index] : String.Empty;

            report.Append(String.Format("手机名称:{0},原价:{1},现价:{2}", names[index], oldPrice, newPrice));
            report.Append("\r\n");
        }
        return report.ToString();
    }
    #endregion
}
}
浙公网安备 33010602011771号