正则表达式抓取新闻数据

抓取网站数据建议用服务操作,此例只实现从请求到拿数据,并把数据写入xml的功能

// Scrape news listings from yocajr.com with regular expressions and collect them
// into _list. Ideally this runs in a background service, not a request path.
// NOTE(review): `html` is assigned but declared elsewhere in this file — confirm
// it is a string in the enclosing scope.
List<youcaimodel> _list = new List<youcaimodel>();
int page = 1000;                                  // total number of listing pages to fetch
WebClient client = new WebClientto(3000);         // project WebClient subclass; 3000 presumably a ms timeout — confirm
client.Credentials = CredentialCache.DefaultCredentials; // network credentials used to authenticate requests to the Internet resource

// Compile each pattern once, outside the loops, instead of rebuilding per <li>.
Regex ulReg       = new Regex("<ul[\\s]*>[\\s\\S]*?</ul>", RegexOptions.Compiled);
// Lazy body so one match == one HTML comment. The original "<!--[\\s]*.*?[\\s\\S]*-->"
// had a greedy tail that could swallow real markup between two comments.
Regex delReg      = new Regex("<!--[\\s\\S]*?-->", RegexOptions.Compiled);
Regex liReg       = new Regex("<li[\\s]*[^>]*>[\\s\\S]*?</li>", RegexOptions.Compiled | RegexOptions.IgnoreCase); // one <li> per news item
Regex imgReg      = new Regex("<img[\\s]*src[\\s]*=[\\s]*\"(?<imgurl>.*?)\"[\\s]*?[\\s]*[^>]*>", RegexOptions.Compiled); // image path
Regex titleReg    = new Regex("<h3[\\s]*[^>]*>[\\s]*<a[\\s]*href=[\\s]*\"(?<url>.*?)\"[^>]*>[\\s]*(?<title>.*?)[\\s]*</a>[\\s]*</h3>", RegexOptions.Compiled); // article link + title
Regex describeReg = new Regex("<p[\\s]*[^>]*>(?<describe>.*?)</p>", RegexOptions.Compiled); // article summary
Regex timeReg     = new Regex("<span[\\s]*class=\"time\">(?<time>.*?)</span>", RegexOptions.Compiled); // publish time

for (int k = 0; k < page; k++)                    // one iteration per listing page
{
    // Page 0 has no trailing index segment; later pages append "/k".
    string url = k == 0
        ? "http://www.yocajr.com/news/index/0"
        : "http://www.yocajr.com/news/index/0/" + k;
    byte[] b = client.DownloadData(url);
    string htmlstr = Encoding.UTF8.GetString(b);

    // The news list is the fifth <ul> on the page (index 4 in the original's
    // counter loop). Reset `html` first so a page with fewer <ul> elements
    // doesn't silently reuse the previous page's markup.
    html = "";
    MatchCollection ulMatches = ulReg.Matches(htmlstr);
    if (ulMatches.Count > 4)
    {
        html = ulMatches[4].Value;
    }

    // Strip every HTML comment FIRST, then parse the cleaned markup exactly once.
    // The original nested the <li> parsing inside this removal loop, which
    // re-parsed the whole list once per comment (duplicate entries in _list) and
    // skipped parsing entirely when the <ul> contained no comments.
    foreach (Match match in delReg.Matches(html))
    {
        html = html.Replace(match.Value, "");
    }

    foreach (Match limc in liReg.Matches(html))
    {
        youcaimodel info = new youcaimodel();
        foreach (Match m2 in imgReg.Matches(limc.Value))
        {
            info.Imgscr = m2.Groups["imgurl"].Value;      // image URL
        }
        foreach (Match m2 in titleReg.Matches(limc.Value))
        {
            info.Title = m2.Groups["title"].Value;        // article title
            info.Url = m2.Groups["url"].Value;            // article link
        }
        foreach (Match m2 in describeReg.Matches(limc.Value))
        {
            info.Describe = m2.Groups["describe"].Value;  // article summary
        }
        foreach (Match m2 in timeReg.Matches(limc.Value))
        {
            info.Time = m2.Groups["time"].Value;          // publish time
        }
        _list.Add(info);
    }
}

 

// Serialize the scraped list to XML and write it to ~/xml/youcai.xml.
using (StringWriter stringWriter = new StringWriter())
{
    string qxfilepath = System.Web.Hosting.HostingEnvironment.MapPath("~" + "/xml");
    XmlSerializer xmlSerializer = new XmlSerializer(typeof(List<youcaimodel>));
    xmlSerializer.Serialize(stringWriter, _list);

    // File.WriteAllText creates-or-truncates the target and releases its handle
    // even when an exception is thrown. The original FileMode.OpenOrCreate never
    // truncated, so a shorter serialization left stale bytes from the previous
    // run at the end of the file (corrupt XML), and its FileStream/StreamWriter
    // pair leaked if Write threw before Close.
    File.WriteAllText(Path.Combine(qxfilepath, "youcai.xml"), stringWriter.ToString());

    // Preserved side effect: expose the serialized XML to the enclosing scope.
    html = stringWriter.ToString();
}

posted on 2017-02-08 11:46  Andy_陈  阅读(1138)  评论(0编辑  收藏  举报

导航