正则表达式抓取新闻数据

抓取网站数据建议用服务操作,此例只实现从请求到拿数据,并把数据写入xml的功能

// Scrape news listings from yocajr.com with regular expressions and collect them
// into _list. Ideally this runs in a background service, not a request path.
// NOTE(review): `html` is assigned but declared elsewhere in this file — confirm
// it is a string in the enclosing scope.
List<youcaimodel> _list = new List<youcaimodel>();
int page = 1000;                                  // total number of listing pages to fetch
WebClient client = new WebClientto(3000);         // project WebClient subclass; 3000 presumably a ms timeout — confirm
client.Credentials = CredentialCache.DefaultCredentials; // network credentials used to authenticate requests to the Internet resource

// Compile each pattern once, outside the loops, instead of rebuilding per <li>.
Regex ulReg       = new Regex("<ul[\\s]*>[\\s\\S]*?</ul>", RegexOptions.Compiled);
// Lazy body so one match == one HTML comment. The original "<!--[\\s]*.*?[\\s\\S]*-->"
// had a greedy tail that could swallow real markup between two comments.
Regex delReg      = new Regex("<!--[\\s\\S]*?-->", RegexOptions.Compiled);
Regex liReg       = new Regex("<li[\\s]*[^>]*>[\\s\\S]*?</li>", RegexOptions.Compiled | RegexOptions.IgnoreCase); // one <li> per news item
Regex imgReg      = new Regex("<img[\\s]*src[\\s]*=[\\s]*\"(?<imgurl>.*?)\"[\\s]*?[\\s]*[^>]*>", RegexOptions.Compiled); // image path
Regex titleReg    = new Regex("<h3[\\s]*[^>]*>[\\s]*<a[\\s]*href=[\\s]*\"(?<url>.*?)\"[^>]*>[\\s]*(?<title>.*?)[\\s]*</a>[\\s]*</h3>", RegexOptions.Compiled); // article link + title
Regex describeReg = new Regex("<p[\\s]*[^>]*>(?<describe>.*?)</p>", RegexOptions.Compiled); // article summary
Regex timeReg     = new Regex("<span[\\s]*class=\"time\">(?<time>.*?)</span>", RegexOptions.Compiled); // publish time

for (int k = 0; k < page; k++)                    // one iteration per listing page
{
    // Page 0 has no trailing index segment; later pages append "/k".
    string url = k == 0
        ? "http://www.yocajr.com/news/index/0"
        : "http://www.yocajr.com/news/index/0/" + k;
    byte[] b = client.DownloadData(url);
    string htmlstr = Encoding.UTF8.GetString(b);

    // The news list is the fifth <ul> on the page (index 4 in the original's
    // counter loop). Reset `html` first so a page with fewer <ul> elements
    // doesn't silently reuse the previous page's markup.
    html = "";
    MatchCollection ulMatches = ulReg.Matches(htmlstr);
    if (ulMatches.Count > 4)
    {
        html = ulMatches[4].Value;
    }

    // Strip every HTML comment FIRST, then parse the cleaned markup exactly once.
    // The original nested the <li> parsing inside this removal loop, which
    // re-parsed the whole list once per comment (duplicate entries in _list) and
    // skipped parsing entirely when the <ul> contained no comments.
    foreach (Match match in delReg.Matches(html))
    {
        html = html.Replace(match.Value, "");
    }

    foreach (Match limc in liReg.Matches(html))
    {
        youcaimodel info = new youcaimodel();
        foreach (Match m2 in imgReg.Matches(limc.Value))
        {
            info.Imgscr = m2.Groups["imgurl"].Value;      // image URL
        }
        foreach (Match m2 in titleReg.Matches(limc.Value))
        {
            info.Title = m2.Groups["title"].Value;        // article title
            info.Url = m2.Groups["url"].Value;            // article link
        }
        foreach (Match m2 in describeReg.Matches(limc.Value))
        {
            info.Describe = m2.Groups["describe"].Value;  // article summary
        }
        foreach (Match m2 in timeReg.Matches(limc.Value))
        {
            info.Time = m2.Groups["time"].Value;          // publish time
        }
        _list.Add(info);
    }
}

 

// Serialize the scraped list to XML and write it to ~/xml/youcai.xml.
using (StringWriter stringWriter = new StringWriter())
{
    string qxfilepath = System.Web.Hosting.HostingEnvironment.MapPath("~" + "/xml");
    XmlSerializer xmlSerializer = new XmlSerializer(typeof(List<youcaimodel>));
    xmlSerializer.Serialize(stringWriter, _list);

    // File.WriteAllText creates-or-truncates the target and releases its handle
    // even when an exception is thrown. The original FileMode.OpenOrCreate never
    // truncated, so a shorter serialization left stale bytes from the previous
    // run at the end of the file (corrupt XML), and its FileStream/StreamWriter
    // pair leaked if Write threw before Close.
    File.WriteAllText(Path.Combine(qxfilepath, "youcai.xml"), stringWriter.ToString());

    // Preserved side effect: expose the serialized XML to the enclosing scope.
    html = stringWriter.ToString();
}

posted on 2017-02-08 11:46  Andy_陈  阅读(1138)  评论(0编辑  收藏  举报

导航