机会是个小姑娘,需要我们主动点

(有所为,有所不为)
  博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅  :: 管理

数据抓取,从论坛上抓取

Posted on 2008-08-14 18:26  五子登科  阅读(695)  评论(0)  编辑  收藏  举报

using System;

using System.Text;
using System.Data;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
namespace LocalCinemaProject.NBA
{
 public  class GetBBSInfoClass
 {
  DataTable dt = new DataTable();
        
  private  void Setdt()
  {
   
   dt.Columns.Add("ID");
   dt.Columns.Add("Name");
   dt.Columns.Add("URL");
   dt.Columns.Add("Pic");
  }


  /// <summary>
  /// 采集数据
  /// </summary>
  /// <param name="Url">需要采集的URL地址</param>
  /// <param name="i">采集多少条</param>
  /// <returns>DataTable</returns>
  public  DataTable GetHtmlCode(string Url, int i)
  {
   string strHtml;
   strHtml = GetURlHTML(Url);
   try
   {
    Setdt();

    string TempRegex = "<span id=\"([\\S\\s]*?)\"><a href=\"([\\S\\s]*?)\">([\\S\\s]*?)</a></span>";
    Regex regex = new Regex(TempRegex, RegexOptions.Compiled | RegexOptions.IgnoreCase);
    int x = 1;
    i++;
    for (Match match = regex.Match(strHtml); match.Success; match = match.NextMatch())
    {
     if (x < i)
     {
      DataRow datarow = dt.NewRow();
      datarow["ID"] = x;
      datarow["Name"] = match.Groups[3].ToString();
      datarow["Url"] = GetBBSUrl(Url, match.Groups[2].ToString());
      datarow["Pic"] = SaveImgToLocal(datarow["Url"].ToString());

      dt.Rows.Add(datarow);
      dt.AcceptChanges();
      x++;
     }
     else
     {
      break;
     }


    }
   }
   catch (Exception ex)
   {
    throw new Exception(ex.Message);
   }
   return dt;

  }

  /// <summary>
  /// 获取一条BBS连接的真正外网地址
  /// </summary>
  /// <param name="Url">要采集的网址</param>
  /// <param name="BBSUrl">一条BBS连接</param>
  /// <returns>一条BBS连接真正地址</returns>
  private  string GetBBSUrl(string Url, string BBSUrl)
  {
   string TempUrl = Url.Substring(0, Url.LastIndexOf("/") + 1);
   return TempUrl + BBSUrl;
  }


  /// <summary>
  /// 获取URL的HTML
  /// </summary>
  /// <param name="Url">URL地址</param>
  /// <returns>HTML代码</returns>
  private  string GetURlHTML(string Url)
  {
   string strHtml = "";
   try
   {
    StreamReader sr = null;         //用来读取流
    System.Text.Encoding code = System.Text.Encoding.Default;  //定义编码

    //构造web请求,发送请求,获取响应
    WebRequest HttpWebRequest = null;
    WebResponse HttpWebResponse = null;
    HttpWebRequest = WebRequest.Create(Url);
    HttpWebResponse = HttpWebRequest.GetResponse();

    //获得流
    sr = new StreamReader(HttpWebResponse.GetResponseStream(), code);
    strHtml = sr.ReadToEnd();
    sr.Close();
    //sr.Dispose();
    HttpWebResponse.Close();
   }
   catch (Exception ex)
   {
    throw new Exception(ex.Message);
   }
   return strHtml;

  }


  /// <summary>
  /// 获取一条BBS连接内力的图片和图片地址
  /// </summary>
  /// <param name="HtmlCode">HtmlCode代码</param>
  /// <param name="regex1"></param>
  /// <returns></returns>
  private  string SaveImgToLocal(string URl)
  {
   string HTML = GetURlHTML(URl);

   string Temp = "<img src=\"([\\S\\s]*?)\">";
   string Contm = "<div id=\"ad_thread3_0\">([\\S\\s]*?)<td class=\"postauthor\">";

   Regex regex1 = new Regex(Contm, RegexOptions.Compiled | RegexOptions.IgnoreCase);
   Match match = regex1.Match(HTML);

   string PicUrl = "";
   regex1 = new Regex(Temp, RegexOptions.Compiled | RegexOptions.IgnoreCase);
   match = regex1.Match(match.Groups[1].ToString());
   PicUrl = match.Groups[1].ToString();


   return PicUrl;
  }
 }
}

调用方法:
 GetBBSInfoClass myClass = new GetBBSInfoClass(); DataTable m_table = myClass.GetHtmlCode("http://bbs.le8le.com/forumdisplay.php?fid=62",7);// arguments: address of the page to scrape, number of records to fetch

关键是正则表达式