using System;
using System.Text;
using System.Data;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
namespace LocalCinemaProject.NBA
{
/// <summary>
/// Scrapes a forum listing page: extracts thread links with a regular expression and,
/// for every thread, downloads the thread page to look up the first image it contains.
/// </summary>
public class GetBBSInfoClass
{
    // The regexes are built once and shared. Constructing a RegexOptions.Compiled
    // regex on every call (as the code used to do) repeats the compilation cost each time.

    // Group 2 = thread href, group 3 = link text (used as the thread name).
    private static readonly Regex ThreadLinkRegex = new Regex(
        "<span id=\"([\\S\\s]*?)\"><a href=\"([\\S\\s]*?)\">([\\S\\s]*?)</a></span>",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);

    // Group 1 = markup between the "ad_thread3_0" div and the next "postauthor" cell.
    private static readonly Regex PostContentRegex = new Regex(
        "<div id=\"ad_thread3_0\">([\\S\\s]*?)<td class=\"postauthor\">",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);

    // Group 1 = value of the src attribute.
    private static readonly Regex ImageRegex = new Regex(
        "<img src=\"([\\S\\s]*?)\">",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);

    /// <summary>
    /// Builds an empty result table with the columns ID, Name, URL and Pic.
    /// No column type is specified, so every column uses the DataColumn default type.
    /// </summary>
    private static DataTable CreateTable()
    {
        DataTable table = new DataTable();
        table.Columns.Add("ID");
        table.Columns.Add("Name");
        table.Columns.Add("URL");
        table.Columns.Add("Pic");
        return table;
    }

    /// <summary>
    /// Downloads the listing page at <paramref name="Url"/> and returns up to
    /// <paramref name="i"/> thread entries found in it.
    /// </summary>
    /// <param name="Url">Address of the listing page to scrape.</param>
    /// <param name="i">Maximum number of entries to collect; zero or negative yields an empty table.</param>
    /// <returns>
    /// A new DataTable per call with one row per thread: ID (1-based position),
    /// Name (link text), URL (thread address) and Pic (first image URL found in the
    /// thread page, or an empty string when none is found).
    /// </returns>
    /// <remarks>
    /// Each collected entry triggers one additional download of the thread page.
    /// Failures (for example a WebException from the download) propagate to the caller
    /// with their original type and stack trace instead of being re-wrapped in a
    /// plain Exception that only carried the message.
    /// </remarks>
    public DataTable GetHtmlCode(string Url, int i)
    {
        string strHtml = GetURlHTML(Url);

        // A fresh table per call: re-adding the columns to a shared table made the
        // second call on the same instance fail with a DuplicateNameException.
        DataTable table = CreateTable();

        // Counting collected rows against i directly avoids the former "i++" trick,
        // which overflowed for int.MaxValue and then returned nothing.
        int collected = 0;
        for (Match match = ThreadLinkRegex.Match(strHtml);
             match.Success && collected < i;
             match = match.NextMatch())
        {
            collected++;
            string threadUrl = GetBBSUrl(Url, match.Groups[2].Value);

            DataRow row = table.NewRow();
            row["ID"] = collected;
            row["Name"] = match.Groups[3].Value;
            row["URL"] = threadUrl;
            row["Pic"] = SaveImgToLocal(threadUrl);
            table.Rows.Add(row);
        }

        // Commit all rows once instead of after every single insert.
        table.AcceptChanges();
        return table;
    }

    /// <summary>
    /// Turns a thread link taken from the listing page into a full address by appending
    /// it to the listing URL truncated after its last '/'.
    /// </summary>
    /// <param name="Url">Address of the listing page.</param>
    /// <param name="BBSUrl">The href value of one thread link.</param>
    /// <returns>The concatenation of the truncated listing URL and <paramref name="BBSUrl"/>.</returns>
    /// <remarks>
    /// This is plain string concatenation, not URI resolution.
    /// NOTE(review): it presumably expects hrefs relative to the listing page's directory;
    /// absolute or root-relative hrefs would not be resolved correctly.
    /// </remarks>
    private string GetBBSUrl(string Url, string BBSUrl)
    {
        // LastIndexOf returns -1 when there is no '/', which makes the prefix empty.
        string baseUrl = Url.Substring(0, Url.LastIndexOf('/') + 1);
        return baseUrl + BBSUrl;
    }

    /// <summary>
    /// Downloads the resource at <paramref name="Url"/> and returns its content as text.
    /// </summary>
    /// <param name="Url">Address to download.</param>
    /// <returns>The response body decoded with Encoding.Default.</returns>
    /// <remarks>
    /// The response, its stream and the reader are disposed even when reading fails.
    /// NOTE(review): the charset declared by the server or the page is not inspected,
    /// so pages in a different encoding may be decoded incorrectly.
    /// </remarks>
    private string GetURlHTML(string Url)
    {
        WebRequest request = WebRequest.Create(Url);
        using (WebResponse response = request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, System.Text.Encoding.Default))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Downloads a thread page and returns the src of the first image inside its
    /// post-content section.
    /// </summary>
    /// <param name="URl">Address of the thread page.</param>
    /// <returns>
    /// The image address exactly as written in the page, or an empty string when the
    /// content section or an image tag is not found.
    /// </returns>
    /// <remarks>Despite its name, this method does not save anything locally.</remarks>
    private string SaveImgToLocal(string URl)
    {
        string html = GetURlHTML(URl);

        // A failed match yields an empty group value, so the image search then runs
        // on an empty string and the method falls through to returning "".
        string postContent = PostContentRegex.Match(html).Groups[1].Value;
        return ImageRegex.Match(postContent).Groups[1].Value;
    }
}
}
// Usage:
//   GetBBSInfoClass myClass = new GetBBSInfoClass();
//   DataTable m_table = myClass.GetHtmlCode("http://bbs.le8le.com/forumdisplay.php?fid=62", 7); // page URL to scrape, number of records
// The regular expressions are the key part.