抓取net分页,登录模式抓取,在126邮箱中,详细项目!
下面简单介绍如何用正则表达式从抓取到的网页中截取所需内容。
(.*?)表示要截取的中间的内容,遇到个别不规则的可以灵活舍弃。
/// <summary>
/// Click handler that fetches a listing page, isolates the section between the
/// "机构观点" title and the "最后一页" link, and shows the five captured fields
/// of every list item in message boxes.
/// </summary>
/// <param name="sender">The control that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    // FetchHtml is defined elsewhere; presumably it returns the page markup — confirm.
    string html = FetchHtml("http://stock.eastmoney.com/channel/1049,1.html");
    if (string.IsNullOrEmpty(html))
    {
        // Nothing was downloaded, so there is nothing to parse.
        return;
    }

    // Step 1: cut out the region of interest. Singleline lets '.' span line breaks.
    string pattern = "<title>机构观点</title>(.*?)最后一页</a>";
    Regex r = new Regex(pattern, RegexOptions.Singleline);
    Match match = r.Match(html);
    if (!match.Success)
    {
        // Match.Result throws NotSupportedException on a failed match, so stop here
        // when the page no longer contains the expected markers.
        return;
    }

    html = match.Groups[1].Value;

    // Step 2: one match per list item. Groups: 1 = text of the "middate" span,
    // 2/3 = href and text of the bracketed link, 4/5 = href and text of the second link.
    // A simpler alternative pattern would be "<li>(.*?)</li>".
    pattern = "<li><span class=\"middate\">(.*?)</span>\n\\[<a target=\"_blank\" href=\"(.*?)\">(.*?)</a>\\] <a target=\"_blank\" href=\"(.*?)\">(.*?)</a></li>";
    r = new Regex(pattern, RegexOptions.Singleline);
    MatchCollection mc = r.Matches(html);
    foreach (Match mm in mc)
    {
        // Show the captured groups in order, one message box each.
        for (int group = 1; group <= 5; group++)
        {
            MessageBox.Show(mm.Groups[group].Value);
        }
    }
}
剔除HTML标签(包括script脚本块),只保留纯文本:
/// <summary>
/// Strips markup from an HTML fragment: removes complete script blocks
/// (tags and their content), then every remaining tag, and finally turns
/// non-breaking spaces into ordinary spaces.
/// </summary>
/// <param name="html">The HTML text to clean; may be null or empty.</param>
/// <returns>The text with tags removed; an empty string for null or empty input.</returns>
private string TripHtml(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    // Remove whole <script ...>...</script> blocks. Only real tags are matched, so
    // plain text containing the word "script" (e.g. "javascript") is left alone.
    // An unterminated <script> has no closing tag to pair with; only its opening
    // tag is removed by the generic tag pass below.
    string withoutScripts = Regex.Replace(
        html,
        "<script\\b.*?</script\\s*>",
        "",
        RegexOptions.Singleline | RegexOptions.IgnoreCase);

    // Drop every remaining tag.
    string text = Regex.Replace(withoutScripts, "<[^>]+>", "");

    // Normalize non-breaking spaces, both as the HTML entity and as the
    // already-decoded U+00A0 character.
    return text.Replace("&nbsp;", " ").Replace("\u00A0", " ");
}
如何防止他人通过程序(而非浏览器)抓取自己的网页呢?
// Read the User-Agent header sent by the client. A missing or empty value means the
// request did not come from an ordinary browser and was most likely made by a program.
string agent = System.Web.HttpContext.Current.Request.ServerVariables["HTTP_USER_AGENT"];
if (string.IsNullOrEmpty(agent))
{
    // No browser information: warn the caller against scraping this page.
    Response.Write("警告:切勿非法抓取本网页内容");
}
else
{
    // Example of this page's URL-encoded address:
    // http://192.168.5.138:8009/01%e5%ae%9e%e7%94%a8/%e8%8e%b7%e5%8f%96%e5%ae%a2%e6%88%b7%e7%ab%af/Default.aspx
    // The encoded path segments are what Server.UrlEncode("01实用") and
    // Server.UrlEncode("获取客户端") produce.
    // A real page would write its normal content here instead of echoing the header.

    // The header is client-controlled, so HTML-encode it before writing it to the page.
    Response.Write(Server.HtmlEncode(agent));
}

浙公网安备 33010602011771号