C# 实现抓取网页内容(一)

一、窗体应用程序界面:

二、上源码:

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace WebCatchTest0911
{
    /// <summary>
    /// Sample form that downloads the HTML source of the URL typed into <c>textBox1</c>.
    /// </summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Cookies returned by the most recent successful request. They are replayed on the
        /// next call to <see cref="GetWebPageSource"/> and then replaced by that response's cookies.
        /// </summary>
        public static CookieCollection CC = new CookieCollection();

        /// <summary>
        /// Click handler for the start button: fetches the page whose address is in <c>textBox1</c>.
        /// </summary>
        private void btn_Start_Click(object sender, EventArgs e)
        {
            // The request is synchronous, so this handler blocks until the download finishes.
            // NOTE(review): the downloaded text is not consumed anywhere in this file yet.
            string str = GetWebPageSource(textBox1.Text.Trim());
        }

        /// <summary>
        /// Downloads the page at <paramref name="Url"/> with an HTTP GET and returns its body
        /// decoded as GBK text.
        /// </summary>
        /// <param name="Url">
        /// Absolute address of the page. A leading <c>about</c> scheme is rewritten to <c>http</c>.
        /// </param>
        /// <returns>
        /// The page source, or an empty string when <paramref name="Url"/> is null/blank or the
        /// request fails for any reason.
        /// </returns>
        public static string GetWebPageSource(string Url)
        {
            if (string.IsNullOrWhiteSpace(Url))
            {
                return "";
            }

            // Only rewrite a leading "about" scheme. Replacing every occurrence would corrupt
            // ordinary addresses that merely contain the word, e.g. "http://site/about".
            const string aboutScheme = "about";
            if (Url.StartsWith(aboutScheme, StringComparison.OrdinalIgnoreCase))
            {
                Url = "http" + Url.Substring(aboutScheme.Length);
            }

            try
            {
                //http://brand.tmall.com/brandMap.htm
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
                request.Method = "GET";

                // GBK is a character set, not a content coding, so it is advertised through
                // Accept-Charset. Compression is negotiated (and undone) by the framework.
                request.Headers.Add("Accept-Charset", "GBK");
                request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

                request.Headers.Add("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
                request.Headers.Add("Cache-Control", "max-age=0");
                request.KeepAlive = true;
                // The Host header is left to be derived from the request URI; forcing a fixed
                // host name would send the wrong Host for every other site.
                request.ProtocolVersion = HttpVersion.Version11;
                // No Content-Type: a GET request carries no body to describe.
                request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36";
                request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
                request.AllowAutoRedirect = true;

                // Replay the cookies captured from the previous response.
                request.CookieContainer = new CookieContainer();
                request.CookieContainer.Add(CC);

                // The using blocks release the connection and the reader even if reading throws.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream body = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding("GBK")))
                {
                    string pageSource = reader.ReadToEnd();
                    CC = response.Cookies;
                    return pageSource;
                }
            }
            catch (Exception ex)
            {
                // Best-effort contract: callers receive "" on any failure, but the cause is
                // written to the debug output instead of being silently discarded.
                System.Diagnostics.Debug.WriteLine("GetWebPageSource failed for '" + Url + "': " + ex);
                return "";
            }
        }
    }
}

三、总结

1)、HttpWebRequest 需要设置的各项请求头参数,可以在浏览器的开发者工具(F12)的“网络”面板中查看;

2)、注意释放资源;

四、下章实现提取网页内容

 

posted @ 2015-09-11 17:54  飙速  阅读(1185)  评论(0)  编辑  收藏  举报