读取网页内容,一个C#的爬虫的基本步骤
里面需要的request.UserAgent需要设置一下,有些网站需要这个参数
当时写的时候没有写太多注释,不过后来再次使用的时候仍然很方便。
只要传递一个网址过去,就可以返回这个网址的HTML源代码
1 /// <summary>
2 /// 得到网页数据
3 /// </summary>
4 /// <returns>网页</returns>
5 protected string GetPageContent(string getpage)
6 {
7 try
8 {
9 CookieContainer cc = new CookieContainer();
10 string content = string.Empty;
11 HttpWebRequest request = (HttpWebRequest)WebRequest.Create(getpage);
12 request.Method = "post";
13 request.Timeout = 10000;
14 request.ContentType = "application/x-www-form-urlencoded";
15 request.ContentLength = content.Length;//????
16 request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Maxthon; .NET CLR 1.1.4322)";
17 //request.UserAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13";
18 request.CookieContainer = cc;
19
20 Stream stream = request.GetRequestStream();
21
22 StreamWriter writer = new StreamWriter(stream, Encoding.GetEncoding("gb2312"));
23 writer.Write(content, 0, content.Length);
24 writer.Close();
25 stream.Close();
26
27 HttpWebResponse response = request.GetResponse() as HttpWebResponse;
28 response.Cookies = cc.GetCookies(request.RequestUri);
29 Stream readerStream = response.GetResponseStream();
30 StreamReader sr = new StreamReader(readerStream, Encoding.GetEncoding("gb2312"));
31 content = sr.ReadToEnd();
32 sr.Close();
33 readerStream.Close();
34 return content;
35 }
36 catch (Exception)
37 {
38 return null;
39 }
40
41 }
42
看完BLOG之后,请轻轻点击回复一下!!!!
作者:Aya Ls Broke
出处:http://www.cnblogs.com/ls9527/
本文版权归作者和博客园共有,欢迎转载,但未经作者同意必须保留此段声明,且在文章页面明显位置给出原文连接,否则保留追究法律责任的权利。
该文章也同时发布在我的独立博客中-Ls Broke。
浙公网安备 33010602011771号