C#简单的爬虫，爬博客园首页文章标题

运行效果如图：
代码如下：
 1 using System;
 2 using System.IO;
 3 using System.Net;
 4 using System.Text;
 5 using System.Text.RegularExpressions;
 6 
 7 namespace ConsoleApplication2
 8 {
 9     class Program
10     {
/// <summary>
/// Crawls the cnblogs home page listing: for each requested page it extracts every
/// article title link, prints the title and URL to the console, and appends them
/// to the file d:\cnblog.txt. Waits for a key press before exiting.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // The original author observed that requests return 404 unless the URL ends with '?' (cause unknown).
    string host = "https://www.cnblogs.com/?";
    int num = 0;        // running count of titles found so far
    int pagSize = 100;  // maximum number of pages to crawl, e.g. 10 means the first 10 pages

    // Sample title tag:
    //   <a class="titlelnk" href="https://www.xxxxx.html" target="_blank">Some article title</a>
    // Group 1 captures the article URL and group 2 the title text; ".*?" matches lazily
    // because neither the URL nor the title is known in advance.
    string pater = "<a class=\"titlelnk\" href=\"(.*?)\" target=\"_blank\">(.*?)</a>";
    Regex regex = new Regex(pater);

    // Inclusive upper bound so that exactly pagSize pages are requested
    // (the previous "i < pagSize" stopped one page short).
    for (int i = 1; i <= pagSize; i++)
    {
        // The browser address for page n of the home listing is https://www.cnblogs.com/#p<n>.
        // NOTE(review): "#p<n>" is a URL fragment, which is presumably never sent to the server,
        // so every iteration may download the same first page. Confirm the real paging endpoint
        // and switch to it if so.
        string url = host + "#p" + i;
        string html = GetHtmlString(url);
        if (string.IsNullOrEmpty(html))
        {
            // The request failed (already logged by GetHtmlString) or returned nothing; try the next page.
            continue;
        }

        // Each match already carries both capture groups, so there is no need to
        // run the pattern a second time against the matched text.
        foreach (Match ma in regex.Matches(html))
        {
            string titlelnk = ma.Groups[1].Value;
            string title = ma.Groups[2].Value;

            Console.WriteLine($"-------------------------------第{ ++num }个标题------------------------");
            Console.WriteLine(title + "Url:" + titlelnk);
            Console.WriteLine("--------------------------------------------------------------------------");
            File.AppendAllText(@"d:\cnblog.txt", title + "     " + titlelnk + "\r\n");
        }
    }

    Console.WriteLine("结束一共爬了" + num + "个标题");
    Console.ReadKey();
}
45 
/// <summary>
/// Sends a request to <paramref name="url"/> and returns the response body decoded as UTF-8.
/// </summary>
/// <param name="url">The URL to request.</param>
/// <returns>
/// The response body as a string, or <c>null</c> if any exception occurs while creating
/// the request or reading the response (the exception is written to the console).
/// </returns>
public static string GetHtmlString(string url)
{
    try
    {
        WebRequest request = WebRequest.Create(url);

        // The timeout has to be assigned before GetResponse() is called;
        // setting it afterwards has no effect on the request that already ran.
        request.Timeout = 3000;

        // Dispose the response and its stream so the underlying connection is released
        // even when reading fails part-way through.
        using (WebResponse response = request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        // Best-effort fetch: log the failure and let the caller skip this page.
        Console.WriteLine(ex.ToString());
        return null;
    }
}
69 
70     }
71 }
posted @ 2017-09-18 22:33 码农成长日记 阅读(1609) 评论(10) 编辑 收藏 举报
会员力量，点亮园子希望
刷新页面返回顶部
码农成长日记

C#简单的爬虫，爬博客园首页文章标题

公告