C#网络爬虫抓取小说

心血来潮,想研究下爬虫,爬点小说。

 

通过百度选择了个小说网站,随便找了一本小说http://www.23us.so/files/article/html/13/13655/index.html

 

一、分析html规则

思路是获取小说章节目录,循环目录,抓取所有章节中的内容,拼到txt文本中。最后形成完本小说。

1、获取小说章节目录

通过分析,我在标注的地方获取小说名字及章节目录。

<meta name="keywords" content="无疆,无疆最新章节,无疆全文阅读"/>// 获取小说名字
<table cellspacing="1" cellpadding="0" bgcolor="#E4E4E4" id="at">// 所有的章节都在这个table中。

下面是利用正则,获取名字与目录。

// Get the novel name: it is the first comma-separated entry in the
// <meta name="keywords" content="..."/> tag.
Match ma_name = Regex.Match(html, @"<meta name=""keywords"".+content=""(.+)""/>");
string name = ma_name.Groups[1].Value.ToString().Split(',')[0];

// Get the chapter list: the whole table with id="at" holds every chapter link.
// (.|\n)*? is a lazy match so we stop at the first closing </table>.
Regex reg_mulu = new Regex(@"<table cellspacing=""1"" cellpadding=""0"" bgcolor=""#E4E4E4"" id=""at"">(.|\n)*?</table>");
var mat_mulu = reg_mulu.Match(html);
string mulu = mat_mulu.Groups[0].ToString();

 

2、获取小说正文内容

 

通过章节a标签中的url地址,查看章节内容。

 

 通过分析,正文内容在<dd id="contents">中。

// Get the chapter body: it lives inside <dd id="contents">...</dd>.
// Strip the wrapper tags and &nbsp; entities, and turn <br /> into CRLF
// so the text reads naturally in a .txt file.
Regex reg = new Regex(@"<dd id=""contents"">(.|\n)*?</dd>");
MatchCollection mc = reg.Matches(html_z);
var mat = reg.Match(html_z);
string content = mat.Groups[0].ToString().Replace("<dd id=\"contents\">", "").Replace("</dd>", "").Replace("&nbsp;", "").Replace("<br />", "\r\n");

二、C#完整代码

  1 using System;
  2 using System.Collections;
  3 using System.Collections.Generic;
  4 using System.IO;
  5 using System.Linq;
  6 using System.Net;
  7 using System.Text;
  8 using System.Text.RegularExpressions;
  9 using System.Web;
 10 using System.Web.Mvc;
 11 
 12 namespace TestInsect.Controllers
 13 {
 14     public class CrawlerController : Controller
 15     {
 16         // GET: Crawler
 17         public ActionResult Index()
 18         {
 19             Index1();
 20             return View();
 21         }
 22         // GET: Crawler
 23         public void Index1()
 24         {
 25             //抓取整本小说
 26             CrawlerController cra = new CrawlerController();// 顶点抓取小说网站小说
 27             string html = cra.HttpGet("http://www.23us.so/files/article/html/13/13655/index.html", "");
 28 
 29             // 获取小说名字
 30             Match ma_name = Regex.Match(html, @"<meta name=""keywords"".+content=""(.+)""/>");
 31             string name = ma_name.Groups[1].Value.ToString().Split(',')[0];
 32 
 33             // 获取章节目录
 34             Regex reg_mulu = new Regex(@"<table cellspacing=""1"" cellpadding=""0"" bgcolor=""#E4E4E4"" id=""at"">(.|\n)*?</table>");
 35             var mat_mulu = reg_mulu.Match(html);
 36             string mulu = mat_mulu.Groups[0].ToString();
 37 
 38             // 匹配a标签里面的url
 39             Regex tmpreg = new Regex("<a[^>]+?href=\"([^\"]+)\"[^>]*>([^<]+)</a>", RegexOptions.Compiled);
 40             MatchCollection sMC = tmpreg.Matches(mulu);
 41             if (sMC.Count != 0)
 42             {
 43                 //循环目录url,获取正文内容
 44                 for (int i = 0; i < sMC.Count; i++)
 45                 {
 46                     //sMC[i].Groups[1].Value
 47                     //0是<a href="http://www.23us.so/files/article/html/13/13655/5638725.html">第一章 泰山之巅</a> 
 48                     //1是http://www.23us.so/files/article/html/13/13655/5638725.html
 49                     //2是第一章 泰山之巅
 50 
 51                     // 获取章节标题
 52                     string title = sMC[i].Groups[2].Value;
 53 
 54                     // 获取文章内容
 55                     string html_z = cra.HttpGet(sMC[i].Groups[1].Value, "");
 56 
 57                     // 获取小说名字,章节中也可以查找名字
 58                     //Match ma_name = Regex.Match(html, @"<meta name=""keywords"".+content=""(.+)"" />");
 59                     //string name = ma_name.Groups[1].Value.ToString().Split(',')[0];
 60 
 61                     // 获取标题,通过分析h1标签也可以得到章节标题
 62                     //string title = html_z.Replace("<h1>", "*").Replace("</h1>", "*").Split('*')[1];
 63 
 64                     // 获取正文
 65                     Regex reg = new Regex(@"<dd id=""contents"">(.|\n)*?</dd>");
 66                     MatchCollection mc = reg.Matches(html_z);
 67                     var mat = reg.Match(html_z);
 68                     string content = mat.Groups[0].ToString().Replace("<dd id=\"contents\">", "").Replace("</dd>", "").Replace("&nbsp;", "").Replace("<br />", "\r\n");
 69 
 70                     // txt文本输出
 71                     string path = AppDomain.CurrentDomain.BaseDirectory.Replace("\\", "/") + "Txt/";
 72                     Novel(title + "\r\n" + content, name, path);
 73                 }
 74             }
 75         }
 76 
 77         /// <summary>
 78         /// 创建文本
 79         /// </summary>
 80         /// <param name="content">内容</param>
 81         /// <param name="name">名字</param>
 82         /// <param name="path">路径</param>
 83         public void Novel(string content, string name, string path)
 84         {
 85             string Log = content + "\r\n";
 86             // 创建文件夹,如果不存在就创建file文件夹
 87             if (Directory.Exists(path) == false)
 88             {
 89                 Directory.CreateDirectory(path);
 90             }
 91 
 92             // 判断文件是否存在,不存在则创建
 93             if (!System.IO.File.Exists(path + name + ".txt"))
 94             {
 95                 FileStream fs1 = new FileStream(path + name + ".txt", FileMode.Create, FileAccess.Write);// 创建写入文件 
 96                 StreamWriter sw = new StreamWriter(fs1);
 97                 sw.WriteLine(Log);// 开始写入值
 98                 sw.Close();
 99                 fs1.Close();
100             }
101             else
102             {
103                 FileStream fs = new FileStream(path + name + ".txt" + "", FileMode.Append, FileAccess.Write);
104                 StreamWriter sr = new StreamWriter(fs);
105                 sr.WriteLine(Log);// 开始写入值
106                 sr.Close();
107                 fs.Close();
108             }
109         }
110 
111         public string HttpPost(string Url, string postDataStr)
112         {
113             CookieContainer cookie = new CookieContainer();
114             HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
115             request.Method = "POST";
116             request.ContentType = "application/x-www-form-urlencoded";
117             request.ContentLength = Encoding.UTF8.GetByteCount(postDataStr);
118             request.CookieContainer = cookie;
119             Stream myRequestStream = request.GetRequestStream();
120             StreamWriter myStreamWriter = new StreamWriter(myRequestStream, Encoding.GetEncoding("gb2312"));
121             myStreamWriter.Write(postDataStr);
122             myStreamWriter.Close();
123 
124             HttpWebResponse response = (HttpWebResponse)request.GetResponse();
125 
126             response.Cookies = cookie.GetCookies(response.ResponseUri);
127             Stream myResponseStream = response.GetResponseStream();
128             StreamReader myStreamReader = new StreamReader(myResponseStream, Encoding.GetEncoding("utf-8"));
129             string retString = myStreamReader.ReadToEnd();
130             myStreamReader.Close();
131             myResponseStream.Close();
132 
133             return retString;
134         }
135 
136         public string HttpGet(string Url, string postDataStr)
137         {
138             HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url + (postDataStr == "" ? "" : "?") + postDataStr);
139             request.Method = "GET";
140             HttpWebResponse response;
141             request.ContentType = "text/html;charset=UTF-8";
142             try
143             {
144                 response = (HttpWebResponse)request.GetResponse();
145             }
146             catch (WebException ex)
147             {
148                 response = (HttpWebResponse)request.GetResponse();
149             }
150 
151             Stream myResponseStream = response.GetResponseStream();
152             StreamReader myStreamReader = new StreamReader(myResponseStream, Encoding.GetEncoding("utf-8"));
153             string retString = myStreamReader.ReadToEnd();
154             myStreamReader.Close();
155             myResponseStream.Close();
156 
157             return retString;
158         }
159     }
160 }
View Code

补充:

有人说用NSoup解析html更方便,不过我还不太会用。DLL下载地址http://nsoup.codeplex.com/

 1 NSoup.Nodes.Document doc = NSoup.NSoupClient.Parse(html);
 2 // Get the novel name from the keywords meta tag, e.g.:
 3 //<meta name="keywords" content="无疆,无疆最新章节,无疆全文阅读"/>
 4 // Scan all <meta> elements for the one whose name attribute is "keywords".
 5 NSoup.Select.Elements ele = doc.GetElementsByTag("meta");
 6 string name = "";
 7 foreach (var i in ele)
 8 {
 9     if (i.Attr("name") == "keywords")
10     {
11         name = i.Attr("content").ToString();
12     }
13 }
14 // Get the chapter list.
15 NSoup.Select.Elements eleChapter = doc.GetElementsByTag("table");// find the chapter table and take its inner html
16 NSoup.Nodes.Document docChild = NSoup.NSoupClient.Parse(eleChapter.ToString());
17 NSoup.Select.Elements eleChild = docChild.GetElementsByTag("a");// find the <a> tags inside it
18 // Loop over the chapter links and fetch each chapter body.
19 foreach (var j in eleChild)
20 {
21     string title = j.Text();// chapter title
22 
23     string htmlChild = cra.HttpGet(j.Attr("href").ToString(), "");// chapter content
24 }
View Code

 原文:http://www.cnblogs.com/cang12138/p/7464226.html

posted @ 2017-09-07 18:54  Cein  阅读(1309)  评论(0编辑  收藏  举报