小说下载爬虫(一)
C# 程序
小说爬虫的核心在于对小说下载页面的解析和自动适配解析规则
解析规则文件
BookSiteConfig.ini (json格式)
[ { "BaseUrl": "小说所在的网站地址 不要http和https", "CatalogPage": { "BookNameNode": { "SelectName": "小说名称节点 查询的名称 查询方式SelectType有:ById 0,ByName 1, ByClass 2,Select 3", "SelectType": 0 }, "BookNameText": { "TextFormat": null, "TextRegexStr": "取小说名称的正则" }, "ChapterPageNode": { "SelectName": "小说章节目录节点", "SelectType": 0 }, "ChapterUrl": { "TextFormat": "下载连接前面拼接的网址{0}", "TextRegexStr": "取下载连接的正则" }, "PageUrlCheckStr": "目录页面校验正则", "SkipCount": 0 }, "ChapterPage": { "ChapterTitleNode": { "SelectName": "小说章节标题节点", "SelectType": 2 }, "ChapterTitleText": { "TextFormat": null, "TextRegexStr": "取章节标题的正则" }, "ContentNode": { "SelectName": "content", "SelectType": 0 }, "ExcludeRegexStr": "正文中排除的字段(包含该字段的整行都排除)" }, "RootUrl": "小说所在的网站地址" }, { "BaseUrl": "www.biquge.cz", "CatalogPage": { "BookNameNode": { "SelectName": "info", "SelectType": 0 }, "BookNameText": { "TextFormat": null, "TextRegexStr": "(?<=<h1>).*?(?=</h1>)" }, "ChapterPageNode": { "SelectName": "list", "SelectType": 0 }, "ChapterUrl": { "TextFormat": "http://www.biquge.cz{0}", "TextRegexStr": "(?<=<a href=\").*?(?=\">)" }, "PageUrlCheckStr": "http://www.biquge.cz/\\d{1,4}/\\d{1,6}/", "SkipCount": 0 }, "ChapterPage": { "ChapterTitleNode": { "SelectName": "bookname", "SelectType": 2 }, "ChapterTitleText": { "TextFormat": null, "TextRegexStr": "(?<=<h1>).*?(?=</h1>)" }, "ContentNode": { "SelectName": "content", "SelectType": 0 }, "ExcludeRegexStr": "biquge.cz" }, "RootUrl": "http://www.biquge.cz" }, { "BaseUrl": "www.biquge.com.tw", "CatalogPage": { "BookNameNode": { "SelectName": "info", "SelectType": 0 }, "BookNameText": { "TextFormat": null, "TextRegexStr": "(?<=<h1>).*?(?=</h1>)" }, "ChapterPageNode": { "SelectName": "list", "SelectType": 0 }, "ChapterUrl": { "TextFormat": "http://www.biquge.com.tw{0}", "TextRegexStr": "(?<=<a href=\").*?(?=\">)" }, "PageUrlCheckStr": "http://www.biquge.com.tw/\\d{1,4}_\\d{1,6}/", "SkipCount": 0 }, "ChapterPage": { "ChapterTitleNode": { "SelectName": "bookname", "SelectType": 2 }, 
"ChapterTitleText": { "TextFormat": null, "TextRegexStr": "(?<=<h1>).*?(?=</h1>)" }, "ContentNode": { "SelectName": "content", "SelectType": 0 }, "ExcludeRegexStr": "biquge.com" }, "RootUrl": "http://www.biquge.com.tw" }, { "BaseUrl": "www.biquge.com", "CatalogPage": { "BookNameNode": { "SelectName": "info", "SelectType": 0 }, "BookNameText": { "TextFormat": null, "TextRegexStr": "(?<=<h1>).*?(?=</h1>)" }, "ChapterPageNode": { "SelectName": "list", "SelectType": 0 }, "ChapterUrl": { "TextFormat": "https://www.biqudu.com{0}", "TextRegexStr": "(?<=<a href=\").*?(?=\">)" }, "PageUrlCheckStr": "https://www.biqudu.com/\\d{1,4}_\\d{1,6}/", "SkipCount": 9 }, "ChapterPage": { "ChapterTitleNode": { "SelectName": "bookname", "SelectType": 2 }, "ChapterTitleText": { "TextFormat": null, "TextRegexStr": "(?<=<h1>).*?(?=</h1>)" }, "ContentNode": { "SelectName": "content", "SelectType": 0 }, "ExcludeRegexStr": "biquge.com" }, "RootUrl": "https://www.biqudu.com" } ]
第一步骤,将规则解析成实体
/// <summary>
/// Loads the site parsing rules from the JSON file "BookSiteConfig.ini" located in the
/// application's base directory.
/// </summary>
/// <returns>
/// The configured sites. Never null: an empty list is returned when the file does not
/// exist or when it deserializes to nothing (for example a literal JSON <c>null</c>).
/// </returns>
private static List<BookSiteEntity> GetBookEntity()
{
    string configPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "BookSiteConfig.ini");
    if (!File.Exists(configPath))
    {
        return new List<BookSiteEntity>();
    }

    // ReadAllText handles a possible byte-order mark; the text is then handed to the
    // serializer as plain UTF-8 bytes, exactly as before.
    string jsonString = File.ReadAllText(configPath);
    DataContractJsonSerializer serializer = new DataContractJsonSerializer(typeof(List<BookSiteEntity>));
    using (MemoryStream stream = new MemoryStream(Encoding.UTF8.GetBytes(jsonString)))
    {
        List<BookSiteEntity> sites = serializer.ReadObject(stream) as List<BookSiteEntity>;
        // Callers iterate the result directly, so never hand back null.
        return sites ?? new List<BookSiteEntity>();
    }
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

/// <summary>
/// Parsing rules for one novel site: how to recognise and read its catalog page and
/// its chapter pages. Instances are deserialized from the JSON rule file.
/// </summary>
public class BookSiteEntity
{
    /// <summary>
    /// Root URL of the site (the rule file stores it with the scheme, e.g. "http://www.biquge.cz").
    /// </summary>
    public string RootUrl { get; set; }

    /// <summary>
    /// Site address without the "http"/"https" prefix (e.g. "www.biquge.cz").
    /// </summary>
    public string BaseUrl { get; set; }

    /// <summary>
    /// Tells whether <paramref name="text"/> contains this site's <see cref="BaseUrl"/>.
    /// </summary>
    /// <param name="text">The text (typically a URL) to inspect; may be null.</param>
    /// <returns>
    /// True when <paramref name="text"/> contains <see cref="BaseUrl"/>. False when
    /// <paramref name="text"/> is null or <see cref="BaseUrl"/> is null or empty, because an
    /// unset base URL must not match every input.
    /// </returns>
    public bool CheckUrl(string text)
    {
        if (text == null || string.IsNullOrEmpty(BaseUrl))
        {
            return false;
        }

        return text.Contains(BaseUrl);
    }

    /// <summary>
    /// Rules for the catalog (table of contents) page.
    /// </summary>
    public CatalogPage CatalogPage { get; set; }

    /// <summary>
    /// Rules for an individual chapter page.
    /// </summary>
    public ChapterPage ChapterPage { get; set; }
}

/// <summary>
/// How a node is located in the parsed page. The rule file stores these as numbers
/// (ById = 0, ByName = 1, ByClass = 2, Select = 3, ByRegex = 4), so the member order
/// must not change.
/// </summary>
public enum SelectType
{
    ById,
    ByName,
    ByClass,
    Select,
    ByRegex
}
第二步骤,检查目录页面地址是否存在对应的解析规则
// Find the first configured site whose catalog-URL pattern matches the requested url.
// bookSiteEntity stays null when no rule set applies.
BookSiteEntity bookSiteEntity = null;
List<BookSiteEntity> bookSites = GetBookEntity();
foreach (BookSiteEntity candidate in bookSites)
{
    CatalogPage catalog = candidate.CatalogPage;

    // A site without a catalog rule or without a URL pattern can never match.
    if (catalog == null || string.IsNullOrEmpty(catalog.PageUrlCheckStr))
    {
        continue;
    }

    if (catalog.IsCatalogPageUrl(url))
    {
        bookSiteEntity = candidate;
        break;
    }
}
第三步骤,根据规则下载和解析文件
if (bookSiteEntity == null)
{
    MessageBox.Show("未找到匹配的下载模板,无法下载");
}
else
{
    // Fetch and parse the catalog page, then pull the book name and chapter links out of it.
    string html = HtmlHelper.GetHtml(url);
    NSoup.Nodes.Document document = NSoup.NSoupClient.Parse(html);
    string bookName = bookSiteEntity.CatalogPage.GetBookName(document);
    List<string> chapterLinks = bookSiteEntity.CatalogPage.GetChapterLinks(document);

    // Both lookups come back null/empty when the configured nodes are not on the page;
    // starting a download in that state would fail later on a background thread.
    if (string.IsNullOrEmpty(bookName) || chapterLinks == null || chapterLinks.Count == 0)
    {
        MessageBox.Show("未能解析出书名或章节目录,无法下载");
    }
    else
    {
        // The book name comes straight from the page and may contain characters that
        // are not legal in a file name; replace them so the output file can be created.
        string safeName = bookName.Trim();
        foreach (char invalid in Path.GetInvalidFileNameChars())
        {
            safeName = safeName.Replace(invalid, '_');
        }

        // Start from a fresh output file next to the executable.
        string filePath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, safeName + ".txt");
        if (File.Exists(filePath))
        {
            File.Delete(filePath);
        }

        DownloadToFile download = new DownloadToFile(filePath, 50, chapterLinks, bookSiteEntity.ChapterPage);
        download.Start();

        // Progress is reported from a separate thread that receives the download object.
        Thread thread = new Thread(GetRunLog);
        thread.Start(download);
    }
}
其他核心代码:
using NSoup.Nodes;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

/// <summary>
/// Rules for a novel's catalog (table of contents) page: how to recognise its URL and
/// how to extract the book name and the chapter links from the parsed page.
/// </summary>
public class CatalogPage
{
    /// <summary>
    /// Regular expression used to decide whether a URL is a catalog page of this site.
    /// </summary>
    public string PageUrlCheckStr { get; set; }

    /// <summary>
    /// Tests <paramref name="url"/> against <see cref="PageUrlCheckStr"/>.
    /// </summary>
    /// <param name="url">The URL to test; a null URL never matches.</param>
    /// <returns>True when the pattern matches somewhere in the URL.</returns>
    public bool IsCatalogPageUrl(string url)
    {
        if (url == null)
        {
            return false;
        }

        Regex pattern = new Regex(this.PageUrlCheckStr);
        return pattern.IsMatch(url);
    }

    /// <summary>
    /// Locates the node that contains the book name.
    /// </summary>
    public NodeElement BookNameNode { get; set; }

    /// <summary>
    /// Extracts the book name from the HTML of <see cref="BookNameNode"/>.
    /// </summary>
    public TextElement BookNameText { get; set; }

    /// <summary>
    /// Locates the node that contains the chapter list.
    /// </summary>
    public NodeElement ChapterPageNode { get; set; }

    /// <summary>
    /// Extracts (and formats) the chapter links from the HTML of <see cref="ChapterPageNode"/>.
    /// </summary>
    public TextElement ChapterUrl { get; set; }

    /// <summary>
    /// Number of leading links to drop from the extracted chapter list. Defaults to 0.
    /// </summary>
    public int SkipCount { get; set; }

    /// <summary>
    /// Reads the book name from the parsed catalog page.
    /// </summary>
    /// <param name="root">Root element of the parsed page.</param>
    /// <returns>
    /// The extracted book name, or null when the rule is incomplete or the configured
    /// node is not present on the page.
    /// </returns>
    public string GetBookName(Element root)
    {
        if (BookNameNode == null || BookNameText == null)
        {
            return null;
        }

        // Look the node up once and reuse it instead of querying the document twice.
        Element nameNode = BookNameNode.GetElement(root);
        if (nameNode == null)
        {
            return null;
        }

        return BookNameText.getText(nameNode.OuterHtml());
    }

    /// <summary>
    /// Reads the chapter links from the parsed catalog page.
    /// </summary>
    /// <param name="root">Root element of the parsed page.</param>
    /// <returns>
    /// The chapter links with the leading entries removed, or null when the rule is
    /// incomplete or the configured node is not present on the page.
    /// </returns>
    public List<string> GetChapterLinks(Element root)
    {
        if (ChapterPageNode == null || ChapterUrl == null)
        {
            return null;
        }

        Element listNode = ChapterPageNode.GetElement(root);
        if (listNode == null)
        {
            return null;
        }

        List<string> links = ChapterUrl.getListText(listNode.OuterHtml());

        // Normally the first SkipCount links are dropped. When the page has fewer links
        // than that, the first half is dropped instead.
        // NOTE(review): presumably the skipped links are a "latest chapters" preview that
        // precedes the full list - confirm against the target sites.
        int toSkip = links.Count < SkipCount ? links.Count / 2 : SkipCount;
        return links.Skip(toSkip).ToList();
    }
}
using NSoup.Nodes;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

/// <summary>
/// Rules for a chapter page: where the title and the body text live and which body
/// lines must be left out.
/// </summary>
public class ChapterPage
{
    /// <summary>
    /// Locates the node that contains the chapter title.
    /// </summary>
    public NodeElement ChapterTitleNode { get; set; }

    /// <summary>
    /// Extracts the chapter title from the HTML of <see cref="ChapterTitleNode"/>.
    /// </summary>
    public TextElement ChapterTitleText { get; set; }

    /// <summary>
    /// Reads the chapter title from the parsed chapter page.
    /// </summary>
    /// <param name="root">Root element of the parsed page.</param>
    /// <returns>
    /// The extracted title, or null when the rule is incomplete or the configured node
    /// is not present on the page.
    /// </returns>
    public string GetChapterTitle(Element root)
    {
        if (ChapterTitleNode == null || ChapterTitleText == null)
        {
            return null;
        }

        // Look the node up once and reuse it instead of querying the document twice.
        Element titleNode = ChapterTitleNode.GetElement(root);
        if (titleNode == null)
        {
            return null;
        }

        return ChapterTitleText.getText(titleNode.OuterHtml());
    }

    /// <summary>
    /// Locates the node that contains the chapter body text.
    /// </summary>
    public NodeElement ContentNode { get; set; }

    /// <summary>
    /// Regular expression for body lines to exclude: a line of text that matches it is
    /// left out of the output.
    /// </summary>
    public string ExcludeRegexStr { get; set; }
}
using NSoup.Nodes;
using NSoup.Select;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

/// <summary>
/// Describes how to locate a node inside a parsed page: a lookup mode plus the value
/// (id, class name or selector) to look for.
/// </summary>
public class NodeElement
{
    /// <summary>
    /// Lookup mode. ById and ByClass have dedicated handling; every other value
    /// (Select, ByName, ByRegex) is passed to the element's Select query.
    /// </summary>
    public SelectType SelectType { get; set; }

    /// <summary>
    /// The id, class name or selector to look for, depending on <see cref="SelectType"/>.
    /// </summary>
    public string SelectName { get; set; }

    /// <summary>
    /// Finds the first node under <paramref name="root"/> that matches this rule.
    /// </summary>
    /// <param name="root">Element to search in.</param>
    /// <returns>
    /// The first matching element, or null when nothing matches, when
    /// <paramref name="root"/> is null, or when <see cref="SelectName"/> is not set.
    /// </returns>
    public Element GetElement(Element root)
    {
        if (root == null || string.IsNullOrEmpty(this.SelectName))
        {
            return null;
        }

        switch (this.SelectType)
        {
            case SelectType.ById:
                return root.GetElementById(this.SelectName);
            case SelectType.ByClass:
                return FirstOrNull(root.GetElementsByClass(this.SelectName));
            case SelectType.Select:
            default:
                return FirstOrNull(root.Select(this.SelectName));
        }
    }

    /// <summary>
    /// Finds every node under <paramref name="root"/> that matches this rule.
    /// </summary>
    /// <param name="root">Element to search in.</param>
    /// <returns>
    /// The matching elements. For ById the collection holds at most one element and is
    /// empty when the id is not found. An empty collection is also returned when
    /// <paramref name="root"/> is null or <see cref="SelectName"/> is not set.
    /// </returns>
    public Elements GetElements(Element root)
    {
        if (root == null || string.IsNullOrEmpty(this.SelectName))
        {
            return new Elements();
        }

        switch (this.SelectType)
        {
            case SelectType.ById:
                Elements single = new Elements();
                Element found = root.GetElementById(this.SelectName);
                // Do not put a null entry into the collection when the id is missing.
                if (found != null)
                {
                    single.Insert(0, found);
                }
                return single;
            case SelectType.ByClass:
                return root.GetElementsByClass(this.SelectName);
            case SelectType.Select:
            default:
                return root.Select(this.SelectName);
        }
    }

    // Returns the first element of a result set, or null when the set is null or empty.
    private static Element FirstOrNull(Elements candidates)
    {
        if (candidates != null && candidates.Count > 0)
        {
            return candidates[0];
        }

        return null;
    }
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

/// <summary>
/// Extracts text from an HTML fragment with a regular expression and optionally wraps
/// each extracted value in a format string (for example to prepend the site root to a
/// relative link).
/// </summary>
public class TextElement
{
    /// <summary>
    /// Regular expression whose whole match (group 0) is the extracted value.
    /// </summary>
    public string TextRegexStr { get; set; }

    /// <summary>
    /// Optional composite format applied to each extracted value, with "{0}" standing
    /// for the value. When null or empty the value is returned unchanged.
    /// </summary>
    public string TextFormat { get; set; }

    /// <summary>
    /// Extracts the first match of <see cref="TextRegexStr"/> from <paramref name="str"/>.
    /// </summary>
    /// <param name="str">The text (HTML) to search; may be null.</param>
    /// <returns>
    /// The first match, run through <see cref="TextFormat"/> when one is set. An empty
    /// string when <paramref name="str"/> is null or nothing matches; the format is NOT
    /// applied in that case, so a failed match never yields a bare format prefix.
    /// The result is never null.
    /// </returns>
    public string getText(string str)
    {
        if (str == null)
        {
            return string.Empty;
        }

        Match match = new Regex(this.TextRegexStr).Match(str);
        // Regex.Match never returns null and always exposes group 0, so Success is the
        // only reliable signal that something was found.
        if (!match.Success)
        {
            return string.Empty;
        }

        return ApplyFormat(match.Groups[0].Value);
    }

    /// <summary>
    /// Extracts every match of <see cref="TextRegexStr"/> from <paramref name="str"/>.
    /// </summary>
    /// <param name="str">The text (HTML) to search; may be null.</param>
    /// <returns>
    /// The matches in document order, each run through <see cref="TextFormat"/> when one
    /// is set. Empty when <paramref name="str"/> is null or nothing matches.
    /// </returns>
    public List<string> getListText(string str)
    {
        List<string> result = new List<string>();
        if (str == null)
        {
            return result;
        }

        foreach (Match match in new Regex(this.TextRegexStr).Matches(str))
        {
            result.Add(ApplyFormat(match.Groups[0].Value));
        }

        return result;
    }

    // Wraps a matched value in TextFormat, or returns it unchanged when no format is set.
    private string ApplyFormat(string value)
    {
        if (string.IsNullOrEmpty(TextFormat))
        {
            return value;
        }

        return string.Format(TextFormat, value);
    }
}
网页下载类,每个人都有自己的下载类,可以按照自己的需求修改
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text;

/// <summary>
/// Minimal page downloader used by the crawler.
/// </summary>
public class HtmlHelper
{
    // Upper bound on download attempts for one URL, so a permanently failing address
    // cannot keep a thread retrying forever.
    private const int MaxAttempts = 10;

    // Base pause between attempts; the pause grows linearly with the attempt number.
    private const int RetryDelayMilliseconds = 500;

    /// <summary>
    /// Downloads a page with an HTTP GET and returns its body as text.
    /// </summary>
    /// <param name="priceUrl">Absolute URL of the page to download.</param>
    /// <returns>The response body decoded with <see cref="Encoding.Default"/>.</returns>
    /// <exception cref="WebException">
    /// The request still failed after the maximum number of attempts.
    /// </exception>
    /// <exception cref="System.IO.IOException">
    /// Reading the response still failed after the maximum number of attempts.
    /// </exception>
    /// <remarks>
    /// Only network-level failures are retried. Errors that retrying cannot fix, such as
    /// a null or malformed URL, are thrown immediately.
    /// </remarks>
    public static string GetHtml(string priceUrl)
    {
        for (int attempt = 1; ; attempt++)
        {
            try
            {
                return DownloadOnce(priceUrl);
            }
            catch (WebException)
            {
                if (attempt >= MaxAttempts)
                {
                    throw;
                }
            }
            catch (System.IO.IOException)
            {
                if (attempt >= MaxAttempts)
                {
                    throw;
                }
            }

            // Back off a little more after every failure instead of hammering the server.
            System.Threading.Thread.Sleep(RetryDelayMilliseconds * attempt);
        }
    }

    // Performs a single GET request and reads the whole response body.
    private static string DownloadOnce(string url)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Accept = "*/*";
        request.ContentType = "text/html;charset=GBK";
        request.AllowAutoRedirect = false;
        request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36";
        request.Timeout = 30 * 1000;
        request.KeepAlive = false;
        request.Method = "GET";

        // The using blocks close the response and the reader on success and on failure.
        // NOTE(review): Encoding.Default is the machine's default encoding; the request
        // header mentions GBK, so this presumably relies on a Chinese-locale system - confirm.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (System.IO.StreamReader reader = new System.IO.StreamReader(response.GetResponseStream(), Encoding.Default))
        {
            return reader.ReadToEnd();
        }
    }
}
小说txt文件生成类:
using NSoup.Nodes;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;

namespace bookDown
{
    /// <summary>
    /// Downloads a list of chapter pages on several worker threads and writes the
    /// extracted text to one file, in chapter order.
    /// </summary>
    /// <remarks>
    /// Workers store finished chapters in a dictionary keyed by the 1-based chapter
    /// index. A single writer thread takes them out in ascending order, so the file is
    /// ordered no matter which download finishes first.
    /// </remarks>
    public class DownloadToFile
    {
        private const int DefaultThreadCount = 50;
        private const int MaxThreadCount = 100;

        // How long the writer sleeps when the next chapter has not been downloaded yet.
        private const int WriterPollMilliseconds = 50;

        private readonly int threadCount;
        private readonly List<string> urls;
        private readonly string filePath;
        private readonly int indexMax;
        private readonly Dictionary<int, string> dic = new Dictionary<int, string>();
        private readonly ChapterPage chapterPage;
        private readonly Regex excludeRegex;
        private readonly List<Thread> threads = new List<Thread>();

        // Number of chapters already written to the file; read by the progress methods
        // from other threads, hence volatile.
        private volatile int writtenCount;

        // Set once the writer has written every chapter and closed the file.
        private volatile bool finished;

        /// <summary>
        /// Tells whether every chapter has been written and the output file is closed.
        /// </summary>
        public bool isOver()
        {
            return finished;
        }

        /// <summary>
        /// Returns a progress message: the completion text once the download is over,
        /// otherwise the number of chapters written so far and the number remaining.
        /// </summary>
        public string getLog()
        {
            if (finished)
            {
                return "下载完成";
            }

            int done = writtenCount;
            return "当前完成" + done + "个下载任务,还剩余" + (indexMax - done);
        }

        /// <summary>
        /// Prepares a download; nothing is started until <see cref="Start"/> is called.
        /// </summary>
        /// <param name="_filePath">Path of the text file to create.</param>
        /// <param name="_threadCount">
        /// Requested number of worker threads. Values above 100 are reduced to 100, values
        /// of zero or less fall back to 50, and no more workers are used than there are
        /// chapters.
        /// </param>
        /// <param name="_urls">Chapter page URLs in reading order.</param>
        /// <param name="_chapterPage">Rules used to extract title and body from each page.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="_filePath"/>, <paramref name="_urls"/> or
        /// <paramref name="_chapterPage"/> is null.
        /// </exception>
        public DownloadToFile(string _filePath, int _threadCount, List<string> _urls, ChapterPage _chapterPage)
        {
            if (_filePath == null)
            {
                throw new ArgumentNullException("_filePath");
            }
            if (_urls == null)
            {
                throw new ArgumentNullException("_urls");
            }
            if (_chapterPage == null)
            {
                throw new ArgumentNullException("_chapterPage");
            }

            int requested = _threadCount;
            if (requested > MaxThreadCount)
            {
                requested = MaxThreadCount;
            }
            else if (requested <= 0)
            {
                // Zero workers would leave the writer waiting forever.
                requested = DefaultThreadCount;
            }

            urls = _urls;
            filePath = _filePath;
            indexMax = urls.Count;
            chapterPage = _chapterPage;
            threadCount = Math.Min(requested, indexMax);

            // Built once and shared by all workers. An empty pattern would match (and so
            // exclude) every line, so it is treated as "no exclusion".
            if (!string.IsNullOrEmpty(_chapterPage.ExcludeRegexStr))
            {
                excludeRegex = new Regex(_chapterPage.ExcludeRegexStr);
            }
        }

        /// <summary>
        /// Starts the worker threads and the writer thread, then returns immediately.
        /// </summary>
        public void Start()
        {
            for (int i = 0; i < threadCount; i++)
            {
                Thread worker = new Thread(GetText);
                worker.Name = i.ToString();
                threads.Add(worker);
                worker.Start(i);
            }

            Thread threadCreateBook = new Thread(CreateBookFile);
            threadCreateBook.Start(filePath);
        }

        /// <summary>
        /// Not implemented: currently does nothing.
        /// </summary>
        public void Stop()
        {
        }

        // Writer thread body: writes chapters 1..indexMax to the file in order, waiting
        // for each one to be downloaded. obj is the output file path.
        private void CreateBookFile(object obj)
        {
            string path = obj.ToString();

            // FileMode.Create truncates an existing file so no stale content can remain.
            using (FileStream fs = new FileStream(path, FileMode.Create))
            using (StreamWriter sw = new StreamWriter(fs, Encoding.Default))
            {
                int next = 1;
                // "<=" so the final chapter (index == indexMax) is written as well.
                while (next <= indexMax)
                {
                    string text = RemoveDic(next);
                    if (text == null)
                    {
                        // Not downloaded yet: yield instead of spinning at full speed.
                        Thread.Sleep(WriterPollMilliseconds);
                        continue;
                    }

                    sw.Write(text);
                    writtenCount = next;
                    next++;
                }
            }

            // Every chapter has been handed over, so the workers are at the end of their
            // loops; wait for them to exit rather than aborting them.
            foreach (Thread worker in threads)
            {
                worker.Join();
            }

            finished = true;
        }

        // Worker thread body. obj is the zero-based worker number; worker t handles the
        // chapters t+1, t+1+threadCount, t+1+2*threadCount, ...
        private void GetText(object obj)
        {
            int threadIndex = (int)obj;
            for (int index = threadIndex + 1; index <= indexMax; index += threadCount)
            {
                string url = urls[index - 1];
                string chapterText;
                try
                {
                    chapterText = DownloadChapter(url);
                }
                catch (Exception ex)
                {
                    // One bad chapter must not take the process down or leave the writer
                    // waiting forever for this index; record the failure in the book.
                    chapterText = "\r\n[下载失败] " + url + " : " + ex.Message + "\r\n";
                }

                AddDic(index, chapterText);
            }
        }

        // Downloads one chapter page and turns it into plain text: a title line followed
        // by one tab-indented line per text node of the content element.
        private string DownloadChapter(string url)
        {
            string html = HtmlHelper.GetHtml(url);
            NSoup.Nodes.Document document = NSoup.NSoupClient.Parse(html);
            Element body = document.Body;
            StringBuilder sb = new StringBuilder();

            // Title; a page without a recognisable title gets an empty title line.
            string title = chapterPage.GetChapterTitle(body);
            sb.Append("\r\n" + (title ?? string.Empty).Trim() + "\r\n");

            // Body text.
            Element content = chapterPage.ContentNode == null ? null : chapterPage.ContentNode.GetElement(body);
            if (content == null)
            {
                throw new InvalidOperationException("content node not found");
            }

            foreach (Node node in content.ChildNodes)
            {
                if (node.NodeName != "#text")
                {
                    continue;
                }

                // NOTE(review): the replaced string is a single space as published; it may
                // have been an HTML entity before the article was rendered - confirm.
                string text = node.OuterHtml().Replace(" ", "").Trim();
                if (string.IsNullOrEmpty(text))
                {
                    continue;
                }

                // Drop lines that match the site's exclusion pattern.
                if (excludeRegex != null && excludeRegex.IsMatch(text))
                {
                    continue;
                }

                sb.Append("\t" + text + "\r\n");
            }

            return sb.ToString();
        }

        // Stores the text of a finished chapter under its 1-based index.
        private void AddDic(int index, string text)
        {
            lock (dic)
            {
                dic.Add(index, text);
            }
        }

        // Takes the text of a chapter out of the store; returns null when that chapter
        // has not been downloaded yet.
        private string RemoveDic(int index)
        {
            lock (dic)
            {
                string text;
                if (dic.TryGetValue(index, out text))
                {
                    dic.Remove(index);
                    return text;
                }
            }

            return null;
        }
    }
}
共享出该代码主要目的是,希望可以通过大家的力量丰富规则解析文件,可以适配更多的网站

浙公网安备 33010602011771号