小说下载爬虫(一)

C# 程序

小说爬虫的核心在于对小说下载页面的解析和自动适配解析规则

解析规则文件

BookSiteConfig.ini  (json格式)

[
     {
        "BaseUrl": "小说所在的网站地址 不要http和https",
        "CatalogPage": {
            "BookNameNode": {
                "SelectName": "小说名称节点 查询的名称 查询方式SelectType有:ById 0,ByName 1,ByClass 2,Select 3,ByRegex 4",
                "SelectType": 0
            },
            "BookNameText": {
                "TextFormat": null,
                "TextRegexStr": "取小说名称的正则"
            },
            "ChapterPageNode": {
                "SelectName": "小说章节目录节点",
                "SelectType": 0
            },
            "ChapterUrl": {
                "TextFormat": "下载连接前面拼接的网址{0}",
                "TextRegexStr": "取下载连接的正则"
            },
            "PageUrlCheckStr": "目录页面校验正则",
            "SkipCount": 0
        },
        "ChapterPage": {
            "ChapterTitleNode": {
                "SelectName": "小说章节标题节点",
                "SelectType": 2
            },
            "ChapterTitleText": {
                "TextFormat": null,
                "TextRegexStr": "取章节标题的正则"
            },
            "ContentNode": {
                "SelectName": "content",
                "SelectType": 0
            },
            "ExcludeRegexStr": "正文中排除的字段(包含该字段的整行都排除)"
        },
        "RootUrl": "小说所在的网站地址"
    },
    {
        "BaseUrl": "www.biquge.cz",
        "CatalogPage": {
            "BookNameNode": {
                "SelectName": "info",
                "SelectType": 0
            },
            "BookNameText": {
                "TextFormat": null,
                "TextRegexStr": "(?<=<h1>).*?(?=</h1>)"
            },
            "ChapterPageNode": {
                "SelectName": "list",
                "SelectType": 0
            },
            "ChapterUrl": {
                "TextFormat": "http://www.biquge.cz{0}",
                "TextRegexStr": "(?<=<a href=\").*?(?=\">)"
            },
            "PageUrlCheckStr": "http://www.biquge.cz/\\d{1,4}/\\d{1,6}/",
            "SkipCount": 0
        },
        "ChapterPage": {
            "ChapterTitleNode": {
                "SelectName": "bookname",
                "SelectType": 2
            },
            "ChapterTitleText": {
                "TextFormat": null,
                "TextRegexStr": "(?<=<h1>).*?(?=</h1>)"
            },
            "ContentNode": {
                "SelectName": "content",
                "SelectType": 0
            },
            "ExcludeRegexStr": "biquge.cz"
        },
        "RootUrl": "http://www.biquge.cz"
    },
    {
        "BaseUrl": "www.biquge.com.tw",
        "CatalogPage": {
            "BookNameNode": {
                "SelectName": "info",
                "SelectType": 0
            },
            "BookNameText": {
                "TextFormat": null,
                "TextRegexStr": "(?<=<h1>).*?(?=</h1>)"
            },
            "ChapterPageNode": {
                "SelectName": "list",
                "SelectType": 0
            },
            "ChapterUrl": {
                "TextFormat": "http://www.biquge.com.tw{0}",
                "TextRegexStr": "(?<=<a href=\").*?(?=\">)"
            },
            "PageUrlCheckStr": "http://www.biquge.com.tw/\\d{1,4}_\\d{1,6}/",
            "SkipCount": 0
        },
        "ChapterPage": {
            "ChapterTitleNode": {
                "SelectName": "bookname",
                "SelectType": 2
            },
            "ChapterTitleText": {
                "TextFormat": null,
                "TextRegexStr": "(?<=<h1>).*?(?=</h1>)"
            },
            "ContentNode": {
                "SelectName": "content",
                "SelectType": 0
            },
            "ExcludeRegexStr": "biquge.com"
        },
        "RootUrl": "http://www.biquge.com.tw"
    },
    {
        "BaseUrl": "www.biquge.com",
        "CatalogPage": {
            "BookNameNode": {
                "SelectName": "info",
                "SelectType": 0
            },
            "BookNameText": {
                "TextFormat": null,
                "TextRegexStr": "(?<=<h1>).*?(?=</h1>)"
            },
            "ChapterPageNode": {
                "SelectName": "list",
                "SelectType": 0
            },
            "ChapterUrl": {
                "TextFormat": "https://www.biqudu.com{0}",
                "TextRegexStr": "(?<=<a href=\").*?(?=\">)"
            },
            "PageUrlCheckStr": "https://www.biqudu.com/\\d{1,4}_\\d{1,6}/",
            "SkipCount": 9
        },
        "ChapterPage": {
            "ChapterTitleNode": {
                "SelectName": "bookname",
                "SelectType": 2
            },
            "ChapterTitleText": {
                "TextFormat": null,
                "TextRegexStr": "(?<=<h1>).*?(?=</h1>)"
            },
            "ContentNode": {
                "SelectName": "content",
                "SelectType": 0
            },
            "ExcludeRegexStr": "biquge.com"
        },
        "RootUrl": "https://www.biqudu.com"
    }
]
解析规则

 

第一步骤,将规则解析成实体

        /// <summary>
        /// Loads the site parsing rules from <c>BookSiteConfig.ini</c> (a JSON array) located in the
        /// application's base directory.
        /// </summary>
        /// <returns>
        /// The deserialized rule list. Never null: an empty list is returned when the file does not
        /// exist or when its content deserializes to null.
        /// </returns>
        private static List<BookSiteEntity> GetBookEntity()
        {
            string _filePath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "BookSiteConfig.ini");
            if (!File.Exists(_filePath))
            {
                return new List<BookSiteEntity>();
            }

            string jsonString = File.ReadAllText(_filePath);
            DataContractJsonSerializer serializer = new DataContractJsonSerializer(typeof(List<BookSiteEntity>));

            // Dispose the temporary stream deterministically instead of leaving it to the GC.
            using (MemoryStream mStream = new MemoryStream(Encoding.UTF8.GetBytes(jsonString)))
            {
                // The "as" cast yields null when the payload is not a list (for example the JSON
                // literal null); callers iterate the result, so hand back an empty list instead.
                List<BookSiteEntity> list = serializer.ReadObject(mStream) as List<BookSiteEntity>;
                return list ?? new List<BookSiteEntity>();
            }
        }
规则转实体
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

 /// <summary>
 /// Parsing rules for one supported site: where the site lives and how to read its
 /// catalog (table of contents) page and its chapter pages.
 /// </summary>
        public class BookSiteEntity
        {
            /// <summary>
            /// Site URL. The sample configuration stores it with the scheme,
            /// e.g. "http://www.biquge.cz".
            /// </summary>
            public string RootUrl { get; set; }

            /// <summary>
            /// Site address used by <see cref="CheckUrl"/>. The sample configuration stores it
            /// without "http"/"https", e.g. "www.biquge.cz".
            /// </summary>
            public string BaseUrl { get; set; }

            /// <summary>
            /// Tells whether <paramref name="text"/> contains <see cref="BaseUrl"/>.
            /// </summary>
            /// <param name="text">Text (typically a URL) to inspect.</param>
            /// <returns>
            /// True when <paramref name="text"/> contains <see cref="BaseUrl"/>; false when either
            /// value is null or empty.
            /// </returns>
            public bool CheckUrl(string text)
            {
                // An empty BaseUrl would be "contained" in every string, and a null on either side
                // would throw; neither can be a meaningful match.
                if (string.IsNullOrEmpty(text) || string.IsNullOrEmpty(BaseUrl))
                {
                    return false;
                }

                // Host names are case-insensitive, so compare without regard to case.
                return text.IndexOf(BaseUrl, StringComparison.OrdinalIgnoreCase) >= 0;
            }

            /// <summary>
            /// Rules for the catalog (table of contents) page.
            /// </summary>
            public CatalogPage CatalogPage { get; set; }

            /// <summary>
            /// Rules for a single chapter page.
            /// </summary>
            public ChapterPage ChapterPage { get; set; }
        }
        /// <summary>
        /// How a configured node is located in the parsed HTML document. The numeric values are
        /// the ones written as "SelectType" in the JSON rule file.
        /// </summary>
        public enum SelectType
        {
            /// <summary>Look the node up by its element id.</summary>
            ById = 0,

            /// <summary>Look the node up by name.</summary>
            ByName = 1,

            /// <summary>Look the node up by CSS class.</summary>
            ByClass = 2,

            /// <summary>Look the node up with a selector query.</summary>
            Select = 3,

            /// <summary>Look the node up with a regular expression.</summary>
            ByRegex = 4
        }
BookSiteEntity

 

第二步骤,检查目录页面的网址是否存在对应的解析规则

            // Pick the first configured site whose catalog-URL pattern matches the requested url.
            // bookSiteEntity stays null when no rule applies.
            BookSiteEntity bookSiteEntity = null;
            List<BookSiteEntity> bookSites = GetBookEntity();
            foreach (BookSiteEntity candidate in bookSites)
            {
                // Rules without a catalog section or without a URL pattern cannot be matched.
                if (candidate.CatalogPage == null || string.IsNullOrEmpty(candidate.CatalogPage.PageUrlCheckStr))
                {
                    continue;
                }

                if (!candidate.CatalogPage.IsCatalogPageUrl(url))
                {
                    continue;
                }

                bookSiteEntity = candidate;
                break;
            }
匹配规则

第三步骤,根据规则下载和解析文件

            if (bookSiteEntity != null)
            {
                // Fetch and parse the catalog page, then extract the book name and chapter links
                // according to the matched site rules.
                string html = HtmlHelper.GetHtml(url);
                NSoup.Nodes.Document document = NSoup.NSoupClient.Parse(html);
                string bookName = bookSiteEntity.CatalogPage.GetBookName(document);
                List<string> chapterLinks = bookSiteEntity.CatalogPage.GetChapterLinks(document);

                if (chapterLinks == null || chapterLinks.Count == 0)
                {
                    // GetChapterLinks returns null when the chapter node is missing; passing that
                    // on would make the DownloadToFile constructor throw on urls.Count.
                    MessageBox.Show("未能解析出章节目录,无法下载");
                }
                else
                {
                    // The book name is scraped from the page, so it may be missing or contain
                    // characters that are not allowed in a file name.
                    string safeName = string.IsNullOrEmpty(bookName) ? string.Empty : bookName.Trim();
                    foreach (char invalid in Path.GetInvalidFileNameChars())
                    {
                        safeName = safeName.Replace(invalid.ToString(), string.Empty);
                    }
                    if (safeName.Length == 0)
                    {
                        safeName = "book";
                    }

                    // Start from a fresh output file next to the executable.
                    string filePath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, safeName + ".txt");
                    if (File.Exists(filePath))
                    {
                        File.Delete(filePath);
                    }

                    // Download with 50 worker threads; a separate thread runs GetRunLog with the
                    // downloader as its argument.
                    DownloadToFile download = new DownloadToFile(filePath, 50, chapterLinks, bookSiteEntity.ChapterPage);
                    download.Start();
                    Thread thread = new Thread(GetRunLog);
                    thread.Start(download);
                }
            }
            else
            {
                MessageBox.Show("未找到匹配的下载模板,无法下载");
            }
开始下载

 

 

其他核心代码:

using NSoup.Nodes;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

 /// <summary>
 /// Rules for a book's catalog (table of contents) page: how to recognise the page URL,
 /// where the book name lives and how to collect the chapter links.
 /// </summary>
    public class CatalogPage
    {
        /// <summary>
        /// Regular expression used to recognise a catalog page URL.
        /// </summary>
        public string PageUrlCheckStr { get; set; }

        /// <summary>
        /// Tests <paramref name="url"/> against <see cref="PageUrlCheckStr"/>.
        /// </summary>
        /// <param name="url">URL to test.</param>
        /// <returns>
        /// True when the pattern matches; false when the url is null or no pattern is configured.
        /// </returns>
        public bool IsCatalogPageUrl(string url)
        {
            // Regex throws on a null pattern or input, and an empty pattern matches every URL;
            // treat both as "not a catalog page of this site".
            if (url == null || string.IsNullOrEmpty(this.PageUrlCheckStr))
            {
                return false;
            }

            return new Regex(this.PageUrlCheckStr).IsMatch(url);
        }

        /// <summary>
        /// Node that contains the book name.
        /// </summary>
        public NodeElement BookNameNode { get; set; }

        /// <summary>
        /// How the book name is extracted from the book-name node's HTML.
        /// </summary>
        public TextElement BookNameText { get; set; }

        /// <summary>
        /// Node that contains the chapter list.
        /// </summary>
        public NodeElement ChapterPageNode { get; set; }

        /// <summary>
        /// How the chapter links are extracted from the chapter-list node's HTML.
        /// </summary>
        public TextElement ChapterUrl { get; set; }

        /// <summary>
        /// Number of leading links to drop from the extracted chapter list. Defaults to 0.
        /// </summary>
        public int SkipCount { get; set; }

        /// <summary>
        /// Extracts the book name from the catalog page.
        /// </summary>
        /// <param name="root">Root element of the parsed catalog page.</param>
        /// <returns>
        /// The extracted name, or null when the book-name rules are not configured or the node
        /// cannot be found.
        /// </returns>
        public string GetBookName(Element root)
        {
            if (BookNameNode == null || BookNameText == null)
            {
                return null;
            }

            // Look the node up once; the lookup walks the document.
            Element nameNode = BookNameNode.GetElement(root);
            if (nameNode == null)
            {
                return null;
            }

            return BookNameText.getText(nameNode.OuterHtml());
        }

        /// <summary>
        /// Collects the chapter links from the catalog page.
        /// </summary>
        /// <param name="root">Root element of the parsed catalog page.</param>
        /// <returns>
        /// The chapter links with the leading entries skipped, or null when the chapter rules
        /// are not configured or the chapter node cannot be found.
        /// </returns>
        public List<string> GetChapterLinks(Element root)
        {
            if (ChapterPageNode == null || ChapterUrl == null)
            {
                return null;
            }

            Element listNode = ChapterPageNode.GetElement(root);
            if (listNode == null)
            {
                return null;
            }

            List<string> list = ChapterUrl.getListText(listNode.OuterHtml());

            // Normally drop the first SkipCount links. When the page has fewer links than that,
            // drop the first half instead.
            // NOTE(review): presumably the skipped links are a duplicated "latest chapters"
            // section at the top of the list - confirm against the target sites.
            int skip = list.Count < SkipCount ? list.Count / 2 : SkipCount;
            return list.Skip(skip).ToList();
        }
    }
CatalogPage.cs
using NSoup.Nodes;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

 /// <summary>
 /// Rules for a single chapter page: where the title and the body text live and which
 /// body lines must be dropped.
 /// </summary>
public class ChapterPage
{
    /// <summary>
    /// Node that contains the chapter title.
    /// </summary>
    public NodeElement ChapterTitleNode { get; set; }

    /// <summary>
    /// How the chapter title is extracted from the title node's HTML.
    /// </summary>
    public TextElement ChapterTitleText { get; set; }

    /// <summary>
    /// Extracts the chapter title from a chapter page.
    /// </summary>
    /// <param name="root">Root element of the parsed chapter page.</param>
    /// <returns>
    /// The extracted title, or null when the title rules are not configured or the title node
    /// cannot be found.
    /// </returns>
    public string GetChapterTitle(Element root)
    {
        if (ChapterTitleNode == null || ChapterTitleText == null)
        {
            return null;
        }

        // Look the node up once; the lookup walks the document.
        Element titleNode = ChapterTitleNode.GetElement(root);
        if (titleNode == null)
        {
            return null;
        }

        return ChapterTitleText.getText(titleNode.OuterHtml());
    }

    /// <summary>
    /// Node that contains the chapter body text.
    /// </summary>
    public NodeElement ContentNode { get; set; }

    /// <summary>
    /// Regular expression; body paragraphs that match it are excluded.
    /// </summary>
    public string ExcludeRegexStr { get; set; }
}
ChapterPage.cs
using NSoup.Nodes;
using NSoup.Select;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

/// <summary>
/// Describes how to locate a node in a parsed HTML document: a lookup strategy
/// (<see cref="SelectType"/>) plus the id, class name or selector to look for.
/// </summary>
public class NodeElement
{
    /// <summary>
    /// Lookup strategy used to find the node.
    /// </summary>
    public SelectType SelectType { get; set; }

    /// <summary>
    /// The id, class name or selector to look for, depending on <see cref="SelectType"/>.
    /// </summary>
    public string SelectName { get; set; }

    /// <summary>
    /// Finds the first node below <paramref name="root"/> that matches this rule.
    /// </summary>
    /// <param name="root">Element to search in.</param>
    /// <returns>The first matching element, or null when nothing matches.</returns>
    public Element GetElement(Element root)
    {
        switch (this.SelectType)
        {
            case SelectType.ById:
                return root.GetElementById(this.SelectName);

            case SelectType.ByClass:
                return FirstOrNull(root.GetElementsByClass(this.SelectName));

            // ByName and ByRegex have no dedicated branch and are handled like Select.
            case SelectType.Select:
            default:
                return FirstOrNull(root.Select(this.SelectName));
        }
    }

    /// <summary>
    /// Finds every node below <paramref name="root"/> that matches this rule.
    /// </summary>
    /// <param name="root">Element to search in.</param>
    /// <returns>
    /// The matching elements. For <see cref="SelectType.ById"/> the collection holds the single
    /// element with that id, or is empty when no such element exists.
    /// </returns>
    public Elements GetElements(Element root)
    {
        switch (this.SelectType)
        {
            case SelectType.ById:
                Elements single = new Elements();
                Element element = root.GetElementById(this.SelectName);
                // Only add a real hit; a null entry would crash callers that iterate the result.
                if (element != null)
                {
                    single.Insert(0, element);
                }
                return single;

            case SelectType.ByClass:
                return root.GetElementsByClass(this.SelectName);

            // ByName and ByRegex have no dedicated branch and are handled like Select.
            case SelectType.Select:
            default:
                return root.Select(this.SelectName);
        }
    }

    // Returns the first element of the collection, or null when it is null or empty.
    private static Element FirstOrNull(Elements found)
    {
        if (found != null && found.Count > 0)
        {
            return found[0];
        }
        return null;
    }
}
NodeElement.cs
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;


/// <summary>
/// Extracts text from an HTML fragment with a regular expression and optionally wraps each
/// extracted value in a format string.
/// </summary>
public class TextElement
{
    /// <summary>
    /// Regular expression whose whole match (group 0) is the extracted value.
    /// </summary>
    public string TextRegexStr { get; set; }

    /// <summary>
    /// Optional format string applied to every extracted value, with the value as {0}.
    /// Null or empty leaves the value unchanged.
    /// </summary>
    public string TextFormat { get; set; }

    /// <summary>
    /// Extracts the first match of <see cref="TextRegexStr"/> from <paramref name="str"/>.
    /// </summary>
    /// <param name="str">Text to search.</param>
    /// <returns>
    /// The (optionally formatted) first match. When the pattern does not match, the matched
    /// value is the empty string, so the result is "" or <see cref="TextFormat"/> applied to "".
    /// </returns>
    public string getText(string str)
    {
        Match hit = new Regex(this.TextRegexStr).Match(str);
        if (hit == null || hit.Groups.Count <= 0)
        {
            return null;
        }

        return Decorate(hit.Groups[0].Value);
    }

    /// <summary>
    /// Extracts every match of <see cref="TextRegexStr"/> from <paramref name="str"/>.
    /// </summary>
    /// <param name="str">Text to search.</param>
    /// <returns>The (optionally formatted) matches in order of appearance; empty when none.</returns>
    public List<string> getListText(string str)
    {
        return new Regex(this.TextRegexStr)
            .Matches(str)
            .Cast<Match>()
            .Select(hit => Decorate(hit.Groups[0].Value))
            .ToList();
    }

    // Applies TextFormat to a raw match when a format is configured.
    private string Decorate(string raw)
    {
        return String.IsNullOrEmpty(TextFormat) ? raw : string.Format(TextFormat, raw);
    }
}
TextElement.cs

网页下载类,每个人都有自己的下载类,可以按照自己的需求修改

using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text;

 
   /// <summary>
   /// Minimal HTTP page downloader.
   /// </summary>
   public class HtmlHelper
   {
       // Pause between attempts so a failing server is not hit in a tight loop.
       private const int RetryDelayMilliseconds = 500;

       /// <summary>
       /// Downloads the page at <paramref name="priceUrl"/> with a GET request and returns its
       /// body as text decoded with <see cref="Encoding.Default"/>.
       /// </summary>
       /// <param name="priceUrl">Absolute http/https URL of the page.</param>
       /// <returns>The response body.</returns>
       /// <remarks>
       /// Network failures (<see cref="WebException"/>, <see cref="System.IO.IOException"/>) are
       /// retried indefinitely, with a short pause between attempts. Any other exception - for
       /// example a malformed URL - is not retried, because repeating the call cannot succeed.
       /// Redirects are not followed.
       /// </remarks>
       public static string GetHtml(string priceUrl)
       {
           while (true)
           {
               try
               {
                   var request = (HttpWebRequest)WebRequest.Create(priceUrl);
                   request.Accept = "*/*";
                   request.ContentType = "text/html;charset=GBK";
                   request.AllowAutoRedirect = false;

                   request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36";
                   request.Timeout = 30 * 1000;
                   request.KeepAlive = false;
                   request.Method = "GET";

                   // The using blocks release the connection and the reader on every path,
                   // including when reading the body fails half-way.
                   using (var response = (HttpWebResponse)request.GetResponse())
                   using (var myreader = new System.IO.StreamReader(response.GetResponseStream(), Encoding.Default))
                   {
                       return myreader.ReadToEnd();
                   }
               }
               catch (WebException)
               {
                   // Timeout, connection failure or HTTP error status: try again after a pause.
                   System.Threading.Thread.Sleep(RetryDelayMilliseconds);
               }
               catch (System.IO.IOException)
               {
                   // The connection dropped while the body was being read: try again after a pause.
                   System.Threading.Thread.Sleep(RetryDelayMilliseconds);
               }
           }
       }
   }
HtmlHelper.cs

 小说txt文件生成类:

using NSoup.Nodes;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;

namespace bookDown
{
   /// <summary>
   /// Downloads a list of chapter pages with a pool of worker threads and writes the chapters,
   /// in list order, into a single text file.
   /// </summary>
   /// <remarks>
   /// Each worker downloads the chapters whose 1-based index modulo the thread count equals its
   /// own thread index, and parks the resulting text in a shared dictionary. One writer thread
   /// takes the chapters out of the dictionary strictly in order and appends them to the file.
   /// </remarks>
   public class DownloadToFile
   {
       private int threadCount;
       private List<string> urls;
       private string filePath;
       // Number of chapters already written to the file (0..indexMax). Written only by the
       // writer thread; read by isOver/getLog from other threads, hence volatile.
       private volatile int indexNum = 0;
       // Total number of chapters to download.
       private int indexMax = 0;
       // Downloaded chapter text keyed by 1-based chapter index, waiting for the writer thread.
       // The dictionary object is also the monitor used to hand chapters over.
       private Dictionary<int, string> dic;
       private ChapterPage chapterPage;
       private List<Thread> threads = new List<Thread>();

       /// <summary>
       /// Tells whether every chapter has been written and the output file has been closed.
       /// </summary>
       public bool isOver()
       {
           return indexNum == indexMax;
       }

       /// <summary>
       /// Returns a human-readable progress message.
       /// </summary>
       public string getLog()
       {
           int done = indexNum;
           if (done == indexMax && done != 0)
           {
               return "下载完成";
           }
           return "当前完成" + done + "个下载任务,还剩余" + (indexMax - done);
       }

       /// <summary>
       /// Creates a downloader. Call <see cref="Start"/> to begin.
       /// </summary>
       /// <param name="_filePath">Path of the text file to create.</param>
       /// <param name="_threadCount">
       /// Number of worker threads. Values above 100 are capped at 100; zero or negative values
       /// fall back to 50.
       /// </param>
       /// <param name="_urls">Chapter page URLs in reading order; null is treated as empty.</param>
       /// <param name="_chapterPage">Rules used to parse each chapter page.</param>
       public DownloadToFile(string _filePath, int _threadCount, List<string> _urls, ChapterPage _chapterPage)
       {
           if (_threadCount > 100)
           {
               threadCount = 100;
           }
           else if (_threadCount <= 0)
           {
               // Zero workers would leave the writer waiting forever.
               threadCount = 50;
           }
           else
           {
               threadCount = _threadCount;
           }

           urls = _urls ?? new List<string>();
           indexNum = 0;
           filePath = _filePath;
           indexMax = urls.Count;
           dic = new Dictionary<int, string>();
           chapterPage = _chapterPage;
       }

       /// <summary>
       /// Starts the worker threads and the writer thread, then returns immediately.
       /// </summary>
       public void Start()
       {
           for (int i = 0; i < threadCount; i++)
           {
               Thread thread = new Thread(GetText);
               thread.Name = i.ToString();
               thread.Start(i);
               threads.Add(thread);
           }

           Thread threadCreateBook = new Thread(CreateBookFile);
           threadCreateBook.Start(filePath);
       }

       /// <summary>
       /// Not implemented; calling it has no effect.
       /// </summary>
       public void Stop()
       {

       }

       // Writer thread body: appends chapters 1..indexMax to the file in order, blocking until
       // each one has been downloaded. obj is the output file path.
       private void CreateBookFile(object obj)
       {
           string _filePath = obj.ToString();

           // FileMode.Create truncates an existing file, so no stale tail survives.
           using (FileStream fs = new FileStream(_filePath, FileMode.Create))
           using (StreamWriter sw = new StreamWriter(fs, Encoding.Default))
           {
               for (int next = 1; next <= indexMax; next++)
               {
                   sw.Write(RemoveDic(next));

                   // Hold back the final increment until the file is closed (below), so that
                   // isOver() never reports completion while data is still buffered.
                   if (next < indexMax)
                   {
                       indexNum = next;
                   }
               }
           }

           // By now every chapter has been handed over, so every worker has finished its last
           // chapter and ends on its own; nothing needs to be aborted.
           indexNum = indexMax;
       }

       // Worker thread body: downloads every chapter whose 1-based index modulo threadCount
       // equals this worker's index. obj is the worker index (0..threadCount-1).
       private void GetText(object obj)
       {
           int threadIndex = (int)obj;

           // Build the exclusion pattern once per worker. A null or empty pattern means
           // "exclude nothing" (an empty regex would otherwise match, and drop, every line).
           Regex paichu = string.IsNullOrEmpty(chapterPage.ExcludeRegexStr)
               ? null
               : new Regex(chapterPage.ExcludeRegexStr);

           // Indices are 1-based, so worker 0 starts at threadCount (threadCount % threadCount == 0).
           int first = threadIndex == 0 ? threadCount : threadIndex;
           for (int index = first; index <= indexMax; index += threadCount)
           {
               string url = urls[index - 1];
               string text;
               try
               {
                   text = DownloadChapter(url, paichu);
               }
               catch (Exception)
               {
                   // Always hand something to the writer: a missing chapter would make it wait
                   // forever. Leave a marker in the book so the gap is visible.
                   text = "\r\n[下载失败] " + url + "\r\n";
               }
               AddDic(index, text);
           }
       }

       // Downloads one chapter page and renders it as plain text: the title on its own line,
       // followed by one tab-indented line per non-empty text node of the content element.
       private string DownloadChapter(string url, Regex paichu)
       {
           string html = HtmlHelper.GetHtml(url);
           NSoup.Nodes.Document document = NSoup.NSoupClient.Parse(html);
           Element body = document.Body;
           StringBuilder sb = new StringBuilder();

           // Title; GetChapterTitle returns null when the title node is missing.
           string title = chapterPage.GetChapterTitle(body);
           sb.Append("\r\n" + (title ?? string.Empty).Trim() + "\r\n");

           // Body text.
           Element content = chapterPage.ContentNode != null ? chapterPage.ContentNode.GetElement(body) : null;
           if (content == null)
           {
               return sb.ToString();
           }

           foreach (Node node in content.ChildNodes)
           {
               // Only direct text nodes carry the prose; other child nodes are skipped.
               if (node.NodeName != "#text")
               {
                   continue;
               }

               string text = node.OuterHtml().Replace("&nbsp;", "").Trim();
               if (string.IsNullOrEmpty(text))
               {
                   continue;
               }

               // Drop lines that match the site's exclusion pattern.
               if (paichu != null && paichu.IsMatch(text))
               {
                   continue;
               }

               sb.Append("\t" + text + "\r\n");
           }

           return sb.ToString();
       }

       // Returns the value of the named group after matching regex against str with all line
       // breaks removed.
       private string GetStr(Regex regex, string str, string key)
       {
           Match match = regex.Match(str.Replace("\r\n", "").Replace("\n", ""));
           string title = match.Groups[key].Value;
           return title;
       }

       // Stores a downloaded chapter and wakes the writer thread.
       private void AddDic(int index, string text)
       {
           lock (dic)
           {
               dic[index] = text;
               Monitor.PulseAll(dic);
           }
       }

       // Blocks until the chapter with the given index has been downloaded, then removes it
       // from the dictionary and returns its text.
       private string RemoveDic(int index)
       {
           lock (dic)
           {
               string text;
               while (!dic.TryGetValue(index, out text))
               {
                   // Releases the lock while waiting; AddDic pulses when a chapter arrives.
                   Monitor.Wait(dic);
               }
               dic.Remove(index);
               return text;
           }
       }
   }
}
DownloadToFile.cs

 

 

 

共享出该代码主要目的是,希望可以通过大家的力量丰富规则解析文件,可以适配更多的网站

 

posted @ 2018-06-14 14:54  -0.5拍  阅读(306)  评论(0)    收藏  举报