爬虫

  项目总览

  

  1,log4net

 1 <?xml version="1.0" encoding="utf-8"?>
 2 <log4net>
 3     <!-- Define some output appenders -->
 4     <appender name="rollingAppender" type="log4net.Appender.RollingFileAppender">
 5         <file value="log\log.txt" />
 6 
 7         <!--追加日志内容-->
 8         <appendToFile value="true" />
 9 
10         <!--使用最小锁定模型:仅在每次写入时获取文件锁,写完立即释放,便于其他进程同时访问该日志文件(写入速度会比独占锁慢)-->
11         <lockingModel type="log4net.Appender.FileAppender+MinimalLock" />
12 
13         <!--可以为:Once|Size|Date|Composite-->
14         <!--Composite为Size和Date的组合-->
15         <rollingStyle value="Composite" />
16 
17         <!--当备份文件时,为文件名加的后缀-->
18         <datePattern value="yyyyMMdd.TXT" />
19 
20         <!--日志最大个数,都是最新的-->
21         <!--rollingStyle节点为Size时,只能有value个日志-->
22         <!--rollingStyle节点为Composite时,每天有value个日志-->
23         <maxSizeRollBackups value="20" />
24 
25         <!--可用的单位:KB|MB|GB-->
26         <maximumFileSize value="3MB" />
27 
28         <!--置为true,当前最新日志文件名永远为file节中的名字-->
29         <staticLogFileName value="true" />
30 
31         <!--输出级别在INFO和FATAL之间的日志-->
32         <filter type="log4net.Filter.LevelRangeFilter">
33             <param name="LevelMin" value="INFO" />
34             <param name="LevelMax" value="FATAL" />
35         </filter>
36 
37         <layout type="log4net.Layout.PatternLayout">
38             <conversionPattern value="%date [%thread] %-5level %logger - %message%newline"/>
39         </layout>
40     </appender>
41 
42     <!-- levels: OFF > FATAL > ERROR > WARN > INFO > DEBUG  > ALL -->
43     <root>
44         <priority value="ALL"/>
45         <level value="ALL"/>
46         <appender-ref ref="rollingAppender" />
47     </root>
48 </log4net>
View Code
 /// <summary>
 /// Wrapper around a log4net <see cref="ILog"/> that echoes every message to the console
 /// before forwarding it to log4net.
 /// </summary>
 public class Logger
 {
     /// <summary>
     /// Configures log4net from CfgFiles\log4net.cfg.xml under the application base directory
     /// and writes one INFO entry announcing that the logging module has been initialized.
     /// </summary>
     static Logger()
     {
         string configPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "CfgFiles\\log4net.cfg.xml");
         XmlConfigurator.Configure(new FileInfo(configPath));
         ILog bootstrapLog = LogManager.GetLogger(typeof(Logger));
         bootstrapLog.Info("系统初始化Logger模块");
     }

     // Assigned once by the constructor; readonly so it cannot be replaced afterwards.
     private readonly ILog _log;

     /// <summary>
     /// Creates a logger whose log4net logger is obtained for <paramref name="type"/>.
     /// </summary>
     /// <param name="type">Type passed to <c>LogManager.GetLogger</c>.</param>
     public Logger(Type type)
     {
         _log = LogManager.GetLogger(type);
     }

     /// <summary>
     /// Writes <paramref name="msg"/> to the console and logs it at ERROR level together with <paramref name="ex"/>.
     /// </summary>
     /// <param name="msg">Message text; defaults to a generic "an exception occurred" message.</param>
     /// <param name="ex">Exception to attach to the log entry; may be null.</param>
     public void Error(string msg = "出现异常", Exception ex = null)
     {
         Console.WriteLine(msg);
         _log.Error(msg, ex);
     }

     /// <summary>
     /// Writes <paramref name="msg"/> to the console and logs it at WARN level.
     /// </summary>
     /// <param name="msg">Message text.</param>
     public void Warn(string msg)
     {
         Console.WriteLine(msg);
         _log.Warn(msg);
     }

     /// <summary>
     /// Writes <paramref name="msg"/> to the console and logs it at INFO level.
     /// </summary>
     /// <param name="msg">Message text.</param>
     public void Info(string msg)
     {
         Console.WriteLine(msg);
         _log.Info(msg);
     }

     /// <summary>
     /// Writes <paramref name="msg"/> to the console and logs it at DEBUG level.
     /// </summary>
     /// <param name="msg">Message text.</param>
     public void Debug(string msg)
     {
         Console.WriteLine(msg);
         _log.Debug(msg);
     }
 }
View Code

  2,系统配置项

 /// <summary>
 /// Application settings read from the appSettings section of the configuration file.
 /// The fields are readonly so the loaded values cannot be overwritten at runtime.
 /// </summary>
 public class Constant
 {
     /// <summary>
     /// Directory where data files are saved (appSettings key "DataPath").
     /// </summary>
     public static readonly string DataPath = ConfigurationManager.AppSettings["DataPath"];

     /// <summary>
     /// Entry URL of the JD category page (appSettings key "JDCategoryUrl").
     /// </summary>
     public static readonly string JDCategoryUrl = ConfigurationManager.AppSettings["JDCategoryUrl"];
 }
View Code

  app.config

 1 <?xml version="1.0" encoding="utf-8" ?>
 2 <configuration>
 3     <startup> 
 4         <supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5" />
 5     </startup>
 6   <appSettings>
 7     <add key="DataPath" value="D:\ruanmou\online9\20170711Advanced9Course16Crawler\Ruanmou.Crawler\Ruanmou.Crawler\bin\Debug\Data\"/>
 8     <add key="JDCategoryUrl" value="http://www.jd.com/allSort.aspx"/>
 9   </appSettings>
10   <connectionStrings>
11     <add name="mvc5" connectionString="Data Source=ElevenPC; Database=advanced9; User ID=sa; Password=Passw0rd; MultipleActiveResultSets=True" providerName="System.Data.SqlClient" />
12   </connectionStrings>
13 </configuration>
View Code

  3,Program

 /// <summary>
 /// Console entry point of the crawler demo. Topics listed by the original author:
 /// 1. crawlers and anti-crawler measures
 /// 2. downloading HTML
 /// 3. parsing HTML with XPath, extracting data and deep crawling
 /// 4. inconsistent attributes and fetching AJAX data
 /// 5. multi-threaded crawling
 /// </summary>
 class Program
 {
     private static Logger logger = new Logger(typeof(Program));

     /// <summary>
     /// Runs two hard-coded smoke tests (download one list page, crawl one category) and then
     /// the interactive crawl. Any exception is logged; the console is kept open until Enter is pressed.
     /// </summary>
     static void Main(string[] args)
     {
         try
         {
             Console.WriteLine("欢迎来到.net高级班vip课程,今天是Eleven老师为大家带来的爬虫的学习");

             // Smoke test: download a single JD list page.
             const string sampleListUrl = @"https://list.jd.com/list.html?cat=9987,653,655";
             string sampleListHtml = HttpHelper.DownloadHtml(sampleListUrl, Encoding.UTF8);

             // A category-page download test (Constant.JDCategoryUrl) was disabled in the original.

             // Smoke test: crawl the product list of one category described by a JSON literal.
             string sampleCategoryJson = "{\"Id\":73,\"Code\":\"02f01s01T\",\"ParentCode\":\"02f01s\",\"Name\":\"烟机/灶具\",\"Url\":\"http://list.jd.com/list.html?cat=737,13297,1300\",\"Level\":3}";
             Category sampleCategory = JsonConvert.DeserializeObject<Category>(sampleCategoryJson);
             ISearch sampleSearch = new CommoditySearch(sampleCategory);
             sampleSearch.Crawler();

             // Interactive crawl driven by console prompts.
             CrawlerCenter.Handler();
         }
         catch (Exception failure)
         {
             logger.Error("异常啦,", failure);
             Console.WriteLine("*****************木有成功**********************");
         }

         // Wait for Enter so the console window does not close immediately.
         Console.ReadLine();
     }
 }
View Code

  4,HttpHelper

 /// <summary>
 /// Helper for downloading page content over HTTP.
 /// Based on http://tool.sufeinet.com/HttpHelper.aspx
 /// </summary>
 public class HttpHelper
 {
     private static readonly Logger logger = new Logger(typeof(HttpHelper));

     /// <summary>
     /// Downloads the content at <paramref name="url"/> decoded as UTF-8
     /// (the original comment notes that GB2312 was used previously).
     /// </summary>
     /// <param name="url">Address to download.</param>
     /// <returns>Same as <see cref="DownloadHtml"/>.</returns>
     public static string DownloadUrl(string url)
     {
         return DownloadHtml(url, Encoding.UTF8);
     }

     /// <summary>
     /// Downloads the response body of <paramref name="url"/> with an HttpWebRequest that
     /// sends a desktop Chrome User-Agent and uses a 30 second timeout.
     /// </summary>
     /// <param name="url">Address to download.</param>
     /// <param name="encode">Encoding used to decode the response stream.</param>
     /// <returns>
     /// The decoded body on success; an empty string when the response status is not 200 OK;
     /// null when any exception occurs (every exception is logged, none is rethrown).
     /// </returns>
     public static string DownloadHtml(string url, Encoding encode)
     {
         string html = string.Empty;
         try
         {
             // Direct cast: a non-HTTP URL fails with a descriptive InvalidCastException
             // (handled below) instead of a NullReferenceException on the next line.
             HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
             request.Timeout = 30 * 1000; // 30 seconds
             request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36";
             request.ContentType = "text/html; charset=utf-8";

             using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
             {
                 if (response.StatusCode != HttpStatusCode.OK)
                 {
                     logger.Warn(string.Format("抓取{0}地址返回失败,response.StatusCode为{1}", url, response.StatusCode));
                 }
                 else
                 {
                     try
                     {
                         // using guarantees the reader is disposed even when ReadToEnd throws.
                         using (StreamReader reader = new StreamReader(response.GetResponseStream(), encode))
                         {
                             html = reader.ReadToEnd();
                         }
                     }
                     catch (Exception ex)
                     {
                         // Plain interpolation: wrapping it in string.Format would treat braces
                         // inside the URL as format items and throw FormatException.
                         logger.Error($"DownloadHtml抓取{url}失败", ex);
                         html = null;
                     }
                 }
             }
         }
         catch (WebException ex)
         {
             // Log every WebException. The previous check compared ex.Message with one localized
             // "306" text, so all other network failures were swallowed without a trace.
             logger.Error(string.Format("DownloadHtml抓取{0}出现WebException,Status为{1}", url, ex.Status), ex);
             html = null;
         }
         catch (Exception ex)
         {
             logger.Error(string.Format("DownloadHtml抓取{0}出现异常", url), ex);
             html = null;
         }
         return html;
     }
 }
View Code

  5,爬虫

  /// <summary>
  /// Interactive driver of the crawl: optionally rebuilds the category table and crawls categories,
  /// then optionally rebuilds the commodity tables, crawls commodities in parallel and removes duplicates.
  /// </summary>
  public class CrawlerCenter
  {
      private static readonly Logger logger = new Logger(typeof(CrawlerCenter));

      // Once more than this many tasks are tracked, the producer waits for one to finish.
      private const int MaxTrackedTasks = 15;

      // Commodity tables are named JD_Commodity_001 .. JD_Commodity_030.
      private const int CommodityTableCount = 30;

      /// <summary>
      /// Asks on the console whether to (re)initialize category data and commodity data,
      /// and runs the matching steps for every "Y" answer (case-insensitive).
      /// </summary>
      public static void Handler()
      {
          Console.WriteLine("请输入Y/N进行类别表初始化确认! Y 删除Category表然后重新创建,然后抓取类型数据,N(或者其他)跳过");
          if (IsYes(Console.ReadLine()))
          {
              DBInit.InitCategoryTable();
              CrawlerCategory();
          }
          else
          {
              Console.WriteLine("你选择不初始化类别数据");
          }
          Console.WriteLine("*****************^_^**********************");

          Console.WriteLine("请输入Y/N进行商品数据初始化确认! Y 删除全部商品表表然后重新创建,然后抓取商品数据,N(或者其他)跳过");
          if (IsYes(Console.ReadLine()))
          {
              DBInit.InitCommodityTable();
              CrawlerCommodity();
          }
          Console.WriteLine("*****************^_^**********************");
      }

      /// <summary>
      /// Returns true when the console answer is "Y" in any letter case.
      /// Console.ReadLine returns null when the input stream is closed; that counts as "no".
      /// </summary>
      private static bool IsYes(string input)
      {
          return string.Equals(input, "Y", StringComparison.OrdinalIgnoreCase);
      }

      /// <summary>
      /// Crawls the category data with a <c>CategorySearch</c>.
      /// </summary>
      private static void CrawlerCategory()
      {
          Console.WriteLine($"{DateTime.Now} jd商品类别开始抓取 - -");
          ISearch search = new CategorySearch();
          search.Crawler();
      }

      /// <summary>
      /// Crawls the commodities of every level-3 category, one task per category,
      /// waits for all tasks and then removes duplicate rows.
      /// </summary>
      private static void CrawlerCommodity()
      {
          Console.WriteLine($"{DateTime.Now} jd商品开始抓取 - -");
          CategoryRepository categoryRepository = new CategoryRepository();
          List<Category> categoryList = categoryRepository.QueryListByLevel(3);

          List<Task> taskList = new List<Task>();
          TaskFactory taskFactory = new TaskFactory();
          foreach (Category category in categoryList)
          {
              ISearch searcher = new CommoditySearch(category);
              taskList.Add(taskFactory.StartNew(searcher.Crawler));
              if (taskList.Count > MaxTrackedTasks)
              {
                  // IsCompleted is also true for canceled and faulted tasks,
                  // so one check drops every finished task from the list.
                  taskList.RemoveAll(t => t.IsCompleted);
                  if (taskList.Count > 0)
                  {
                      Task.WaitAny(taskList.ToArray());
                  }
              }
          }
          Task.WaitAll(taskList.ToArray());
          Console.WriteLine($"{DateTime.Now} jd商品抓取全部完成 - -");
          CleanAll();
      }

      /// <summary>
      /// Removes duplicate rows from every commodity table, keeping the row with the highest ID
      /// for each (productid, CategoryId) pair. Failures are logged and not rethrown.
      /// </summary>
      private static void CleanAll()
      {
          try
          {
              Console.WriteLine($"{DateTime.Now} 开始清理重复数据 - -");
              StringBuilder sb = new StringBuilder();
              for (int i = 1; i <= CommodityTableCount; i++)
              {
                  // Rank rows inside each (productid, CategoryId) group and delete all but the newest.
                  // The previous "productid IN (...) AND ID NOT IN (...)" form also deleted the only row
                  // of a product in one category when that product was duplicated in another category.
                  sb.AppendFormat(@"WITH Ranked AS (SELECT ROW_NUMBER() OVER (PARTITION BY productid, CategoryId ORDER BY ID DESC) AS rn FROM [dbo].[JD_Commodity_{0}])
                                DELETE FROM Ranked WHERE rn > 1;", i.ToString("000"));
              }
              Console.WriteLine("执行清理sql:{0}", sb.ToString());
              SqlHelper.ExecuteNonQuery(sb.ToString());
              Console.WriteLine("{0} 完成清理重复数据 - -", DateTime.Now);
          }
          catch (Exception ex)
          {
              logger.Error("CleanAll出现异常", ex);
          }
          finally
          {
              Console.WriteLine("{0} 结束清理重复数据 - -", DateTime.Now);
          }
      }
  }
View Code
  /// <summary>
  /// Contract shared by the crawl jobs in this project; the category and commodity
  /// searches are both used through this interface.
  /// </summary>
  public interface ISearch
  {
      /// <summary>
      /// Runs the crawl job.
      /// </summary>
      void Crawler();
  }
  5 
  6 
  7     /// <summary>
  8     /// 商品抓取
  9     /// http://www.w3school.com.cn/xpath/index.asp XPATH语法
 10     /// </summary>
 11     public class CommoditySearch : ISearch
 12     {
 // Logger bound to this class.
 private Logger logger = new Logger(typeof(CommoditySearch));

 // Repository used to record crawl warnings.
 private WarnRepository warnRepository = new WarnRepository();

 // Repository for commodity rows.
 private CommodityRepository commodityRepository = new CommodityRepository();

 // Category whose commodities this instance crawls; supplied through the constructor.
 private Category category = null;

 /// <summary>
 /// Creates a search for the commodities of the given category.
 /// </summary>
 /// <param name="_category">Category to crawl.</param>
 public CommoditySearch(Category _category)
 {
     this.category = _category;
 }
 22 
 /// <summary>
 /// Crawls every list page of the category: downloads the category URL, reads the page count
 /// from the node at //*[@id='J_topPage']/span/i and fetches each page's commodity list.
 /// Problems are recorded through the warn repository or the logger; no exception is rethrown.
 /// </summary>
 public void Crawler()
 {
     try
     {
         if (string.IsNullOrEmpty(category.Url))
         {
             warnRepository.SaveWarn(category, string.Format("Url为空,Name={0} Level={1} Url={2}", category.Name, category.CategoryLevel, category.Url));
             return;
         }

         string html = HttpHelper.DownloadUrl(category.Url);
         if (string.IsNullOrEmpty(html))
         {
             // DownloadUrl yields null or an empty string on failure; record that explicitly
             // instead of letting LoadHtml throw on null or silently finding no page node.
             warnRepository.SaveWarn(category, string.Format("下载内容为空,Name={0} Level={1} Url={2}", category.Name, category.CategoryLevel, category.Url));
             return;
         }

         HtmlDocument doc = new HtmlDocument();
         doc.LoadHtml(html);
         HtmlNode pageNumberNode = doc.DocumentNode.SelectSingleNode(@"//*[@id='J_topPage']/span/i");
         if (pageNumberNode == null)
         {
             warnRepository.SaveWarn(category, string.Format("分页数据为空,Name={0} Level={1} Url={2}", category.Name, category.CategoryLevel, category.Url));
             return;
         }

         // Parse the page count once instead of on every loop iteration.
         int pageCount;
         if (!int.TryParse(pageNumberNode.InnerText, out pageCount))
         {
             warnRepository.SaveWarn(category, string.Format("分页数据无效,Name={0} Level={1} Url={2} PageNumber={3}", category.Name, category.CategoryLevel, category.Url, pageNumberNode.InnerText));
             return;
         }

         for (int page = 1; page <= pageCount; page++)
         {
             string pageUrl = string.Format("{0}&page={1}", category.Url, page);
             // If the category URL already carries "&page=1&", point it at the current page as well.
             pageUrl = pageUrl.Replace("&page=1&", string.Format("&page={0}&", page));
             try
             {
                 // NOTE(review): the fetched list is discarded; the SaveList call below was already
                 // commented out in the original. Confirm whether persistence should be re-enabled.
                 GetCommodityList(category, pageUrl);
                 //commodityRepository.SaveList(commodityList);
             }
             catch (Exception ex) // a failure on one page must not stop the other pages
             {
                 logger.Error("Crawler的commodityRepository.SaveList(commodityList)出现异常", ex);
             }
         }
     }
     catch (Exception ex)
     {
         logger.Error("CrawlerMuti出现异常", ex);
         warnRepository.SaveWarn(category, string.Format("出现异常,Name={0} Level={1} Url={2}", category.Name, category.CategoryLevel, category.Url));
     }
 }
101 
/// <summary>
/// Downloads one list page and extracts a <c>Commodity</c> from every //*[@id='plist']/ul/li item
/// that has a link, a numeric product id, a title, an image and a price node; the collected
/// list is then passed to <c>GetCommodityPrice</c>.
/// </summary>
/// <param name="category">Category the page belongs to; its Id is copied to every commodity.</param>
/// <param name="url">Address of the list page.</param>
/// <returns>
/// The empty list when the page could not be downloaded or has no items; otherwise the result of
/// <c>GetCommodityPrice</c> for the items collected (possibly partial if an exception was logged).
/// </returns>
private List<Commodity> GetCommodityList(Category category, string url)
{
    string html = HttpHelper.DownloadUrl(url);
    List<Commodity> commodityList = new List<Commodity>();
    try
    {
        if (string.IsNullOrEmpty(html)) return commodityList;

        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);
        HtmlNodeCollection itemNodes = doc.DocumentNode.SelectNodes("//*[@id='plist']/ul/li");
        if (itemNodes == null || itemNodes.Count == 0)
        {
            warnRepository.SaveWarn(category, string.Format("GetCommodityList商品数据为空,Name={0} Level={1} category.Url={2} url={3}", category.Name, category.CategoryLevel, category.Url, url));
            return commodityList;
        }

        foreach (HtmlNode itemNode in itemNodes)
        {
            // Each item is loaded into its own document so the absolute XPaths below
            // only see this item's markup.
            HtmlDocument itemDoc = new HtmlDocument();
            itemDoc.LoadHtml(itemNode.OuterHtml);

            HtmlNode linkNode = itemDoc.DocumentNode.SelectSingleNode("//*[@class='p-name']/a");
            if (linkNode == null || !linkNode.Attributes.Contains("href"))
            {
                continue;
            }
            string itemUrl = EnsureHttpScheme(linkNode.Attributes["href"].Value);

            // The product id is the file name of the item URL without ".html".
            // An unparsable id skips this item instead of aborting the rest of the page.
            long productId;
            if (!long.TryParse(Path.GetFileName(itemUrl).Replace(".html", ""), out productId))
            {
                logger.Warn(string.Format("GetCommodityList无法解析商品Id,itemUrl={0} url={1}", itemUrl, url));
                continue;
            }

            HtmlNode titleNode = itemDoc.DocumentNode.SelectSingleNode("//*[@class='p-name']/a/em");
            if (titleNode == null)
            {
                continue;
            }

            HtmlNode imageNode = itemDoc.DocumentNode.SelectSingleNode("//*[@class='p-img']/a/img");
            if (imageNode == null)
            {
                continue;
            }

            // The image address appears under different attribute names from item to item.
            string imageUrl;
            if (imageNode.Attributes.Contains("src"))
                imageUrl = imageNode.Attributes["src"].Value;
            else if (imageNode.Attributes.Contains("original"))
                imageUrl = imageNode.Attributes["original"].Value;
            else if (imageNode.Attributes.Contains("data-lazy-img"))
                imageUrl = imageNode.Attributes["data-lazy-img"].Value;
            else
                continue;

            // Items without a price node are skipped; the price value itself is filled in later.
            HtmlNode priceNode = itemDoc.DocumentNode.SelectSingleNode("//*[@class='p-price']/strong/i");
            if (priceNode == null)
            {
                continue;
            }

            commodityList.Add(new Commodity()
            {
                CategoryId = category.Id,
                Url = itemUrl,
                ProductId = productId,
                Title = titleNode.InnerText,
                ImageUrl = EnsureHttpScheme(imageUrl)
            });
        }
        Console.WriteLine("{0}一共获取了{1}条数据", url, commodityList.Count);
    }
    catch (Exception ex)
    {
        logger.Error(string.Format("GetCommodityList出现异常,url={0}", url), ex);
    }
    return GetCommodityPrice(category, commodityList);
}

/// <summary>
/// Prefixes "http:" to a link that has no scheme (for example a protocol-relative "//host/path");
/// links that already start with "http:" or "https:" are returned unchanged.
/// </summary>
private static string EnsureHttpScheme(string link)
{
    bool hasScheme = link.StartsWith("http:", StringComparison.OrdinalIgnoreCase)
        || link.StartsWith("https:", StringComparison.OrdinalIgnoreCase);
    return hasScheme ? link : "http:" + link;
}
190 
191         /// <summary>
192         /// 获取商品价格
193         /// </summary>
194         /// <param name="commodityList"></param>
195         /// <returns></returns>
196         private List<Commodity> GetCommodityPrice(Category category, List<Commodity> commodityList)
197         {
198             try
199             {
200                 if (commodityList == null || commodityList.Count() == 0)
201                     return commodityList;
202 
203                 StringBuilder sb = new StringBuilder();
204                 //sb.Append(@"http://p.3.cn/prices/mgets?my=list_price&type=1&area=1_72_4137&skuIds=");
205                 //sb.Append(string.Join("%2C", commodityList.Select(c => string.Format("J_{0}", c.ProductId))));
206                 //
207                 sb.AppendFormat("http://p.3.cn/prices/mgets?callback=jQuery1069298&type=1&area=1_72_4137_0&skuIds={0}&pdbp=0&pdtk=&pdpin=&pduid=1945966343&_=1469022843655", string.Join("%2C", commodityList.Select(c => string.Format("J_{0}", c.ProductId))));
208                 string html = HttpHelper.DownloadUrl(sb.ToString());
209                 if (string.IsNullOrWhiteSpace(html))
210                 {
211                     logger.Warn(string.Format("获取url={0}时获取的html为空", sb.ToString()));
212                 }
213                 html = html.Substring(html.IndexOf("(") + 1);
214                 html = html.Substring(0, html.LastIndexOf(")"));
215                 List<CommodityPrice> priceList = JsonConvert.DeserializeObject<List<CommodityPrice>>(html);
216                 commodityList.ForEach(c => c.Price = priceList.FirstOrDefault(p => p.id.Equals(string.Format("J_{0}", c.ProductId))).p);
217                 //commodityList.ForEach(c => Console.WriteLine(" Title={0}  ImageUrl={1} Url={2} Price={3} Id={4}", c.Title, c.ImageUrl, c.Url, c.Price, c.Id));
218             }
219             catch (Exception ex)
220             {
221                 logger.Error("GetCommodityPrice出现异常", ex);
222             }
223             return commodityList;
224         }
225     }
226 
227     /// <summary>
228     /// http://www.w3school.com.cn/xpath/index.asp XPATH语法
229     /// </summary>
230     public class CategorySearch : ISearch
231     {
232         private static Logger logger = new Logger(typeof(CategorySearch));
233         private int _Count = 1;//每次都得new一个 重新初始化类别
234 
235         public void Crawler()
236         {
237             List<Category> categoryList = new List<Category>();
238             try
239             {
240                 string html = HttpHelper.DownloadUrl(Constant.JDCategoryUrl);
241 
242                 HtmlDocument doc = new HtmlDocument();
243                 doc.LoadHtml(html);
244                 string fristPath = "//*[@class='category-item m']";
245                 HtmlNodeCollection nodeList = doc.DocumentNode.SelectNodes(fristPath);
246                 int k = 1;
247                 foreach (HtmlNode node in nodeList)
248                 {
249                     categoryList.AddRange(this.First(node.InnerHtml, k++.ToString("00") + "f", "root"));
250                 }
251 
252                 CategoryRepository categoryRepository = new CategoryRepository();
253                 categoryRepository.Save(categoryList);
254             }
255             catch (Exception ex)
256             {
257                 logger.Error("CrawlerMuti出现异常", ex);
258             }
259             finally
260             {
261                 Console.WriteLine($"类型数据初始化完成,共抓取类别{ categoryList?.Count}个");
262             }
263         }
264 
265         /// <summary>
266         /// 对每一个一级类进行查找
267         /// </summary>
268         /// <param name="html"></param>
269         /// <param name="code"></param>
270         /// <param name="parentCode"></param>
271         /// <returns></returns>
272         private List<Category> First(string html, string code, string parentCode)
273         {
274             List<Category> categoryList = new List<Category>();
275             HtmlDocument doc = new HtmlDocument();
276             doc.LoadHtml(html);
277             string path = "//*[@class='mt']/h2/span";
278             HtmlNodeCollection nodeList = doc.DocumentNode.SelectNodes(path);
279             foreach (HtmlNode node in nodeList)
280             {
281                 Category category = new Category()
282                 {
283                     Id = _Count++,
284                     State = 0,
285                     CategoryLevel = 1,
286                     Code = code,
287                     ParentCode = parentCode
288                 };
289                 category.Name = node.InnerText;
290                 category.Url = "";// node.Attributes["href"].Value;
291                 categoryList.Add(category);
292             }
293             categoryList.AddRange(this.Second(html, code));
294             return categoryList;
295         }
296 
297         /// <summary>
298         /// 在一个一级类下面的全部二级类进行查找
299         /// </summary>
300         /// <param name="html"></param>
301         /// <param name="parentCode"></param>
302         /// <returns></returns>
303         private List<Category> Second(string html, string parentCode)
304         {
305             List<Category> categoryList = new List<Category>();
306             HtmlDocument doc = new HtmlDocument();
307             doc.LoadHtml(html);
308             string path = "//*[@class='items']/dl";
309             HtmlNodeCollection nodeList = doc.DocumentNode.SelectNodes(path);
310             int k = 1;
311             foreach (HtmlNode node in nodeList)
312             {
313                 string code = string.Format("{0}{1}s", parentCode, k.ToString("00"));
314                 string secondHtml = node.InnerHtml;
315                 if (string.IsNullOrWhiteSpace(secondHtml)) continue;
316                 HtmlDocument secondDoc = new HtmlDocument();
317                 secondDoc.LoadHtml(secondHtml);
318                 Category category = new Category()
319                 {
320                     Id = _Count++,
321                     State = 0,
322                     CategoryLevel = 2,
323                     Code = code,
324                     ParentCode = parentCode
325                 };
326 
327 
328                 HtmlNode secondNode = secondDoc.DocumentNode.SelectSingleNode("//dt/a");
329                 if (secondNode == null)//图书音像
330                 {
331                     secondNode = secondDoc.DocumentNode.SelectSingleNode("//dt");
332                 }
333                 category.Name = secondNode.InnerText;
334                 if (secondNode.Attributes["href"] != null)
335                 {
336                     category.Url = secondNode.Attributes["href"].Value;
337                     if (!category.Url.StartsWith("http:"))
338                     {
339                         category.Url = string.Concat("http:", category.Url);
340                     }
341                 }
342                 categoryList.Add(category);
343                 HtmlNode thirdNode = secondDoc.DocumentNode.SelectSingleNode("//dd");
344                 if (thirdNode == null) continue;
345                 categoryList.AddRange(this.Third(thirdNode.InnerHtml, code));
346                 k++;
347             }
348             return categoryList;
349         }
350 
351         /// <summary>
352         /// 在一个二级类下的全部三级类里面进行查找
353         /// </summary>
354         /// <param name="html"></param>
355         /// <param name="parentCode"></param>
356         /// <returns></returns>
357         private List<Category> Third(string html, string parentCode)
358         {
359             List<Category> categoryList = new List<Category>();
360             HtmlDocument doc = new HtmlDocument();
361             doc.LoadHtml(html);
362             string path = "//a";
363             HtmlNodeCollection nodeList = doc.DocumentNode.SelectNodes(path);
364             if (nodeList == null || nodeList.Count == 0) return categoryList;
365             int k = 1;
366             foreach (HtmlNode node in nodeList)
367             {
368                 string code = string.Format("{0}{1}t", parentCode, k.ToString("00"));
369                 Category category = new Category()
370                 {
371                     Id = _Count++,
372                     State = 0,
373                     CategoryLevel = 3,
374                     Code = code,
375                     ParentCode = parentCode
376                 };
377                 category.Name = node.InnerText;
378                 category.Url = node.Attributes["href"].Value;
379                 if (!category.Url.StartsWith("http:"))
380                 {
381                     category.Url = string.Concat("http:", category.Url);
382                 }
383                 categoryList.Add(category);
384                 k++;
385             }
386             return categoryList;
387         }
388     }
View Code

  6,Model

 1     public class BaseModel
 2     {
           // Common base of the model classes in this listing (Category, Commodity).

           /// <summary>Integer identifier shared by every derived model.</summary>
 3         public int Id { get; set; }
 4     }
 5 
 6     public class Category:BaseModel
 7     {
           // One node of the category tree built by CategorySearch.

           /// <summary>Hierarchical code of this category, e.g. "01f" for a first-level entry.</summary>
 8         public string Code { get; set; }
           /// <summary>Code of the parent category; first-level categories use "root".</summary>
 9         public string ParentCode { get; set; }
           /// <summary>Category name, taken from the inner text of the source HTML node.</summary>
10         public string Name { get; set; }
           /// <summary>Category page URL; empty or null when the source node has no href.</summary>
11         public string Url { get; set; }
           /// <summary>Depth in the category tree: 1, 2 or 3.</summary>
12         public int CategoryLevel { get; set; }
           /// <summary>
           /// Status value; CategorySearch initialises it to 0.
           /// NOTE(review): the meaning of other values is not visible in this listing - confirm.
           /// </summary>
13         public int State { get; set; }
14     }
15 
16     public class Commodity : BaseModel
17     {
           // One crawled product.

           /// <summary>Numeric product id; the crawler parses it from the file name of the product URL.</summary>
18         public long ProductId { get; set; }
           /// <summary>NOTE(review): presumably the Id of the owning Category - confirm against the crawler.</summary>
19         public int CategoryId { get; set; }
           /// <summary>Product title text.</summary>
20         public string Title { get; set; }
           /// <summary>Price, filled from CommodityPrice.p after the price request.</summary>
21         public decimal Price { get; set; }
           /// <summary>Product page URL.</summary>
22         public string Url { get; set; }
           /// <summary>Product image URL.</summary>
23         public string ImageUrl { get; set; }
24     }
25 
26 
27     //jQuery5427073([{"id":"J_1707419","p":"5149.00","m":"5499.00"},{"id":"J_1589214","p":"1999.00","m":"2999.00"},{"id":"J_1546310","p":"3999.00","m":"4999.00"},{"id":"J_1510479","p":"2999.00","m":"3569.00"},{"id":"J_1707420","p":"4149.00","m":"4499.00"},{"id":"J_1770620","p":"2099.00","m":"2499.00"},{"id":"J_1258277","p":"2699.00","m":"3299.00"},{"id":"J_1707423","p":"4599.00","m":"4705.00"},{"id":"J_1252778","p":"3099.00","m":"4199.00"},{"id":"J_1553732","p":"3298.00","m":"4598.00"},{"id":"J_1576022","p":"2999.00","m":"3999.00"},{"id":"J_1420120","p":"1999.00","m":"2899.00"},{"id":"J_647948","p":"1299.00","m":"1698.00"},{"id":"J_1044476","p":"1999.00","m":"2999.00"},{"id":"J_1376591","p":"1299.00","m":"1599.00"},{"id":"J_1416294","p":"4599.00","m":"5898.00"},{"id":"J_1455427","p":"1499.00","m":"1999.00"},{"id":"J_1253502","p":"2799.00","m":"3999.00"},{"id":"J_1553624","p":"2998.00","m":"4398.00"},{"id":"J_1301951","p":"2279.00","m":"3999.00"},{"id":"J_1115374","p":"2499.00","m":"4299.00"},{"id":"J_671315","p":"1999.00","m":"2898.00"},{"id":"J_1283945","p":"3099.00","m":"4199.00"},{"id":"J_1283940","p":"2499.00","m":"2999.00"},{"id":"J_1027317","p":"2799.00","m":"5999.00"},{"id":"J_1314962","p":"3699.00","m":"5199.00"},{"id":"J_1565150","p":"4068.00","m":"5727.00"},{"id":"J_1565175","p":"3788.00","m":"5377.00"},{"id":"J_1565182","p":"3938.00","m":"5757.00"},{"id":"J_1209084","p":"3599.00","m":"4999.00"}]);
28     /// <summary>
29     /// DTO for one entry of the JSONP price response shown in the sample above.
30     /// </summary>
31     public class CommodityPrice
32     {
           // Property names are lower-case on purpose: they match the JSON keys "id", "p" and "m".

           /// <summary>SKU key in the form "J_" + product id, e.g. "J_1707419".</summary>
33         public string id { get; set; }
           /// <summary>Price; this is the value copied into Commodity.Price.</summary>
34         public decimal p { get; set; }
           /// <summary>Second price field of the response. NOTE(review): presumably the market/list price - confirm.</summary>
35         public decimal m { get; set; }
36     }
View Code

  7,DataService

  (1)SqlHelper

  1     public class SqlHelper
  2     {
  3         private static Logger logger = new Logger(typeof(SqlHelper));
  4         private static string _ConnStr = ConfigurationManager.ConnectionStrings["mvc5"].ConnectionString;
  5 
  6         /// <summary>
  7         /// 事务执行
  8         /// </summary>
  9         /// <param name="sql"></param>
 10         public static void ExecuteNonQuery(string sql)
 11         {
 12             using (SqlConnection sqlConn = new SqlConnection(_ConnStr))
 13             {
 14                 sqlConn.Open();
 15                 SqlCommand cmd = new SqlCommand(sql, sqlConn);
 16                 cmd.ExecuteNonQuery();//.ExecuteNonQueryAsync();//
 17             }
 18         }
 19 
 20         public static void ExecuteNonQueryWithTrans(string sql)
 21         {
 22             SqlTransaction trans = null;
 23             try
 24             {
 25                 using (SqlConnection sqlConn = new SqlConnection(_ConnStr))
 26                 {
 27                     sqlConn.Open();
 28                     trans = sqlConn.BeginTransaction();
 29                     SqlCommand cmd = new SqlCommand(sql, sqlConn, trans);
 30                     cmd.ExecuteNonQuery();//.ExecuteNonQueryAsync();//
 31                     trans.Commit();
 32                 }
 33             }
 34             catch (Exception ex)
 35             {
 36                 //logger.Error(string.Format("ExecuteNonQueryWithTrans出现异常,sql={0}", sql), ex);
 37                 if (trans != null && trans.Connection != null)
 38                     trans.Rollback();
 39                 throw ex;
 40             }
 41             finally
 42             {
 43             }
 44         }
 45 
 46         public static List<T> QueryList<T>(string sql) where T : new()
 47         {
 48             using (SqlConnection sqlConn = new SqlConnection(_ConnStr))
 49             {
 50                 sqlConn.Open();
 51                 SqlCommand cmd = new SqlCommand(sql, sqlConn);
 52                 return TransList<T>(cmd.ExecuteReader());
 53             }
 54         }
 55 
 56         public static void Insert<T>(T model, string tableName) where T : new()
 57         {
 58             string sql = GetInsertSql<T>(model, tableName);
 59             ExecuteNonQuery(sql);
 60         }
 61 
 62         public static void InsertList<T>(List<T> list, string tableName) where T : new()
 63         {
 64             string sql = string.Join(" ", list.Select(t => GetInsertSql<T>(t, tableName)));
 65             ExecuteNonQuery(sql);
 66         }
 67 
 68         #region Private
 69         private static string GetInsertSql<T>(T model, string tableName)
 70         {
 71             StringBuilder sbSql = new StringBuilder();
 72 
 73             StringBuilder sbFields = new StringBuilder();
 74             StringBuilder sbValues = new StringBuilder();
 75 
 76             Type type = model.GetType();
 77             var properties = type.GetProperties();
 78             foreach (PropertyInfo p in properties)
 79             {
 80                 string name = p.Name;
 81                 if (!name.Equals("id", StringComparison.OrdinalIgnoreCase))
 82                 {
 83                     sbFields.AppendFormat("[{0}],", name);
 84                     string sValue = null;
 85                     object oValue = p.GetValue(model);
 86                     if (oValue != null)
 87                         sValue = oValue.ToString().Replace("'", "");
 88                     sbValues.AppendFormat("'{0}',", sValue);
 89                 }
 90             }
 91             sbSql.AppendFormat("INSERT INTO {0} ({1}) VALUES ({2});", tableName, sbFields.ToString().TrimEnd(','), sbValues.ToString().TrimEnd(','));
 92             return sbSql.ToString();
 93         }
 94 
 95         private static List<T> TransList<T>(SqlDataReader reader) where T : new()
 96         {
 97             List<T> tList = new List<T>();
 98             Type type = typeof(T);
 99             var properties = type.GetProperties();
100             if (reader.Read())
101             {
102                 do
103                 {
104                     T t = new T();
105                     foreach (PropertyInfo p in properties)
106                     {
107                         p.SetValue(t, Convert.ChangeType(reader[p.Name], p.PropertyType));
108                     }
109                     tList.Add(t);
110                 }
111                 while (reader.Read());
112             }
113             return tList;
114         }
115 
116         private static T TransModel<T>(SqlDataReader reader) where T : new()
117         {
118             T t = new T();
119             if (reader.Read())
120             {
121                 do
122                 {
123                     Type type = typeof(T);
124                     var properties = type.GetProperties();
125                     foreach (PropertyInfo p in properties)
126                     {
127                         p.SetValue(t, Convert.ChangeType(reader[p.Name], p.PropertyType));
128                     }
129                 }
130                 while (reader.Read());
131             }
132             return t;
133         }
134         #endregion Private
135     }
View Code
  1 namespace Ruanmou.Crawler.DataService
  2 {
  3     /// <summary>
  4     /// 数据库结构初始化
  5     /// 改进下:直接判断表是否存在,而不是等着异常
  6     /// </summary>
  7     public class DBInit
  8     {
  9         private static Logger logger = new Logger(typeof(DBInit));
 10 
 11         /// <summary>
 12         /// 谨慎使用  会全部删除数据库并重新创建!
 13         /// </summary>
 14         public static void InitCommodityTable()
 15         {
 16             #region Delete
 17             try
 18             {
 19                 StringBuilder sb = new StringBuilder();
 20                 for (int i = 1; i < 31; i++)
 21                 {
 22                     sb.AppendFormat("DROP TABLE [dbo].[JD_Commodity_{0}];", i.ToString("000"));
 23                 }
 24                 SqlHelper.ExecuteNonQuery(sb.ToString());
 25             }
 26             catch (Exception ex)
 27             {
 28                 if (ex.Message.Contains("因为它不存在,或者您没有所需的权限。"))
 29                 {
 30                     logger.Warn("初始化数据库InitCommodityTable删除的时候,原表不存在");
 31                 }
 32                 else
 33                 {
 34                     logger.Error("初始化数据库InitCommodityTable失败", ex);
 35                     throw ex;
 36                 }
 37             }
 38             #endregion Delete
 39 
 40             #region Create
 41             try
 42             {
 43                 StringBuilder sb = new StringBuilder();
 44                 for (int i = 1; i < 31; i++)
 45                 {
 46                     sb.AppendFormat(@"CREATE TABLE [dbo].[JD_Commodity_{0}](
 47                                         [Id] [int] IDENTITY(1,1) NOT NULL,
 48                                         [ProductId] [bigint] NULL,
 49                                         [CategoryId] [int] NULL,
 50                                         [Title] [nvarchar](500) NULL,
 51                                         [Price] [decimal](18, 2) NULL,
 52                                         [Url] [varchar](1000) NULL,
 53                                         [ImageUrl] [varchar](1000) NULL,
 54                              CONSTRAINT [PK_JD_Commodity_{0}] PRIMARY KEY CLUSTERED 
 55                             (
 56                                 [Id] ASC
 57                             )WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
 58                             ) ON [PRIMARY];", i.ToString("000"));
 59                 }
 60                 SqlHelper.ExecuteNonQuery(sb.ToString());
 61             }
 62             catch (Exception ex)
 63             {
 64                 logger.Error("InitCommodityTable创建异常", ex);
 65                 throw ex;
 66             }
 67             #endregion Create
 68         }
 69 
 70         /// <summary>
 71         /// 谨慎使用  会全部删除数据库并重新创建!
 72         /// </summary>
 73         public static void InitCategoryTable()
 74         {
 75             #region Delete
 76             try
 77             {
 78                 StringBuilder sb = new StringBuilder();
 79                 sb.AppendFormat("DROP TABLE [dbo].[Category];");
 80                 SqlHelper.ExecuteNonQuery(sb.ToString());
 81             }
 82             catch (Exception ex)
 83             {
 84                 if (ex.Message.Equals("无法对 表 'dbo.Category' 执行 删除,因为它不存在,或者您没有所需的权限。"))
 85                 {
 86                     logger.Warn("初始化数据库InitCategoryTable删除的时候,原表不存在");
 87                 }
 88                 else
 89                 {
 90                     logger.Error("初始化数据库InitCategoryTable失败", ex);
 91                     throw ex;
 92                 }
 93             }
 94             #endregion Delete
 95 
 96             #region Create
 97             try
 98             {
 99                 StringBuilder sb = new StringBuilder();
100                 sb.AppendFormat(@"CREATE TABLE [dbo].[Category](
101                                         [Id] [int] IDENTITY(1,1) NOT NULL,
102                                         [Code] [varchar](100) NULL,
103                                         [ParentCode] [varchar](100) NULL,
104                                         [CategoryLevel] [int] NULL,
105                                         [Name] [nvarchar](50) NULL,
106                                         [Url] [varchar](1000) NULL,
107                                         [State] [int] NULL,
108                                       CONSTRAINT [PK_Category] PRIMARY KEY CLUSTERED 
109                                      (
110                                          [Id] ASC
111                                      )WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
112                                      ) ON [PRIMARY];");
113 
114                 SqlHelper.ExecuteNonQuery(sb.ToString());
115             }
116             catch (Exception ex)
117             {
118                 logger.Error("初始化数据库InitCategoryTable 创建失败", ex);
119                 throw ex;
120             }
121             #endregion Create
122 
123         }
124     }
125 }
View Code
  1     public interface IRepository<T> where T : class//, new()
  2     {
            // Save contract for entities of type T.
            // NOTE(review): the repository classes in this listing name this interface only in a
            // commented-out base list, so nothing shown here implements it - confirm whether that is intended.

            /// <summary>Saves a single entity.</summary>
  3         void Save(T entity);
            /// <summary>Saves a list of entities.</summary>
  4         void SaveList(List<T> entity);
  5     }
  6 
  7     public class CommodityRepository //: IRepository<Commodity>
  8     {
  9         private Logger logger = new Logger(typeof(CommodityRepository));
 10 
 11         public void SaveList(List<Commodity> commodityList)
 12         {
 13             if (commodityList == null || commodityList.Count == 0) return;
 14             IEnumerable<IGrouping<string, Commodity>> group = commodityList.GroupBy<Commodity, string>(c => GetTableName(c));
 15 
 16             foreach (var data in group)
 17             {
 18                 SqlHelper.InsertList<Commodity>(data.ToList(), data.Key);
 19             }
 20         }
 21 
 22         private string GetTableName(Commodity commodity)
 23         {
 24             return string.Format("JD_Commodity_{0}", (commodity.ProductId % 30 + 1).ToString("000"));
 25         }
 26 
 27         /// <summary>
 28         /// 保存文本记录
 29         /// </summary>
 30         /// <param name="commodityList"></param>
 31         /// <param name="category"></param>
 32         /// <param name="page"></param>
 33         public void SaveList(List<Commodity> commodityList, Category category, int page)
 34         {
 35             StreamWriter sw = null;
 36             try
 37             {
 38                 string recordFileName = string.Format($"{category.CategoryLevel}/{category.ParentCode}/{category.Id}/{page}.txt");
 39                 string totolPath = Path.Combine(Constant.DataPath, recordFileName);
 40                 if (!Directory.Exists(Path.GetDirectoryName(totolPath)))
 41                 {
 42                     Directory.CreateDirectory(Path.GetDirectoryName(totolPath));
 43                     sw = File.CreateText(totolPath);
 44                 }
 45                 else
 46                 {
 47                     sw = File.AppendText(totolPath);
 48                 }
 49                 sw.WriteLine(JsonConvert.SerializeObject(commodityList));
 50             }
 51             catch (Exception e)
 52             {
 53                 logger.Error("CommodityRepository.SaveList出现异常", e);
 54             }
 55             finally
 56             {
 57                 if (sw != null)
 58                 {
 59                     sw.Flush();
 60                     sw.Close();
 61                     sw.Dispose();
 62                 }
 63             }
 64         }
 65     }
 66 
 67 
 68     public class CategoryRepository //: IRepository<Commodity>
 69     {
 70         private Logger logger = new Logger(typeof(CategoryRepository));
 71 
 72         public void Save(List<Category> categoryList)
 73         {
 74             SqlHelper.InsertList<Category>(categoryList, "Category");
 75             new Action<List<Category>>(SaveList).BeginInvoke(categoryList, null, null);
 76         }
 77 
 78         /// <summary>
 79         /// 根据Level获取类别列表
 80         /// </summary>
 81         /// <param name="level"></param>
 82         /// <returns></returns>
 83         public List<Category> QueryListByLevel(int level)
 84         {
 85             string sql = string.Format("SELECT * FROM category WHERE categorylevel={0};", level);
 86             return SqlHelper.QueryList<Category>(sql);
 87         }
 88 
 89 
 90         /// <summary>
 91         /// 存文本记录的
 92         /// </summary>
 93         /// <param name="categoryList"></param>
 94         public void SaveList(List<Category> categoryList)
 95         {
 96             StreamWriter sw = null;
 97             try
 98             {
 99                 string recordFileName = string.Format("{0}_Category.txt", DateTime.Now.ToString("yyyyMMddHHmmss"));
100                 string totolPath = Path.Combine(Constant.DataPath, recordFileName);
101                 if (!Directory.Exists(Path.GetDirectoryName(totolPath)))
102                 {
103                     Directory.CreateDirectory(Path.GetDirectoryName(totolPath));
104                     sw = File.CreateText(totolPath);
105                 }
106                 else
107                 {
108                     sw = File.AppendText(totolPath);
109                 }
110 
111                 sw.WriteLine(JsonConvert.SerializeObject(categoryList));
112             }
113             catch (Exception e)
114             {
115                 logger.Error("CategoryRepository.SaveList出现异常", e);
116             }
117             finally
118             {
119                 if (sw != null)
120                 {
121                     sw.Flush();
122                     sw.Close();
123                     sw.Dispose();
124                 }
125             }
126         }
127     }
128 
129     public class WarnRepository //: IRepository<Commodity>
130     {
131         private Logger logger = new Logger(typeof(WarnRepository));
132         public void SaveWarn(Category category, string msg)
133         {
134             StreamWriter sw = null;
135             try
136             {
137                 string recordFileName = string.Format("warn/{0}/{1}/{2}.txt", category.CategoryLevel, category.ParentCode, category.Id);
138                 string totolPath = Path.Combine(Constant.DataPath, recordFileName);
139                 if (!Directory.Exists(Path.GetDirectoryName(totolPath)))
140                 {
141                     Directory.CreateDirectory(Path.GetDirectoryName(totolPath));
142                     sw = File.CreateText(totolPath);
143                 }
144                 else
145                 {
146                     sw = File.AppendText(totolPath);
147                 }
148                 sw.WriteLine(msg);
149                 sw.WriteLine(JsonConvert.SerializeObject(JsonConvert.SerializeObject(category)));
150             }
151             catch (Exception e)
152             {
153                 logger.Error("SaveWarn出现异常", e);
154             }
155             finally
156             {
157                 if (sw != null)
158                 {
159                     sw.Flush();
160                     sw.Close();
161                     sw.Dispose();
162                 }
163             }
164         }
165     }
View Code

   京东爬虫使用说明:

  • 配置app.config的DataPath(这个是文本数据存储的地址);
  • 创建一个SQL Server数据库,配置app.config的数据库连接;
  • 运行项目,可以直接vs运行,或者使用Crawler\bin\Debug下面的Crawler.exe;
  • 数据库表结构是自动创建的,控制台需要输入Y才能开始初始化数据库结构,然后进行数据抓取;
  • 看看控制台有无提示异常,看看数据库的数据即可。

类别大概是1300+ 商品是800W+

posted @ 2020-04-28 11:01  孝文  阅读(155)  评论(0)    收藏  举报