1 package com.mieba.spider;
 2 
 3 import java.util.ArrayList;
 4 import java.util.List;
 5 import java.util.Vector;
 6 
 7 import us.codecraft.webmagic.Page;
 8 import us.codecraft.webmagic.Site;
 9 import us.codecraft.webmagic.processor.PageProcessor;
10 import us.codecraft.webmagic.selector.Html;
11 
12 public class WanhoPageProcessor implements PageProcessor
13 {
14 
15     private Site site = Site
16             .me()
17             .setTimeOut(10000)
18             .setRetryTimes(3)
19             .setSleepTime(1000)
20             .setCharset("UTF-8");
21 
22     @Override
23     public Site getSite()
24     {
25         // TODO Auto-generated method stub
26         return site;
27     }
28 
29     @Override
30     public void process(Page page)
31     {
32         // TODO Auto-generated method stub
33         //获取当前页的所有喜报
34          List<String> list = page.getHtml().xpath("//div[@class='main_l']/ul/li").all();
35         //要保存喜报的集合
36         Vector<ArticleVo> voLst = new Vector<>();
37       //遍历喜报
38         String title;
39         String content;
40         String img;
41         for (String item : list) 
42         {
43             Html tmp = Html.create(item);
44             //标题
45             title = tmp.xpath("//div[@class='content']/h4/a/text()").toString();
46             //内容
47             content = tmp.xpath("//div[@class='content']/p/text()").toString();
48             //图片路径
49             img = tmp.xpath("//a/img/@src").toString();
50             //加入集合
51             ArticleVo vo = new ArticleVo(title, content, img);
52             voLst.add(vo);
53         }
54       //保存数据至page中,后续进行持久化
55         page.putField("e_list", voLst);
56       //加载其它页
57         page.addTargetRequests( getOtherUrls());
58     }
59     
60     
61     //其它页
62     public List<String> getOtherUrls()
63     {
64          List<String> urlLsts = new ArrayList<>();
65          for(int i=2;i<7;i++){
66              urlLsts.add("http://www.wanho.net/a/jyxb/list_15_"+i+".html");
67          }
68         return urlLsts;
69     }
70 
71 }
  1 package com.mieba.spider;
  2 
  3 import java.io.BufferedInputStream;
  4 import java.io.BufferedOutputStream;
  5 import java.io.File;
  6 import java.io.FileNotFoundException;
  7 import java.io.FileOutputStream;
  8 import java.io.FileWriter;
  9 import java.io.IOException;
 10 import java.io.InputStream;
 11 import java.io.PrintWriter;
 12 import java.net.MalformedURLException;
 13 import java.net.URL;
 14 import java.net.URLConnection;
 15 import java.util.Vector;
 16 
 17 import us.codecraft.webmagic.ResultItems;
 18 import us.codecraft.webmagic.Task;
 19 import us.codecraft.webmagic.pipeline.Pipeline;
 20 
 21 public class WanhoPipeline implements Pipeline
 22 {
 23 
 24     @Override
 25     public void process(ResultItems resultItems, Task arg1)
 26     {
 27         // TODO Auto-generated method stub
 28         // 获取抓取过程中保存的数据
 29         Vector<ArticleVo> voLst = resultItems.get("e_list");
 30         // 持久到文件中
 31         PrintWriter pw = null;
 32         try
 33         {
 34             pw = new PrintWriter(new FileWriter("wanho.txt", true));
 35             for (ArticleVo vo : voLst)
 36             {
 37                 pw.println(vo);
 38                 pw.flush();
 39                 saveImg(vo.getImg());
 40             }
 41         } catch (FileNotFoundException e)
 42         {
 43             e.printStackTrace();
 44         } catch (IOException e)
 45         {
 46             e.printStackTrace();
 47         } finally
 48         {
 49             pw.close();
 50         }
 51     }
 52 
 53     private void saveImg(String img)
 54     {
 55         // TODO Auto-generated method stub
 56         String imgUrl = "http://www.wanho.net" + img;
 57         InputStream is = null;
 58         BufferedInputStream bis = null;
 59         BufferedOutputStream bos = null;
 60         try
 61         {
 62             URL url = new URL(imgUrl);
 63             URLConnection uc = url.openConnection();
 64             is = uc.getInputStream();
 65             bis = new BufferedInputStream(is);
 66             File photoFile = new File("photo");
 67             if (!photoFile.exists())
 68             {
 69                 photoFile.mkdirs();
 70             }
 71             String imgName = img.substring(img.lastIndexOf("/") + 1);
 72             File saveFile = new File(photoFile, imgName);
 73             bos = new BufferedOutputStream(new FileOutputStream(saveFile));
 74             byte[] bs = new byte[1024];
 75             int len;
 76             while ((len = bis.read(bs)) != -1)
 77             {
 78                 bos.write(bs, 0, len);
 79             }
 80 
 81         } catch (MalformedURLException e)
 82         {
 83             // TODO: handle exception
 84             e.printStackTrace();
 85         } catch (IOException e)
 86         {
 87             e.printStackTrace();
 88         } finally
 89         {
 90             try
 91             {
 92                 bos.close();
 93             } catch (IOException e)
 94             {
 95                 e.printStackTrace();
 96             }
 97             try
 98             {
 99                 bis.close();
100             } catch (IOException e)
101             {
102                 e.printStackTrace();
103             }
104             try
105             {
106                 is.close();
107             } catch (IOException e)
108             {
109                 e.printStackTrace();
110             }
111 
112         }
113     }
114 
115 }
 1 package com.mieba.spider;
 2 
 3 public class ArticleVo
 4 {
 5     private String title;
 6     private String content;
 7     private String img;
 8     public String getTitle()
 9     {
10         return title;
11     }
12     public void setTitle(String title)
13     {
14         this.title = title;
15     }
16     public String getContent()
17     {
18         return content;
19     }
20     public void setContent(String content)
21     {
22         this.content = content;
23     }
24     public String getImg()
25     {
26         return img;
27     }
28     public void setImg(String img)
29     {
30         this.img = img;
31     }
32     public ArticleVo(String title, String content, String img)
33     {
34         super();
35         this.title = title;
36         this.content = content;
37         this.img = img;
38     }
39     @Override
40     public String toString()
41     {
42         return "ArticleVo [title=" + title + ", content=" + content + ", img=" + img + "]";
43     }
44     
45     
46 }
package com.mieba.spider;

import us.codecraft.webmagic.Spider;

/**
 * Command-line entry point that wires the page processor and the pipeline
 * into a WebMagic {@link Spider} and runs it.
 */
public class Demo
{
    /**
     * Configures the spider and runs the crawl.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args)
    {
        // Extraction logic applied to every downloaded page.
        WanhoPageProcessor processor = new WanhoPageProcessor();
        // Destination for the extracted results.
        WanhoPipeline pipeline = new WanhoPipeline();

        Spider spider = Spider.create(processor)
                .addPipeline(pipeline)
                // First page to crawl.
                .addUrl("http://www.wanho.net/a/jyxb/")
                // Number of threads to use.
                .thread(5);

        // Start crawling.
        spider.run();
    }
}


        

爬取到的照片

 

 爬取到的简报

 

 大家如果要使用代码,配置webmagic的依赖包即可使用。

posted on 2020-02-13 15:01  生长的力量  阅读(228)  评论(0)  编辑  收藏  举报