webmagic爬虫简单应用

/**
 * WebMagic {@link PageProcessor} that walks a paginated news list and the detail pages it
 * links to.
 *
 * <p>Pages whose URL matches {@link #DETAIL_URL_REGEX} are treated as detail pages: their
 * title and body are stored on the page under the keys {@code newTitle} and
 * {@code newContext}. Every other page is treated as a list page: each row is printed and the
 * detail-page and pager links found on it are queued as further requests.
 */
public class TestSpider implements PageProcessor {

    /** Detail-page URL pattern; the host differs between detail pages, hence the leading {@code .*}. */
    private static final String DETAIL_URL_REGEX = ".*/a/[0-9]+\\.html";

    /** Crawl settings: 3 retries, a sleep time of 100 and a desktop Chrome user agent. */
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(100)
            .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36");

    /**
     * Number of pages handed to {@link #process(Page)} so far (list pages and detail pages
     * alike). Incremented without synchronization; {@link #main(String[])} runs the spider
     * with a single thread.
     */
    public static int count;

    /**
     * Returns the crawl settings used for every request.
     *
     * @return the site configuration built when this processor was created
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Dispatches a downloaded page to the detail-page or list-page handler based on its URL,
     * then bumps {@link #count}.
     *
     * @param page the downloaded page to extract data and follow-up links from
     */
    @Override
    public void process(Page page) {
        if (page.getUrl().regex(DETAIL_URL_REGEX).match()) {
            processDetailPage(page);
        } else {
            processListPage(page);
        }

        count++;
    }

    /** Extracts the headline and body paragraphs of a detail page, stores them as fields and prints them. */
    private void processDetailPage(Page page) {
        String newTitle = page.getHtml().xpath("/html/body/div[2]/div/div[3]/div[1]/div[2]/div[1]/h1").toString();
        page.putField("newTitle", newTitle);
        System.out.println("newTitle = "+ newTitle);

        // All matching paragraph texts are collected and stored as the list's toString() rendering.
        String newContext = page.getHtml().xpath("//*[@id=\"ContentBody\"]/p/text()").all().toString();
        page.putField("newContext", newContext);
        System.out.println("newContext = " + newContext);
    }

    /** Prints every row of a list page, then queues the detail-page links and the pager links. */
    private void processListPage(Page page) {
        // Rows carry ids newsTr0, newsTr1, ...; stop at the first index whose link is blank.
        for (int num = 0; ; num++) {
            String row = "//*[@id=\"newsTr" + num + "\"]";

            String img = page.getHtml().xpath(row + "/div[1]/a/@href").toString();
            if (StringUtils.isBlank(img)) {
                break;
            }
            System.out.println("img="+ img);

            // NOTE(review): this selects the anchor's href, not its text, although it is
            // printed as "title" - confirm whether text() was intended.
            String title = page.getHtml().xpath(row + "/div[2]/p[1]/a/@href").toString();
            System.out.println("title = "+ title);

            String context = page.getHtml().xpath(row + "/div[2]/p[2]/text()").toString();
            System.out.println("context = " + context);

            String time = page.getHtml().xpath(row + "/div[2]/p[3]/text()").toString();
            System.out.println("time = " + time);
        }

        // Queue the detail pages linked from this list page.
        page.addTargetRequests(page.getHtml().xpath("//*[@id=\"newsListContent\"]/li/div[2]/p[1]/a/@href").all());
        // Queue the pager links so the following list pages are crawled too.
        page.addTargetRequests(page.getHtml().xpath("//*[@id=\"pagerNoDiv\"]/a[@class='page-btn']/@href").all());
    }

    /**
     * Runs the spider single-threaded from the seed list page and prints the elapsed time in
     * whole seconds together with the final value of {@link #count}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println("开始爬取...");
        long startTime = System.currentTimeMillis();
        Spider.create(new TestSpider()).addUrl("http://futures.eastmoney.com/news/cqhdd.html").thread(1).run();
        long endTime = System.currentTimeMillis();
        System.out.println("爬取结束,耗时约" + ((endTime - startTime) / 1000) + "秒,抓取了"+count+"条记录");
    }
}

 

webmagic地址: http://webmagic.io/

 

posted @ 2019-04-11 19:35  qqq齐  阅读(122)  评论(0)    收藏  举报