webmagic爬虫简单应用
public class TestSpider implements PageProcessor { private Site site = Site.me().setRetryTimes(3).setSleepTime(100) .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"); public static int count; @Override public Site getSite() { return site; } @Override public void process(Page page) { int num = 0; //因为详情页的域名不一样,所以用.*代替 if (page.getUrl().regex(".*/a/[0-9]+\\.html").match()) { String newTitle = page.getHtml().xpath("/html/body/div[2]/div/div[3]/div[1]/div[2]/div[1]/h1").toString(); page.putField("newTitle",newTitle); System.out.println("newTitle = "+ newTitle); String newContext = page.getHtml().xpath("//*[@id=\"ContentBody\"]/p/text()").all().toString(); page.putField("newContext", newContext); System.out.println("newContext = " + newContext); } else { while (true) { //获取页面需要的内容 String img = page.getHtml().xpath("//*[@id=\"newsTr"+num+"\"]/div[1]/a/@href").toString(); if (!StringUtils.isNotBlank(img)) { break; } System.out.println("img="+ img); String title = page.getHtml().xpath("//*[@id=\"newsTr"+num+"\"]/div[2]/p[1]/a/@href").toString(); System.out.println("title = "+ title); String context = page.getHtml().xpath("//*[@id=\"newsTr"+num+"\"]/div[2]/p[2]/text()").toString(); System.out.println("context = " + context); String time = page.getHtml().xpath("//*[@id=\"newsTr"+ num +"\"]/div[2]/p[3]/text()").toString(); System.out.println("time = " + time); num++; } //将详情页面的地址添加到page中 page.addTargetRequests(page.getHtml().xpath("//*[@id=\"newsListContent\"]/li/div[2]/p[1]/a/@href").all()); //将下一页的地址添加到page中 page.addTargetRequests(page.getHtml().xpath("//*[@id=\"pagerNoDiv\"]/a[@class='page-btn']/@href").all()); } count ++; } public static void main(String[] args) { long startTime, endTime; System.out.println("开始爬取..."); startTime = System.currentTimeMillis(); Spider.create(new TestSpider()).addUrl("http://futures.eastmoney.com/news/cqhdd.html").thread(1).run(); endTime = System.currentTimeMillis(); System.out.println("爬取结束,耗时约" + ((endTime - startTime) / 1000) + "秒,抓取了"+count+"条记录"); } }
webmagic地址: http://webmagic.io/

浙公网安备 33010602011771号