Webmagic JAVA爬虫
爬取示例网站:http://my.oschina.net/flashsword/blog
需要得到网站的这一块的数据

引入jar包
<!-- web magic -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
java测试代码:
package com.chenpeng.cpeducloud;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor;

import java.io.IOException;
import java.util.List;

/**
 * WebMagic demo crawler: crawls blog posts from my.oschina.net/flashsword,
 * extracting title, content, code blocks and tags from each post page.
 *
 * auth : szy
 * time : 2019-09-06
 */
public class Test7 implements PageProcessor {

    // Crawler configuration; restricts crawling to this domain.
    private Site site = Site.me().setDomain("my.oschina.net");

    public static void main(String[] args) throws IOException {
        // Target start URL for the crawl
        Spider.create(new Test7())
                // start crawling from the blog index page
                .addUrl("http://my.oschina.net/flashsword/blog")
                // optional: use Redis to manage the URL queue
                //.setScheduler(new RedisScheduler("localhost"))
                // print extracted fields to the console
                .addPipeline(new ConsolePipeline())
                // run with a single worker thread
                .thread(1)
                // start the crawler (blocks until finished)
                .run();
    }

    @Override
    public void process(Page page) {
        // Matches all links of the form "http://my.oschina.net/flashsword/blog/<digits>"
        // (\\d+ = one or more digits) and queues them as further crawl targets.
        List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
        page.addTargetRequests(links);
        // XPath extraction
        page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString());
        // CSS selector extraction
        page.putField("content", page.getHtml().$("div.content").toString());
        // extract elements from the HTML
        page.putField("codeBlock", page.getHtml().$("div.codeBlock").toString());
        page.putField("tags", page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all());
    }

    @Override
    public Site getSite() {
        return site;
    }
}
运行后输出的结果:

如果是POST请求
// Site configuration: UTF-8 charset, minimal delay between requests,
// and a desktop Firefox User-Agent string.
private Site site = Site.me()
        .setCharset("UTF-8")
        .setSleepTime(1)
        .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0");

public static void main(String[] args) throws IOException {
    // Build a POST request with a form-encoded body instead of a plain GET URL.
    Request request = new Request("http://www.baidu.com");
    request.setMethod(HttpConstant.Method.POST);
    Map<String, Object> data = new HashMap<>();
    data.put("queryDate", "2222");
    data.put("idcertnum", "111");
    request.setRequestBody(HttpRequestBody.form(data, "utf-8"));
    // Target request for the crawl
    Spider.create(new Test7())
            // start from the prepared POST request
            .addRequest(request)
            //.addUrl("http://my.oschina.net/flashsword/blog")
            // optional: use Redis to manage the URL queue
            //.setScheduler(new RedisScheduler("localhost"))
            // print extracted fields to the console
            .addPipeline(new ConsolePipeline())
            // run with a single worker thread
            .thread(1)
            // start the crawler (blocks until finished)
            .run();
}
实际使用过程中需要使用代理IP,对方网站可能针对IP做了限制,如每个IP只能请求多少次之类的.
这里测试使用免费的IP(https://www.kuaidaili.com/free/inha/1/),失效的概率还是挺大的,仅供学习使用
{ //使用代理IP进行发送请求 HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); List<IPProject> list = new ArrayList<>(); IPProject ipProject = new IPProject(); ipProject.setIp("183.247.211.151"); ipProject.setPort("30001"); list.add(ipProject); ipProject = new IPProject(); ipProject.setIp("222.78.6.190"); ipProject.setPort("8083"); list.add(ipProject); Random random = new Random(); int ip_index = random.nextInt(1); ip_index = 1; log.info("ip_index=" + ip_index); IPProject ips = list.get(ip_index); httpClientDownloader.setProxyProvider(SimpleProxyProvider.from(new Proxy(ips.getIp(), Integer.parseInt(ips.getPort())))); for (int i=0;i<5;i++) { Spider.create(new NetyunguanProcessor()) //从https://github.com/code4craft开始抓 //.addRequest(request) .addUrl("http://www.baidu.com") //设置Scheduler,使用Redis来管理URL队列 //.setScheduler(new RedisScheduler("localhost")) //设置Pipeline,将结果以json方式保存到文件 //.addPipeline(new ConsolePipeline()) //开启5个线程同时执行 .thread(1) .setExitWhenComplete(true) .setDownloader(httpClientDownloader) //启动爬虫 .run(); log.info("i======================"+i); } }
后台看请求的IP发现IP地址是不停的变化的
浙公网安备 33010602011771号