Webmagic JAVA爬虫

 

例子网站:http://my.oschina.net/flashsword/blog

需要得到网站的这一块的数据

引入jar包

<!-- web magic -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
        </dependency>

        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>

java测试代码:

package com.chenpeng.cpeducloud;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import java.io.IOException;
import java.util.List;

/**
 * WebMagic demo crawler: starts from the blog index page at
 * my.oschina.net/flashsword/blog, follows links to individual posts,
 * and prints the extracted fields (title, content, code blocks, tags)
 * to the console via {@link ConsolePipeline}.
 *
 * author: szy
 * date:   2019-09-06
 */
public class Test7 implements PageProcessor {

    // Crawler configuration; restricts the crawl to this domain.
    private Site site = Site.me().setDomain("my.oschina.net");

    public static void main(String[] args) throws IOException {

        // Target start URL
        Spider.create(new Test7())
                // seed the crawl with the blog index page
                .addUrl("http://my.oschina.net/flashsword/blog")
                // optional: use Redis to manage the URL queue
                //.setScheduler(new RedisScheduler("localhost"))
                // pipeline: print extracted results to the console
                .addPipeline(new ConsolePipeline())
                // run with a single worker thread
                .thread(1)
                // start the crawler (blocks until the URL queue is exhausted)
                .run();

    }


    @Override
    public void process(Page page) {

        // Collect links matching "http://my.oschina.net/flashsword/blog/<digits>"
        // (\\d+ = one or more digits) and queue them for crawling.
        List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
        page.addTargetRequests(links);

        // XPath extraction: post title
        page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString());
        // CSS-selector extraction: post body
        page.putField("content", page.getHtml().$("div.content").toString());
        // CSS-selector extraction: code blocks within the post
        page.putField("codeBlock", page.getHtml().$("div.codeBlock").toString());
        // XPath extraction: tag texts (all matches)
        page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all());
    }

    @Override
    public Site getSite() {
        return site;
    }
}

运行后输出的结果:

 

如果是POST请求

   // Crawler configuration for the POST example: UTF-8 charset, minimal
   // politeness delay (1 ms between requests), and a desktop Firefox User-Agent.
   private Site site = Site.me()
            .setCharset("UTF-8")
            .setSleepTime(1)
            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0");

    /**
     * Demonstrates seeding the crawler with a POST request carrying a
     * form-encoded body.
     * NOTE(review): this snippet needs additional imports not shown here
     * (Request, HttpConstant, HttpRequestBody, Map, HashMap).
     */
    public static void main(String[] args) throws IOException {

        // Build a POST request with form-encoded parameters.
        Request request = new Request("http://www.baidu.com");
        request.setMethod(HttpConstant.Method.POST);
        Map<String, Object> data = new HashMap<>();
        data.put("queryDate","2222");
        data.put("idcertnum","111");

        request.setRequestBody(HttpRequestBody.form(data,"utf-8"));
        // Target URL
        Spider.create(new Test7())
                // seed the crawl with the prepared POST request
                .addRequest(request)
                //.addUrl("http://my.oschina.net/flashsword/blog")
                // optional: use Redis to manage the URL queue
                //.setScheduler(new RedisScheduler("localhost"))
                // pipeline: print extracted results to the console
                .addPipeline(new ConsolePipeline())
                // run with a single worker thread
                .thread(1)
                // start the crawler
                .run();

    }

 实际使用过程中需要使用代理IP,对方可能针对IP做了限制,如每个IP只能请求多少次之类的。

这里测试使用免费的IP,https://www.kuaidaili.com/free/inha/1/ ,失效的概率还是挺大的,仅供学习使用

{
        // Configure a downloader that routes requests through a proxy IP.
        // NOTE(review): IPProject and `log` are project-local and not shown
        // in this snippet.
        HttpClientDownloader httpClientDownloader = new HttpClientDownloader();

        // Hard-coded candidate proxies (free proxies from kuaidaili.com;
        // demo/learning use only — they expire frequently).
        List<IPProject> list = new ArrayList<>();
        IPProject ipProject = new IPProject();
        ipProject.setIp("183.247.211.151");
        ipProject.setPort("30001");
        list.add(ipProject);

        ipProject = new IPProject();
        ipProject.setIp("222.78.6.190");
        ipProject.setPort("8083");
        list.add(ipProject);

        // NOTE(review): nextInt(1) always returns 0, so this is not random;
        // use random.nextInt(list.size()) to actually pick among the proxies.
        Random random = new Random();
        int ip_index = random.nextInt(1);
        // NOTE(review): this hard-coded override defeats the selection above
        // entirely — looks like a debugging leftover; confirm and remove.
        ip_index = 1;
        log.info("ip_index=" + ip_index);

        IPProject ips = list.get(ip_index);

        // Register the chosen proxy with the downloader.
        httpClientDownloader.setProxyProvider(SimpleProxyProvider.from(new Proxy(ips.getIp(), Integer.parseInt(ips.getPort()))));


        // Run the crawl 5 times in sequence to observe the proxy in action.
        for (int i=0;i<5;i++) {

            Spider.create(new NetyunguanProcessor())
                    // seed URL (the commented addRequest shows the POST variant)
                    //.addRequest(request)
                    .addUrl("http://www.baidu.com")
                    // optional: use Redis to manage the URL queue
                    //.setScheduler(new RedisScheduler("localhost"))
                    // optional console pipeline (disabled)
                    //.addPipeline(new ConsolePipeline())
                    // run with a single worker thread
                    .thread(1)
                    .setExitWhenComplete(true)
                    // route all requests through the proxy-enabled downloader
                    .setDownloader(httpClientDownloader)
                    // start the crawler
                    .run();
            log.info("i======================"+i);
        }

    }

后台看请求的IP发现IP地址是不停的变化的

posted on 2021-12-07 08:38  手撕高达的村长  阅读(154)  评论(0)    收藏  举报

导航