Java爬页面数据

 
 <!-- web-scraping dependencies: start -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.9</version>
        </dependency>
        <dependency>
            <groupId>net.sourceforge.htmlunit</groupId>
            <artifactId>htmlunit</artifactId>
            <version>2.27</version>
        </dependency>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>
<!-- web-scraping dependencies: end -->

一.创建你要爬取的字段实体

package com.tecnon.common.utils;
import lombok.Data;

/**
 * Holds the fields scraped for a single book.
 *
 * <p>All values are stored as plain strings. Getters, setters,
 * {@code equals}/{@code hashCode} and {@code toString} are generated by
 * Lombok's {@code @Data}.
 */
@Data
public class POItoExcel {
    /**
     * Book title.
     */
    private String bookName;
    /**
     * Price.
     */
    private String price;
    /**
     * Author.
     */
    private String author;
    /**
     * Publisher.
     *
     * <p>Named in lowerCamelCase; Lombok still generates {@code getPress()} and
     * {@code setPress(String)}, so accessor-based callers are unaffected.
     * NOTE(review): a serializer that reads field names directly (instead of
     * getters) would now emit "press" rather than "Press" - confirm if relevant.
     */
    private String press;
    /**
     * Publication date.
     */
    private String pressTime;
}

二.单元测试实现代码
这是我要爬取的页面链接:https://www.bookuu.com/search.php?cid=101702
实现单元测试
/**
 * Scrapes result pages 1 and 2 of the bookuu.com category listing and prints
 * the collected books as JSON.
 *
 * <p>Each page is fetched with Jsoup. If a page fails, the failure is reported
 * on stderr and the remaining pages are still attempted, so the final list may
 * be partial.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    List<POItoExcel> poItoExcelList = new ArrayList<>();
    for (int i = 1; i <= 2; i++) {
        String url = "https://www.bookuu.com/search.php?cid=101702&page=" + i;
        try {
            // Send a browser-like user agent, and the page's own URL as the referer.
            Document document = Jsoup.connect(url)
                    .header("user-agent",
                            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36")
                    .header("referer", url)
                    .get();
            // One POItoExcel is built for every element carrying the class "wd-640".
            for (Element entry : document.body().getElementsByClass("wd-640")) {
                POItoExcel poItoExcel = new POItoExcel();
                // NOTE(review): the multi-class arguments below are passed to
                // getElementsByClass as a single string; presumably they match
                // elements whose whole class attribute equals that string - confirm
                // against the Jsoup version in use before changing them.
                Elements bookName = entry.getElementsByClass("fs-16");
                Elements price = entry.getElementsByClass("fs-21");
                Elements author = entry.getElementsByClass("wd-30p fl to-hd mr-10");
                Elements press = entry.getElementsByClass("wd-30p fl to-hd cl-9 mr-10");
                Elements pressTime = entry.getElementsByClass("wd-30p fl to-hd cl-9");
                // Copy the scraped text into the record and collect it.
                poItoExcel.setBookName(bookName.text());
                poItoExcel.setPrice(price.text());
                poItoExcel.setAuthor(author.text());
                poItoExcel.setPress(press.text());
                poItoExcel.setPressTime(pressTime.text());
                poItoExcelList.add(poItoExcel);
            }
        } catch (Exception e) {
            // Best effort per page: say which page failed, then carry on.
            System.err.println("Failed to scrape " + url);
            e.printStackTrace();
        }
        System.out.println("第" + i + "页结束");
    }
    System.out.println("----" + StringUtil.getJsonFromObject(poItoExcelList) + "----");
}
如有问题,请加 QQ:501397578

 

posted @ 2020-08-26 14:52  暖瞳123  阅读(84)  评论(0)  编辑  收藏  举报