【爬虫入门】HttpClient+Jsoup进行简单的网页访问和信息保存

【项目选型】

(Maven)SpringBoot+JPA

【项目搭建】

pom.xml:

<!-- Inherit from the Spring Boot 2.5.0 starter parent. -->
<parent>
        <artifactId>spring-boot-starter-parent</artifactId>
        <groupId>org.springframework.boot</groupId>
        <version>2.5.0</version>
    </parent>

    <!-- NOTE(review): dependencies below that declare no <version> are presumably
         resolved through the parent's dependency management; confirm with mvn dependency:tree. -->
    <dependencies>

        <!-- Spring MVC / embedded web starter -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

        <!-- Spring Data JPA starter -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-jpa</artifactId>
        </dependency>

        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
        </dependency>

        <!-- Apache HttpClient, used to fetch pages and images -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
        </dependency>

        <!-- Jsoup HTML parser; the only dependency here with an explicit version -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.14.2</version>
        </dependency>

        <!-- Apache Commons Lang 3 utilities -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
        </dependency>

    </dependencies>
View Code

AppMain.java:

/**
 * Application entry point.
 *
 * <p>{@code @EnableScheduling} turns on scheduled tasks for the application.
 */
@EnableScheduling
@SpringBootApplication
public class AppMain {

    /**
     * Starts the Spring application with this class as the primary source.
     *
     * @param args command-line arguments, passed through to Spring
     */
    public static void main(String[] args) {
        SpringApplication.run(AppMain.class, args);
    }
}
View Code

 

【分析】

【具体实现】

POJO+JPA+SQL建表

POJO类:
/**
 * JPA entity mapped to the {@code md_item} table.
 *
 * <p>All columns other than the primary key are kept as plain strings, matching the
 * {@code varchar} columns of the table definition.
 */
@Entity
@Table(name = "md_item")
public class Product {
    /**
     * Primary key, generated by the database ({@code IDENTITY} / auto-increment).
     *
     * <p>Declared as {@code Long} rather than {@code String}: the column is
     * {@code bigint}, {@code ProductDao} declares {@code Long} as the id type, and
     * identity generation only works with integral id types.
     */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    Long id;
    String proid;
    String proauthor;
    String protitle;
    String probackcount;
    String probackermoney;
    String promoneypercent;
    String starttime;
    String endtime;
    String protype;
    String prostatus;
    String proimgpath;
    // getters / setters / toString omitted
}

JPA:
/**
 * Spring Data JPA repository for {@link Product}.
 *
 * <p>The type arguments of {@code JpaRepository} are the entity class followed by
 * the type of its primary key.
 */
public interface ProductDao extends JpaRepository<Product,Long> {
}

SQL:
-- Recreates the md_item table from scratch: an existing table (and its rows) is dropped first.
DROP TABLE IF EXISTS `md_item`;
-- One auto-increment bigint primary key; every other column is a nullable varchar.
-- NOTE(review): the utf8mb4_0900_ai_ci collation is presumably only available on MySQL 8.0+; confirm the target server version.
CREATE TABLE `md_item` (
  `id` bigint(20) NOT NULL AUTO_INCREMENT,
  `proid` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
  `proauthor` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
  `protitle` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
  `probackcount` varchar(20) DEFAULT NULL,
  `probackermoney` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
  `promoneypercent` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
  `starttime` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
  `endtime` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
  `protype` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
  `prostatus` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
  `proimgpath` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
  PRIMARY KEY (`id`)
-- AUTO_INCREMENT=82 sets the next generated id for the freshly created table.
) ENGINE=InnoDB AUTO_INCREMENT=82 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
准备工作

 

封装HttpClient 实现获取网页代码和图片下载

/**
 * 封装HttpClient,交给Spring容器管理
 */
@Component
public class HttpUtils {
    //httpclient连接池!
    private PoolingHttpClientConnectionManager clientConnectionManager;

    /**
     * 在构造方法中new一个
     */
    public HttpUtils() {
        this.clientConnectionManager = new PoolingHttpClientConnectionManager();
        //设置最大连接数
        this.clientConnectionManager.setMaxTotal(100);
        //设置每个主机的最大连接数
        this.clientConnectionManager.setDefaultMaxPerRoute(10);
    }

    /**
     * 使用get请求获得页面
     * @param url
     * @return 页面数据
     */
    public String doGetHtml(String url){
        //获取HttpClient对象
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.clientConnectionManager).build();
        //创建HttpGet请求对象,设置url地址
        HttpGet httpGet = new HttpGet(url);
        //设置请求信息
        httpGet.setConfig(this.getConfig());
        CloseableHttpResponse httpResponse=null;
        //使用HttpClient发起请求,获得相应
        try {
            httpResponse = httpClient.execute(httpGet);
            if(httpResponse.getStatusLine().getStatusCode()==200){
                //判断Entity是否为空,如果不为空就可以使用EntityUtils
                if(httpResponse.getEntity()!=null){
                    String content = EntityUtils.toString(httpResponse.getEntity(),"utf-8");
                    return content;
                }else{
                    return "ERROR";
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }finally {
            if(httpResponse!=null){
                try {
                    httpResponse.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        //解析响应,返回结果
        return "";
    }

    /**
     * 设置RequestConfig
     * @return
     */
    private RequestConfig getConfig() {
        RequestConfig config=RequestConfig.custom()
                .setConnectTimeout(1000)    //创建链接的最长时间
                .setConnectionRequestTimeout(500)   //获取链接的最长时间
                .setSocketTimeout(10000)    //数据传输的最长时间
                .build();
        return config;
    }

    /**
     * 下载图片
     * @param url
     * @return 图片名称
     */
    public String doGetImage(String url){
//获取HttpClient对象
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.clientConnectionManager).build();
        //创建HttpGet请求对象,设置url地址
        HttpGet httpGet = new HttpGet(url);
        //设置请求信息
        httpGet.setConfig(this.getConfig());
        CloseableHttpResponse httpResponse=null;
        //使用HttpClient发起请求,获得相应
        try {
            httpResponse = httpClient.execute(httpGet);
            if(httpResponse.getStatusLine().getStatusCode()==200){
                //判断Entity是否为空,如果不为空就可以使用EntityUtils
                if(httpResponse.getEntity()!=null){
                    //下载图片
                    //获取图片后缀
                    String exName=url.substring(url.lastIndexOf("."));
                    //创建图片名,重命名图片
                    String picName= UUID.randomUUID().toString()+exName;
                    //下载图片
                    OutputStream outputStream=new FileOutputStream(new File("G:/IJDailyCode/Crawler/src/main/resources/downloadImg/"+picName));
                    httpResponse.getEntity().writeTo(outputStream);
                    //返回图片名称
                    return picName;
                }else{
                    return "ERROR";
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }finally {
            if(httpResponse!=null){
                try {
                    httpResponse.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        //出现其他问题返回结果
        return "";
    }

任务方法:

/**
 * Scheduled crawler task: fetches the listing pages, extracts each project entry,
 * and saves entries that are not stored yet.
 */
@Component
public class GetTask {
    @Autowired
    HttpUtils httpUtils;
    @Autowired
    ProductService productService;

    /**
     * Crawls listing pages 1 through 8 and hands each page's HTML to {@code parse}.
     * Scheduled with a fixed delay of 100*1000 (milliseconds under Spring's
     * {@code @Scheduled} contract) between runs.
     */
    @Scheduled(fixedDelay = 100*1000)
    public void crawlerMain(){
        String mainUrl="https://zhongchou.modian.com/all/top_time/going/";
        /* Loop over the page numbers */
        for(int page=1;page<=8;++page){
            String tempUrl=mainUrl+page;
            String html = httpUtils.doGetHtml(tempUrl);
            /* Parse the page and extract the project data */
            this.parse(html);
        }
    }

    /**
     * Parses one listing page and saves every entry whose {@code proid} is not
     * already stored.
     *
     * @param html page source as returned by {@code HttpUtils.doGetHtml}
     */
    private void parse(String html) {
        Document doc = Jsoup.parse(html);
        Elements proElms = doc.select("div.pro_field > ul > li");
        for (Element proElm:proElms) {
            /*
             * Assorted selector lookups for the individual fields.
             */
            String proId = proElm.attr("data-pro-id");
            String proAuthor=proElm.select("div.author > a > p").text();
            // ... more selector lookups of the same kind were elided by the author ...
            // NOTE(review): infoDoc is not declared in the visible code; presumably it is
            // created in the elided lines. Confirm before reusing this snippet.
            String imgUrl=infoDoc.getElementById("big_logo").attr("src");

            // Null properties of the probe entity are ignored by the example query,
            // so setting only proId is enough here.
            Product proExample=new Product();
            proExample.setProid(proId);
            
            // Query and check whether the record already exists
            List<Product> examples = productService.findAll(proExample);
            System.out.println("list有无数据:"+examples.size());
            if(examples.size()>0){
                System.out.println("===数据已存在===");
                continue;
            }
            
            /* Bind the extracted values */
            proExample.setProauthor(proAuthor);
            // ... more setter calls of the same kind were elided by the author ...
            // NOTE(review): proStatus is not declared in the visible code; presumably it
            // comes from the elided lookups above.
            proExample.setProstatus(proStatus);
            /* Download the image */
            // NOTE(review): doGetImage returns "" or "ERROR" on failure, and that value is
            // stored as the image path without being checked.
            String proImgPath=httpUtils.doGetImage(imgUrl);
            proExample.setProimgpath(proImgPath);
            /* Save the record */
            productService.save(proExample);
        }
    }
}    

最没用的Service

/**
 * Transactional {@code ProductService} implementation that delegates directly to
 * {@code ProductDao}.
 */
@Transactional
@Service
public class ProductServiceImpl implements ProductService {

    @Autowired
    private ProductDao productDao;

    /**
     * Saves the given product through the repository.
     *
     * @param product entity to save
     */
    @Override
    public void save(Product product) {
        productDao.save(product);
    }

    /**
     * Runs a query-by-example using the given product as the probe.
     *
     * @param product probe entity wrapped into an {@code Example}
     * @return the products returned by the repository for that example
     */
    @Override
    public List<Product> findAll(Product product) {
        return productDao.findAll(Example.of(product));
    }
}

【保存结果展示】

【遇到问题】

  网页抓取下来的部分字段本身就是空值(NULL),而代码中没有做判空处理,导致保存失败。这里暂时没有加判断。

  【重要】在判断数据是否重复时,使用JPARepository的findAll的Example方式,提供一个POJO模板进行自动查询,返回的List.size()始终为0,导致重复数据还会保存。

  【重要】@Table(name="表名") 在 IDE 中报红(通常是 IDE 没有配置数据源导致的检查提示,一般不影响编译和运行)。

 

posted @ 2021-09-06 11:10  YFEYI  阅读(176)  评论(0编辑  收藏  举报