java网页爬虫

1.导入相关jar包

1.用于页面解析（jsoup）
<dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.14.2</version> </dependency>
2.http请求

<dependency>
    <groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>

 

2.获取页面源码

 // 1. Create the HTTP client; try-with-resources guarantees it is closed
 //    even when an exception is thrown (the original only closed it on the
 //    happy path and leaked the client/connection on failure).
 try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
     // Target address; add query parameters with uriBuilder.setParameter(name, value)
     // if the site requires them (e.g. callback / page / productId).
     URIBuilder uriBuilder = new URIBuilder("https://xy.51job.com/default-xs.php");

     // 2. Build the GET request for that address.
     HttpGet httpGet = new HttpGet(uriBuilder.build());
     // Pretend to be a browser (copy the user-agent value from a real request's
     // headers in the browser dev tools); many sites reject bare clients.
     httpGet.setHeader("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36");

     // 3. Execute the request; the response is AutoCloseable too, so close it
     //    deterministically even if parsing below throws.
     try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
         // 4. Parse the response body only on HTTP 200.
         if (response.getStatusLine().getStatusCode() == 200) {
             // HttpEntity wraps the response message body (headers + content).
             HttpEntity entity = response.getEntity();
             // NOTE(review): decoding as gb2312 matches the original code;
             // confirm against the page's declared charset (GBK-family sites).
             String content = EntityUtils.toString(entity, "gb2312");
             // Hand the raw HTML off to the page parser.
             tests(content);
         }
     }
 }

 

3.解析页面 方式一：通过 jsoup 的 CSS 选择器（类 jQuery 语法）获取页面信息

  // 1. Parse the raw HTML string into a jsoup Document.
  Document doc = Jsoup.parse(string);
  // 2. CSS selector: each "div.e" under "div.ctxt > div.cell" is one job posting.
  Elements spuEles = doc.select("div.ctxt > div.cell > div.e");
  System.out.println(String.valueOf(spuEles));
  for (Element spuEle : spuEles) {
      // Run the "[target]" attribute query once per posting instead of twice
      // (the original repeated the identical select() call for title and href).
      Elements link = spuEle.select("[target]");
      // Job posting title.
      String topic = link.attr("title");
      System.out.println("==================topic================" + topic);
      // URL of the hiring company's detail page, collected for later crawling.
      String url = link.attr("href");
  }

 

4.解析页面 方式二：获取页面 JS（JSONP）数据进行爬取

   // Fetch the raw JSONP payload, shaped like: fetchJSON_comment98({...});
   String iphone = iphone();
   System.err.println(iphone);
   // Guard against null explicitly: the original used !"".equals(iphone),
   // and since "".equals(null) is false a null payload slipped through and
   // threw a NullPointerException on replace() below.
   if (iphone != null && !iphone.isEmpty()) {
       // Strip the JSONP callback prefix...
       String replace = iphone.replace("fetchJSON_comment98(", "");
       // ...and the trailing ");" so only the JSON object text remains.
       String data = replace.substring(0, replace.lastIndexOf(";") - 1);

       JSONObject action = JSONObject.parseObject(data);

       // "comments" holds one JSON object per user review.
       JSONArray comments = action.getJSONArray("comments");

       List<Comments> commentsList = comments.toJavaList(Comments.class);

       for (Comments comment : commentsList) {
           System.out.println("===============用户====================" + comment);
           commentService.insert(comment);
       }
       // Crawl at most 100 pages.
       // NOTE(review): assumes insert() advances `page` before recursing —
       // confirm, otherwise this recursion never terminates.
       if (page <= 100) {
           System.err.println(page);
           insert();
       }

   } else {

       System.out.println("===========没有用户==============");
   }
  

 

posted @ 2021-09-30 08:36  takeoff_zy  阅读(114)  评论(0)    收藏  举报