Java 网页爬虫
1.导入相关jar包
1.用于页面解析
<dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.14.2</version> </dependency>
2.http请求
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>
2.获取页面源码
//1.创建对象 CloseableHttpClient httpClient = HttpClients.createDefault(); //设置请求地址http://yun.itheima.com/search?keys=java URIBuilder uriBuilder = new URIBuilder("https://xy.51job.com/default-xs.php"); //设置请求信息 // uriBuilder.setParameter("callback","jQuery1192927"); // uriBuilder.setParameter("page","1"); // uriBuilder.setParameter("productId","100008348530"); //2.访问地址 HttpGet httpGet = new HttpGet(uriBuilder.build()); // 模拟浏览器浏览(user-agent的值可以通过浏览器浏览,查看发出请求的头文件获取) httpGet.setHeader("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"); //3.发起请求 CloseableHttpResponse response = httpClient.execute(httpGet); //4.解析响应 if (response.getStatusLine().getStatusCode() == 200) { // HttpEntity表示http的request和resposne实体,它由消息头和消息体组成。 // 从HttpEntity中可以获取http请求头和回应头,也可以获取http请求体和回应体信息。 //HttpEntity的使用,与@RequestBody 、@ResponseBody类似。 HttpEntity entity = response.getEntity(); String content = EntityUtils.toString(entity, "gb2312"); //解析页面获取数据 tests(content); // System.out.println(content); } //5.关闭response response.close(); httpClient.close();
3.解析页面 1.通过 Jsoup 的类 jQuery(CSS)选择器语法来获取页面信息
//1.解析Url地址 Document doc = Jsoup.parse(string); //2.使用标签选择器,获取title Elements spuEles = doc.select("div.ctxt > div.cell > div.e"); System.out.println(String.valueOf(spuEles)); for (Element spuEle : spuEles) { //获取招聘标题 String topic = spuEle.select("[target]").attr("title"); System.out.println("==================topic================" + topic); //获取招聘公司url String url = spuEle.select("[target]").attr("href"); }
4.解析页面 2.获取页面 JS 接口返回的数据(JSONP)并进行爬取
//获取数据 String iphone = iphone(); System.err.println(iphone); if(!"".equals(iphone)){ //掐头 String replace = iphone.replace("fetchJSON_comment98(", ""); //去尾 String data = replace.substring(0, replace.lastIndexOf(";") - 1); JSONObject action = JSONObject.parseObject(data); JSONArray comments = action.getJSONArray("comments"); List<Comments> commentsList = comments.toJavaList(Comments.class); for (Comments string : commentsList) { System.out.println("===============用户===================="+string); commentService.insert(string); } if(page <= 100){ System.err.println(page); insert(); } }else { System.out.println("===========没有用户=============="); }

浙公网安备 33010602011771号