java爬虫
首先,创建一个Maven项目
一,导包

二,创建一个测试类

注意:url为网页地址
   
模拟浏览器的头信息
需要爬的网页:

结果:

接下来爬取一个数据量更大的网站
首先,爬下来放到redis中,然后存到mysql数据库

只需要写5个类,两个实体类,一个dao,一个放到redis中的类,一个存进mysql的类
配置pom.xml(这里只列出了jsoup的依赖;后面的代码还用到了jackson-databind、jedis、c3p0以及MariaDB的JDBC驱动,同样需要加入依赖)
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>

 
package com.nf147.ojp;

/**
 * Bean describing one scraped policy: its own page data plus the
 * {@link PolicySource} document it refers to.
 *
 * <p>All properties are plain read/write String values except
 * {@code policySource}; none of them is validated.
 */
public class Policy {

    /** Policy title. */
    private String title;

    /** URL associated with the policy. */
    private String url;

    /** Policy content text. */
    private String content;

    /** Policy basis text. */
    private String basis;

    /** Additional policy information text. */
    private String info;

    /** The source document attached to this policy; may be {@code null}. */
    private PolicySource policySource;

    /** @return the attached source document, or {@code null} if none was set */
    public PolicySource getPolicySource() {
        return this.policySource;
    }

    /** @param policySource the source document to attach */
    public void setPolicySource(PolicySource policySource) {
        this.policySource = policySource;
    }

    /** @return the policy basis text */
    public String getBasis() {
        return this.basis;
    }

    /** @param basis the policy basis text */
    public void setBasis(String basis) {
        this.basis = basis;
    }

    /** @return the additional policy information text */
    public String getInfo() {
        return this.info;
    }

    /** @param info the additional policy information text */
    public void setInfo(String info) {
        this.info = info;
    }

    /** @return the policy title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the policy title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the URL associated with the policy */
    public String getUrl() {
        return this.url;
    }

    /** @param url the URL associated with the policy */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the policy content text */
    public String getContent() {
        return this.content;
    }

    /** @param content the policy content text */
    public void setContent(String content) {
        this.content = content;
    }

    /**
     * Renders every property in the form {@code Policy{name='value', ...}};
     * unset properties are rendered as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Policy{");
        text.append("title='").append(title).append('\'');
        text.append(", url='").append(url).append('\'');
        text.append(", content='").append(content).append('\'');
        text.append(", basis='").append(basis).append('\'');
        text.append(", info='").append(info).append('\'');
        text.append(", policySource=").append(policySource);
        text.append('}');
        return text.toString();
    }
}

 
package com.nf147.ojp; public class PolicySource { private String url; private String title; private String content; public String getUrl() { return url; } public void setUrl(String url) { this.url = url; } public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String getContent() { return content; } public void setContent(String content) { this.content = content; } @Override public String toString() { return "PolicySource{" + "url='" + url + '\'' + ", title='" + title + '\'' + ", content='" + content.substring(0,20) + '\'' + '}'; } }

 
package com.nf147.ojp; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.IOException; import java.util.ArrayList; import java.util.List; public class PolicyDAO { //186个页面 //定义官网的地址 public String hostURL = "http://www.zhsme.gov.cn"; //定义列表页面 //抓取列表 public List<Policy> getListInfo(int pageNum) throws IOException { String policyListUrl = "http://www.zhsme.gov.cn/policy/getPolicyList?pageNum=" + pageNum + "&NameOrWords=&areaSreachValue=&areaSreachId=&scaleSreachValue=&scaleSreachId=&levelSreachValue=&levelSreachId=&isShuangChuang="; List<Policy> list = new ArrayList<>(); Policy item = null; //获取到列表 Element doc = Jsoup.connect(policyListUrl) .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36") .get(); Elements listTitle = doc.select(".list-content.list-content-1 h4 a"); for (Element next : listTitle) { item = new Policy(); //政策信息 item.setPolicySource(new PolicySource()); //设置url item.setUrl(hostURL + next.attr("href")); //设置标题 item.setTitle(next.attr("title")); list.add(item); } return list; } //抓取政策信息 public Policy getPolicyInfo(Policy policy) throws IOException { //定义政策源页面 String policyUrl = "http://www.zhsme.gov.cn/policy/getTextPolicyByTextPolicyId?textPolicyId="; //抓取政策信息 Element content = Jsoup.connect(policy.getUrl()) .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36") .get(); //政策依据 policy.setBasis(content.select(".policy-txt.clear p").text()); //政策相关信息 policy.setInfo(content.select(".policy-con p").text()); //筛选政策连接地址 Elements select = content.select(".policy-txt.clear > a"); //政策标题 String policySourceTitle = select.text(); policy.getPolicySource().setTitle(policySourceTitle); //政策明细 String text = content.select(".part-warp.part-one.clear").text(); policy.setContent(text); String onclick = 
select.attr("onclick"); //连接地址 String policyUrlId = onclick.substring(onclick.indexOf("'") + 1, onclick.lastIndexOf("'")); policy.getPolicySource().setUrl(policyUrl + policyUrlId); return policy; } //抓取政策源文件 public Policy getPolicyBasisInfo(Policy policy) throws IOException { // 抓取政策源页面 Element policyBasis = Jsoup.connect(policy.getPolicySource().getUrl()) .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36") .get(); PolicySource policySource = policy.getPolicySource(); //政策源内容 policySource.setContent(String.valueOf(policyBasis.select(".view-content"))); return policy; } //抓取总页数 public int getSumPageNum() throws IOException { String policyListUrl = "http://www.zhsme.gov.cn/policy/getPolicyList?pageNum=1&NameOrWords=&areaSreachValue=&areaSreachId=&scaleSreachValue=&scaleSreachId=&levelSreachValue=&levelSreachId=&isShuangChuang="; Document document = Jsoup.connect(policyListUrl) .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36") .get(); String script = String.valueOf(document.select("script[type=text/javascript]")); String numText = script.substring(script.indexOf("totalPages"), script.indexOf("visiblePages")); String sumPageNum = numText.substring(numText.indexOf(":") + 1, numText.indexOf(",")); return Integer.valueOf(sumPageNum); } }

 
package com.nf147.ojp; import com.fasterxml.jackson.databind.ObjectMapper; import redis.clients.jedis.Jedis; import java.io.IOException; import java.util.List; public class JsoupReptile { public static void main(String[] args) throws IOException, InterruptedException { //redis Jedis jedis = new Jedis(); ObjectMapper mapper = new ObjectMapper(); PolicyDAO policyDAO = new PolicyDAO(); //定义开始抓取的页面 int nowPageNum = 1; //定义结束页面 (最大186) int maxPageNum = policyDAO.getSumPageNum(); while (nowPageNum <= maxPageNum) { try { List<Policy> listInfo = policyDAO.getListInfo(nowPageNum); for (int j = 1; j < 4; j++) { try { listInfo = policyDAO.getListInfo(nowPageNum); //抓取成功跳出循环 break; } catch (Exception e) { System.out.println("抓取第" + nowPageNum + "页列表出现问题..正在进行第" + j + "重试"); } } for (int i = 0; i < listInfo.size(); i++) { Policy policy = policy = listInfo.get(i); Policy policyInfo = null; for (int j = 1; j < 4; j++) { try { policyInfo = policyDAO.getPolicyInfo(policy); break; } catch (Exception e) { System.out.println("抓取详情" + policy.getUrl() + "时出现错误,正在进行第" + j + "次尝试"); } } for (int j = 1; j < 4; j++) { try { policyInfo = policyDAO.getPolicyBasisInfo(policyInfo); break; } catch (Exception e) { assert policyInfo != null; System.out.println("抓取源" + policyInfo.getPolicySource().getUrl() + "的时候出现错误,正在进行第" + j + "次尝试"); } } listInfo.set(i, policyInfo); } //写入redis jedis.set("list-" + nowPageNum, mapper.writeValueAsString(listInfo)); System.out.println("已爬取第" + nowPageNum + "页"); nowPageNum++; //休眠300毫秒 Thread.sleep(200); } catch (Exception e) { System.out.println("已达到抓取失败次数上限,跳过第" + nowPageNum + "页"); nowPageNum++; } } } }

 
package com.nf147.ojp; import com.fasterxml.jackson.databind.JavaType; import com.fasterxml.jackson.databind.ObjectMapper; import com.mchange.v2.c3p0.ComboPooledDataSource; import redis.clients.jedis.Jedis; import java.io.IOException; import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.SQLException; import java.util.ArrayList; import java.util.List; import java.util.Set; public class GetDataByRedis { public static void main(String[] args) throws IOException, InterruptedException, SQLException { // int[] list = {168,180,183,185}; // // Jedis jedis = new Jedis(); // ObjectMapper mapper = new ObjectMapper(); // // // PolicyDAO policyDAO = new PolicyDAO(); // // // // for (int i : list) { // try{ // List<Policy> listInfo = policyDAO.getListInfo(i); // // for (int j = 0; j < listInfo.size(); j++) { // Policy policy = listInfo.get(j); // Policy policyInfo = policyDAO.getPolicyInfo(policy); // Policy policyBasisInfo = policyDAO.getPolicyBasisInfo(policyInfo); // listInfo.set(j, policyBasisInfo); // } // // jedis.set("list-" + i, mapper.writeValueAsString(listInfo)); // // System.out.println("已爬取第" + i + "页"); // //休眠300毫秒 // Thread.sleep(300); // } catch (Exception e) { // System.out.println(e.getMessage()); // System.out.println("出错了第" + i + "页"); // } // // } Jedis jedis = new Jedis(); Set<String> keys = jedis.keys("list-*"); ObjectMapper mapper = new ObjectMapper(); List<Policy> list = null; ComboPooledDataSource source = new ComboPooledDataSource("mysql"); source.setJdbcUrl("jdbc:mariadb://localhost:3307/zqy"); source.setUser("root"); source.setPassword("123456"); Connection connection = source.getConnection(); int i = 0; long startTime = System.currentTimeMillis(); //关闭自动提交 connection.setAutoCommit(false); PreparedStatement prep = null; for (String key : keys) { try { String s = jedis.get(key); JavaType javaType = mapper.getTypeFactory().constructCollectionType(ArrayList.class, Policy.class); list = (List<Policy>) mapper.readValue(s, 
javaType); for (Policy policy : list) { prep = connection.prepareStatement("INSERT INTO `zqy`.`policy` (`title`, `url`, `content`, `basis`, `info`, `policy_source_url`, `policy_source_title`, `policy_source_content`) VALUES (?, ?, ?, ?, ?, ?, ?, ?)"); prep.setString(1, policy.getTitle()); prep.setString(2, policy.getUrl()); prep.setString(3, policy.getContent()); prep.setString(4, policy.getBasis()); prep.setString(5, policy.getInfo()); prep.setString(6, policy.getPolicySource().getUrl()); prep.setString(7, policy.getPolicySource().getTitle()); prep.setString(8, policy.getPolicySource().getContent()); prep.executeUpdate(); i++; } list = null; } catch (Exception e) { continue; } // System.out.println(key); } //提交事务 connection.commit(); prep.close(); connection.close(); long endTime = System.currentTimeMillis(); System.out.println("花费时间" + (endTime - startTime) + "毫秒"); System.out.println("共写入数据" + i + "条"); } }
数据库
 
-- Database and table that receive the crawled policies.
CREATE DATABASE zqy;
USE zqy;

CREATE TABLE policy (
    id                    INT PRIMARY KEY AUTO_INCREMENT,
    title                 VARCHAR(200),
    url                   VARCHAR(500),
    content               LONGTEXT,
    basis                 VARCHAR(500),
    info                  VARCHAR(500),
    policy_source_url     VARCHAR(500),
    policy_source_title   TEXT,
    policy_source_content LONGTEXT
);

-- Number of rows currently stored in the table.
SELECT COUNT(*) FROM policy;
然后……运行之后的结果请自行查看。
 
                    
                
 
 
                
            
         浙公网安备 33010602011771号
浙公网安备 33010602011771号