基于【 springBoot+jsoup】一 || 爬取全国行政区划数据
一、代码演示
如果爬取中途中断,可在代码中筛选过滤掉已拉取的省份数据,然后重新执行
/** * TODO * * @author kevin * @createTime 2019-11-18 19:37 */ @RestController public class CityController { @Autowired private ProvinceService provinceService; @Autowired private HttpUtil httpUtil; private String yearHref = ""; private int index; // {"provincetr", "citytr", "countytr", "towntr", "villagetr"}; @GetMapping("/start") public ResultTemplate<String> spider() throws Exception { String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/"; String charset = "gb2312"; Document rootDoc = httpUtil.get(url, charset); if (rootDoc == null) { return of("fail"); } Element firstElement = rootDoc.getElementsByClass("center_list_contlist").get(0); // http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html yearHref = firstElement.select("a").get(0).attr("href"); // 最近一个年份的省份链接 Document doc = httpUtil.get(yearHref, charset); // 遍历所有的省 Elements provinceElements = doc.getElementsByClass("provincetr"); for (Element element : provinceElements) { Elements aEles = element.select("a"); for (Element aEle : aEles) { String name = aEle.text(); // 11.html String provincesHref = aEle.attr("href"); String code = provincesHref.substring(0, provincesHref.indexOf(".")); index = yearHref.lastIndexOf("/") + 1; // http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/11.html provincesHref = yearHref.substring(0, index) + provincesHref; DicProvince province = new DicProvince() .setProvinceName(name) .setProvinceCode(code) .setCountryId(1196612453660643329L) .setCreateDate(LocalDateTime.now()) .setCreateUserid(1L) .setCreateUsername("admin"); if ("北京市".equals(name) || "天津市".equals(name) || "河北省".equals(name)) { System.out.println("未执行市:" + name); } else { System.out.println("开始时间:" + LocalDateTime.now()); System.out.println("省名称:" + name); Long id = provinceService.insertProvince(province); getCites(provincesHref, charset, id); } } } return of("spider crawl end."); } private void getCites(String url, String charset, Long provinceId) throws Exception { Document rootDoc = null; 
int i = 0; while (rootDoc == null) { try { i++; if (i >= 3) { System.out.println("循环次数:" + i); } rootDoc = httpUtil.get(url, charset); } catch (Exception e) { rootDoc = null; System.out.println("请求网页链接报错"); } } i = 0; if (rootDoc != null) { Elements cityElements = rootDoc.getElementsByClass("citytr"); for (Element cityElement : cityElements) { Element aEle = cityElement.select("a").get(1); // 第二个是市的名字 String name = aEle.text(); // 11/1101.html String cityHref = aEle.attr("href"); int start = cityHref.lastIndexOf("/") + 1; String code = cityHref.substring(start, cityHref.indexOf(".")); cityHref = yearHref.substring(0, index) + cityHref; DicCity city = new DicCity() .setCityName(name) .setCityCode(code) .setProvinceId(provinceId) .setCreateDate(LocalDateTime.now()) .setCreateUserid(1L) .setCreateUsername("admin"); Long id = provinceService.insertCity(city); //Long id=1L; getDistrict(cityHref, charset, id); } } } // 区县 private void getDistrict(String url, String charset, Long idDis) throws Exception { Document rootDoc = null; int i = 0; while (rootDoc == null) { try { i++; if (i >= 3) { System.out.println("循环次数:" + i); } rootDoc = httpUtil.get(url, charset); } catch (Exception e) { rootDoc = null; System.out.println("请求网页链接报错"); } } i = 0; if (rootDoc != null) { Elements cityElements = rootDoc.getElementsByClass("countytr"); for (Element cityElement : cityElements) { try { Element aEle = cityElement.select("a").get(1); String name = aEle.text(); String cityHref = aEle.attr("href"); int start = cityHref.lastIndexOf("/") + 1; String code = cityHref.substring(start, cityHref.indexOf(".")); int index = url.lastIndexOf("/") + 1; cityHref = url.substring(0, index) + cityHref; DicDistrict district = new DicDistrict().setDistrictName(name).setDistrictCode(code).setCityId(idDis) .setCreateDate(LocalDateTime.now()) .setCreateUserid(1L) .setCreateUsername("admin"); Long id = provinceService.insertDistrict(district); //Long id=1L; getStreet(cityHref, charset, id); } catch 
(Exception e) { System.out.println("市辖区"); Element aEle = cityElement.select("td").get(0); String code = aEle.text(); Element aEle2 = cityElement.select("td").get(1); String name = aEle2.text(); DicDistrict district = new DicDistrict().setDistrictName(name).setDistrictCode(code).setCityId(idDis); Long id = provinceService.insertDistrict(district); System.out.println("执行完毕"); } } } } // 街道 private void getStreet(String url, String charset, Long idStr) throws Exception { Document rootDoc = null; int i = 0; while (rootDoc == null) { try { i++; if (i >= 3) { System.out.println("循环次数:" + i); } rootDoc = httpUtil.get(url, charset); } catch (Exception e) { rootDoc = null; System.out.println("请求网页链接报错"); } } i = 0; if (rootDoc != null) { Elements cityElements = rootDoc.getElementsByClass("towntr"); for (Element cityElement : cityElements) { Element aEle = cityElement.select("a").get(1); // 第二个是市的名字 String name = aEle.text(); String cityHref = aEle.attr("href"); int start = cityHref.lastIndexOf("/") + 1; String code = cityHref.substring(start, cityHref.indexOf(".")); int index = url.lastIndexOf("/") + 1; cityHref = url.substring(0, index) + cityHref; DicStreet street = new DicStreet() .setStreetName(name) .setStreetCode(code) .setDistrictId(idStr) .setCreateDate(LocalDateTime.now()) .setCreateUserid(1L) .setCreateUsername("admin"); Long id = provinceService.insertStreet(street); //Long id=1L; getCommunity(cityHref, charset, id); } } } // 社区 private void getCommunity(String url, String charset, Long idPro) throws Exception { Document rootDoc = null; int i = 0; while (rootDoc == null) { try { i++; if (i >= 3) { System.out.println("循环次数:" + i); } rootDoc = httpUtil.get(url, charset); } catch (Exception e) { rootDoc = null; System.out.println("请求网页链接报错"); } } i = 0; if (rootDoc != null) { Elements cityElements = rootDoc.getElementsByClass("villagetr"); for (Element cityElement : cityElements) { Element aEle = cityElement.select("td").get(0); String code = aEle.text(); Element 
aEle2 = cityElement.select("td").get(1); String cl_code = aEle2.text(); Element aEle3 = cityElement.select("td").get(2); String name = aEle3.text(); DicCommunity community = new DicCommunity() .setCommunityName(name) .setCommunityCode(code) .setClassificationCode(cl_code) .setStreetId(idPro) .setCreateDate(LocalDateTime.now()) .setCreateUserid(1L) .setCreateUsername("admin"); Long id = provinceService.insertCommunity(community); } } } }
二、HttpUtil工具类
/** * TODO * * @author kevin * @createTime 2019-11-20 9:17 */ @Component public class HttpUtil { public Document get(String url, String charset) throws IOException { String userAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36"; URL url2 = new URL(url); HttpURLConnection connection = (HttpURLConnection)url2.openConnection(); connection.setRequestMethod("GET"); //是否允许缓存,默认true。 connection.setUseCaches(Boolean.FALSE); //设置请求头信息 connection.addRequestProperty("Connection", "close"); connection.addRequestProperty("user-agent", userAgent); //设置连接主机超时(单位:毫秒) connection.setConnectTimeout(80000); //设置从主机读取数据超时(单位:毫秒) connection.setReadTimeout(80000); //开始请求 try { Document doc = Jsoup.parse(connection.getInputStream(), charset, url); return doc; } catch (Exception e) { System.out.println("parse error: " + url); } return null; } }
三、service部分,根据需要自行定义数据库表
/**
 * Persists the crawled administrative divisions (province, city, district, street, community)
 * through the corresponding mappers.
 *
 * <p>Every insert is retried a bounded number of times. When all attempts fail, an
 * {@link IllegalStateException} is thrown instead of retrying forever, so a permanent failure
 * (for example a constraint violation) surfaces to the caller.
 *
 * @author kevin
 * @createTime 2019-11-18 20:41
 */
@Service
public class ProvinceServiceImpl implements ProvinceService {

    /** Maximum number of insert attempts per record. */
    private static final int MAX_INSERT_ATTEMPTS = 3;

    @Autowired
    private ProvinceMapper provinceMapper;

    @Autowired
    private CityMapper cityMapper;

    @Autowired
    private DistrictMapper districtMapper;

    @Autowired
    private StreetMapper streetMapper;

    @Autowired
    private CommunityMapper communityMapper;

    /**
     * Inserts a province record.
     *
     * @param dicProvince record to insert
     * @return the record's id as read from the entity after the insert (presumably populated by
     *         the mapper; confirm against the mapper configuration)
     * @throws IllegalStateException if the insert did not succeed within the allowed attempts
     */
    @Override
    public Long insertProvince(DicProvince dicProvince) {
        insertWithRetry("插入省数据失败", () -> provinceMapper.insert(dicProvince));
        return dicProvince.getProvinceId();
    }

    /**
     * Inserts a city record.
     *
     * @param dicCity record to insert
     * @return the record's id as read from the entity after the insert
     * @throws IllegalStateException if the insert did not succeed within the allowed attempts
     */
    @Override
    public Long insertCity(DicCity dicCity) {
        insertWithRetry("插入市数据失败", () -> cityMapper.insert(dicCity));
        return dicCity.getCityId();
    }

    /**
     * Inserts a district/county record.
     *
     * @param dicDistrict record to insert
     * @return the record's id as read from the entity after the insert
     * @throws IllegalStateException if the insert did not succeed within the allowed attempts
     */
    @Override
    public Long insertDistrict(DicDistrict dicDistrict) {
        insertWithRetry("插入区县数据失败", () -> districtMapper.insert(dicDistrict));
        return dicDistrict.getDistrictId();
    }

    /**
     * Inserts a street/town record.
     *
     * @param dicStreet record to insert
     * @return the record's id as read from the entity after the insert
     * @throws IllegalStateException if the insert did not succeed within the allowed attempts
     */
    @Override
    public Long insertStreet(DicStreet dicStreet) {
        insertWithRetry("插入街道数据失败", () -> streetMapper.insert(dicStreet));
        return dicStreet.getStreetId();
    }

    /**
     * Inserts a community/village record.
     *
     * @param dicCommunity record to insert
     * @return the record's id as read from the entity after the insert
     * @throws IllegalStateException if the insert did not succeed within the allowed attempts
     */
    @Override
    public Long insertCommunity(DicCommunity dicCommunity) {
        insertWithRetry("插入社区数据失败", () -> communityMapper.insert(dicCommunity));
        return dicCommunity.getCommunityId();
    }

    /** A single insert attempt that reports the number of affected rows. */
    @FunctionalInterface
    private interface InsertCall {
        int execute() throws Exception;
    }

    /**
     * Runs an insert until it reports exactly one affected row, giving up after
     * {@link #MAX_INSERT_ATTEMPTS} attempts.
     *
     * @param failureMessage message printed on every failed attempt and used in the final
     *                       exception
     * @param insert         the insert to execute
     * @throws IllegalStateException if no attempt reported one affected row; the last caught
     *                               exception, if any, is attached as the cause
     */
    private static void insertWithRetry(String failureMessage, InsertCall insert) {
        Exception lastFailure = null;
        for (int attempt = 1; attempt <= MAX_INSERT_ATTEMPTS; attempt++) {
            try {
                if (insert.execute() == 1) {
                    return;
                }
            } catch (Exception e) {
                lastFailure = e;
                System.out.println(failureMessage);
                e.printStackTrace();
            }
        }
        throw new IllegalStateException(
                failureMessage + " (gave up after " + MAX_INSERT_ATTEMPTS + " attempts)", lastFailure);
    }
}

浙公网安备 33010602011771号