jsoup爬取国家统计局全国省市区数据
项目中经常用到全国省市区的数据表,但是这些数据又会经常变动,每次都需要重新找最新的数据,很麻烦。特此记录一下如何用jsoup爬取国家统计局的数据。

1.引入jar包
<!-- jsoup: HTML fetching/parsing library used by the crawler in step 3.
     1.7.3 is a very old release that predates several published security fixes;
     1.15.3 still provides every API call used in this post
     (Jsoup.connect, userAgent, timeout, ignoreContentType, get, getElementsByClass). -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.15.3</version>
</dependency>
2.创建数据表,生成Haha实体类(注意:下文示例代码中使用的实体类名为Area,请与实际生成的类名保持一致)
-- Administrative-division table filled by the crawler in step 3: one row per
-- province, city, county/district or town, linked into a tree via parent_id.
-- NOTE: DROP TABLE discards any previously crawled data before re-creating the table.
DROP TABLE IF EXISTS `haha`;
CREATE TABLE `haha` (
  `id` int NOT NULL AUTO_INCREMENT,
  -- 12-digit statistical division code; nullable because a province whose name
  -- is missing from the crawler's lookup list is saved without a code
  `code` bigint DEFAULT NULL,
  `name` varchar(255) NOT NULL,
  -- id of the parent row; 0 is the value used for top-level (province) rows
  `parent_id` int NOT NULL DEFAULT 0,
  PRIMARY KEY (`id`),
  -- children are looked up by parent, so index the tree edge
  KEY `haha_parent_id_idx` (`parent_id`)
  -- no AUTO_INCREMENT seed: a freshly crawled table should start numbering at 1;
  -- utf8mb4 instead of the deprecated 3-byte `utf8` alias
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
3.实现接口
// Lookup list of province-level divisions: name -> 12-digit division code.
// The crawler matches scraped province names against "name" to obtain "code".
// Beijing and Tianjin use the province-level pattern XX0000000000 like every other
// entry (previously they carried XX0100000000, the code the crawl stores for each
// city's first child row, so it was duplicated).
String provinceJson = "[{\"code\":110000000000,\"id\":1,\"name\":\"北京市\",\"parentId\":0}," +
        "{\"code\":120000000000,\"id\":2,\"name\":\"天津市\",\"parentId\":0}," +
        "{\"code\":130000000000,\"id\":3,\"name\":\"河北省\",\"parentId\":0}," +
        "{\"code\":140000000000,\"id\":4,\"name\":\"山西省\",\"parentId\":0}," +
        "{\"code\":150000000000,\"id\":5,\"name\":\"内蒙古自治区\",\"parentId\":0}," +
        "{\"code\":210000000000,\"id\":6,\"name\":\"辽宁省\",\"parentId\":0}," +
        "{\"code\":220000000000,\"id\":7,\"name\":\"吉林省\",\"parentId\":0}," +
        "{\"code\":230000000000,\"id\":8,\"name\":\"黑龙江省\",\"parentId\":0}," +
        "{\"code\":310000000000,\"id\":9,\"name\":\"上海市\",\"parentId\":0}," +
        "{\"code\":320000000000,\"id\":10,\"name\":\"江苏省\",\"parentId\":0}," +
        "{\"code\":330000000000,\"id\":11,\"name\":\"浙江省\",\"parentId\":0}," +
        "{\"code\":340000000000,\"id\":12,\"name\":\"安徽省\",\"parentId\":0}," +
        "{\"code\":350000000000,\"id\":13,\"name\":\"福建省\",\"parentId\":0}," +
        "{\"code\":360000000000,\"id\":14,\"name\":\"江西省\",\"parentId\":0}," +
        "{\"code\":370000000000,\"id\":15,\"name\":\"山东省\",\"parentId\":0}," +
        "{\"code\":410000000000,\"id\":16,\"name\":\"河南省\",\"parentId\":0}," +
        "{\"code\":420000000000,\"id\":17,\"name\":\"湖北省\",\"parentId\":0}," +
        "{\"code\":430000000000,\"id\":18,\"name\":\"湖南省\",\"parentId\":0}," +
        "{\"code\":440000000000,\"id\":19,\"name\":\"广东省\",\"parentId\":0}," +
        "{\"code\":450000000000,\"id\":20,\"name\":\"广西壮族自治区\",\"parentId\":0}," +
        "{\"code\":460000000000,\"id\":21,\"name\":\"海南省\",\"parentId\":0}," +
        "{\"code\":500000000000,\"id\":22,\"name\":\"重庆市\",\"parentId\":0}," +
        "{\"code\":510000000000,\"id\":23,\"name\":\"四川省\",\"parentId\":0}," +
        "{\"code\":520000000000,\"id\":24,\"name\":\"贵州省\",\"parentId\":0}," +
        "{\"code\":530000000000,\"id\":25,\"name\":\"云南省\",\"parentId\":0}," +
        "{\"code\":540000000000,\"id\":26,\"name\":\"西藏自治区\",\"parentId\":0}," +
        "{\"code\":610000000000,\"id\":27,\"name\":\"陕西省\",\"parentId\":0}," +
        "{\"code\":620000000000,\"id\":28,\"name\":\"甘肃省\",\"parentId\":0}," +
        "{\"code\":630000000000,\"id\":29,\"name\":\"青海省\",\"parentId\":0}," +
        "{\"code\":640000000000,\"id\":30,\"name\":\"宁夏回族自治区\",\"parentId\":0}," +
        "{\"code\":650000000000,\"id\":31,\"name\":\"新疆维吾尔自治区\",\"parentId\":0}," +
        "{\"code\":710000000000,\"id\":32,\"name\":\"台湾\",\"parentId\":0}," +
        "{\"code\":810000000000,\"id\":33,\"name\":\"香港特别行政区\",\"parentId\":0}," +
        "{\"code\":820000000000,\"id\":34,\"name\":\"澳门特别行政区\",\"parentId\":0}]\n";
// Parsed once up front; only "name" and "code" are read by the crawl loop.
JSONArray array = JSONArray.parseArray(provinceJson);
// Crawl the division-code pages three levels deep and persist every row as an Area:
//   index page  -> provinces ("provincetr" rows)
//   province page -> cities  ("citytr" rows)
//   city page   -> counties ("countytr" rows) and towns ("towntr" rows)
//
// Root of the division-code pages. The edition year appears exactly once so that
// every page of a crawl comes from the same edition (the index used to be read
// from 2020 while the province/city pages were read from 2019).
String baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/";
// Request settings shared by every page fetch.
String userAgent = "Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)";
int timeoutMillis = 6000;

// Fetch the index page with a GET request.
String url = baseUrl + "index.html";
Connection connect = Jsoup.connect(url);
Document document = connect.userAgent(userAgent).timeout(timeoutMillis).ignoreContentType(true).get();

// Each "provincetr" row holds several cells, one province link per cell.
ListIterator<Element> elements = document.getElementsByClass("provincetr").listIterator();
while (elements.hasNext()) {
    ListIterator<Element> tds = elements.next().children().listIterator();
    while (tds.hasNext()) {
        Element td = tds.next();
        // A cell without a child element (e.g. an empty padding cell) would make
        // child(0) throw, so skip it.
        if (td.children().isEmpty()) {
            continue;
        }
        Element element = td.child(0);
        String provinceName = element.text();

        Area province = new Area();
        province.setName(provinceName);
        // Province pages do not list the code, so take it from the lookup list;
        // a name that is not in the list leaves the code unset.
        for (int i = 0; i < array.size(); i++) {
            JSONObject json = array.getJSONObject(i);
            if (provinceName.equals(json.getString("name"))) {
                province.setCode(json.getLong("code"));
                break;
            }
        }
        province.save();

        // Without a link there is no province page to descend into.
        String provinceHref = element.attr("href");
        if (provinceHref.isEmpty()) {
            continue;
        }
        Document provinceDoc = Jsoup.connect(baseUrl + provinceHref)
                .userAgent(userAgent).timeout(timeoutMillis).ignoreContentType(true).get();

        ListIterator<Element> citys = provinceDoc.getElementsByClass("citytr").listIterator();
        while (citys.hasNext()) {
            // In a city row the first link carries the code; any later link carries
            // the name and the address of the city's own page.
            ListIterator<Element> as = citys.next().getElementsByTag("a").listIterator();
            Area city = new Area();
            String cityHref = null;
            boolean cityCodeRead = false;
            while (as.hasNext()) {
                Element c = as.next();
                if (!cityCodeRead) {
                    cityCodeRead = true;
                    city.setCode(Long.parseLong(c.text().trim()));
                } else {
                    cityHref = c.attr("href");
                    city.setName(c.text().trim());
                }
            }
            // A row without the code/name link pair has neither a name nor a page to
            // follow; skip it instead of re-fetching an address left over from the
            // previous row.
            if (cityHref == null || cityHref.isEmpty()) {
                continue;
            }
            city.setParentId(province.getId());
            city.save();

            Document cityDoc = Jsoup.connect(baseUrl + cityHref)
                    .userAgent(userAgent).timeout(timeoutMillis).ignoreContentType(true).get();

            // County rows and town rows share the same layout (first cell = code,
            // later cells = name) and both hang directly under the city, so one loop
            // handles both; counties are stored before towns, as before.
            for (String rowClass : new String[]{"countytr", "towntr"}) {
                ListIterator<Element> rows = cityDoc.getElementsByClass(rowClass).listIterator();
                while (rows.hasNext()) {
                    ListIterator<Element> cells = rows.next().getElementsByTag("td").listIterator();
                    Area child = new Area();
                    boolean childCodeRead = false;
                    while (cells.hasNext()) {
                        Element cell = cells.next();
                        if (!childCodeRead) {
                            childCodeRead = true;
                            child.setCode(Long.parseLong(cell.text().trim()));
                        } else {
                            child.setName(cell.text().trim());
                        }
                    }
                    child.setParentId(city.getId());
                    child.save();
                }
            }
        }
    }
}
4.请求接口
http://localhost/demo
我就惯着你们。。。(全国省市区sql文件,更新时间2019年10月31日)
链接: https://pan.baidu.com/s/1X2PDK4WL4dMB2UVcdU-_Mw
提取码: atzp

浙公网安备 33010602011771号