【Java】爬取澳门区划信息
官网地址:
https://macaostreets.iam.gov.mo/zh_mo/freguesiaindex.html
大区部分是在页面展示的

点击大区后并没有发出任何网络请求,说明街道数据已经直接写在页面的 JS 代码里了

在页面脚本中找到了展示街道的方法(见下面这一段):
它根据传入的大区 id,匹配到保存该大区街道数据的 JS 变量
function showStreets(freguesia){
var freguesiaStreets;
switch(freguesia){
case "fatima":
freguesiaStreets = fatimaStreets;
break;
case "lourenco":
freguesiaStreets = lourencoStreets;
break;
case "lazaro":
freguesiaStreets = lazaroStreets;
break;
case "carno":
freguesiaStreets = carnoStreets;
break;
case "se":
freguesiaStreets = seStreets;
break;
case "antonio":
freguesiaStreets = antionioStreets;
break;
case "xavier":
freguesiaStreets = xavierStreets;
break;
}
可以发现,这一段是js代码实现的,我可以通过Jsoup解析文档获取js的源代码部分
但是要怎么在Java读取JS变量呢?
意外发现 JDK 8 自带 JS 解析引擎 Nashorn,可以通过 javax.script API 调用(注意:Nashorn 自 JDK 15 起已被移除)
但是有使用限制:只支持 ES5 的语法,不支持 ES6+ 的新语法,也无法直接运行依赖浏览器环境的第三方库
import javax.script.ScriptEngine; import javax.script.ScriptEngineManager;
网上的资料不多, 要边写边看才知道
/**
 * Minimal Nashorn demo: evaluates a JavaScript object literal and prints the value of
 * each of its properties from Java.
 *
 * @param args unused
 * @throws IllegalStateException if the running JVM does not provide the Nashorn engine
 */
@SneakyThrows
public static void main(String[] args) {
    ScriptEngine engine = new ScriptEngineManager().getEngineByName("Nashorn");
    if (engine == null) {
        /* getEngineByName returns null instead of throwing when no engine matches. */
        throw new IllegalStateException(
                "Nashorn script engine not found; it is bundled with JDK 8 through 14 only");
    }
    engine.eval("var obj = { a: 100, b: true, c: 'hello js-engine'}");
    ScriptObjectMirror jsObject = (ScriptObjectMirror) engine.get("obj");
    /*
     * getOwnKeys(false) returns only the object's own enumerable keys; passing true would
     * additionally return its own non-enumerable keys. Prototype properties are never included.
     */
    for (String ownKey : jsObject.getOwnKeys(false)) {
        /* Cast each value to its concrete type as needed; nested objects and arrays
           are read by casting them to ScriptObjectMirror again. */
        System.out.println(jsObject.get(ownKey));
    }
}
db.setting数据源配置文件:
## db.setting文件
url = jdbc:mysql://localhost:3308/my-info?serverTimezone=Asia/Shanghai
user = root
pass = 123456

## 可选配置
# 是否在日志中显示执行的SQL
showSql = true
# 是否格式化显示的SQL
formatSql = false
# 是否显示SQL参数
showParams = true
# 打印SQL的日志等级,默认debug,可以是info、warn、error
sqlLevel = debug

#----------------------------------------------------------------------------------------------------------------
## 连接池配置项
#————————————————
#版权声明:本文为CSDN博主「soulCoke」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
#原文链接:https://blog.csdn.net/qq_36328170/article/details/105687633

## ---------------------------------------------------- Druid
# 初始化时建立物理连接的个数。初始化发生在显式调用init方法,或者第一次getConnection时
initialSize = 1
# 最大连接池数量
maxActive = 8
# 最小连接池数量
minIdle = 0
# 获取连接时最大等待时间,单位毫秒。配置了maxWait之后, 缺省启用公平锁,并发效率会有所下降, 如果需要可以通过配置useUnfairLock属性为true使用非公平锁。
maxWait = 0
# 是否缓存preparedStatement,也就是PSCache。 PSCache对支持游标的数据库性能提升巨大,比如说oracle。 在mysql5.5以下的版本中没有PSCache功能,建议关闭掉。作者在5.5版本中使用PSCache,通过监控界面发现PSCache有缓存命中率记录, 应该是支持PSCache。
poolPreparedStatements = false
# 要启用PSCache,必须配置大于0,当大于0时, poolPreparedStatements自动触发修改为true。 在Druid中,不会存在Oracle下PSCache占用内存过多的问题, 可以把这个数值配置大一些,比如说100
maxOpenPreparedStatements = -1
# 用来检测连接是否有效的sql,要求是一个查询语句。 如果validationQuery为null,testOnBorrow、testOnReturn、 testWhileIdle都不会起作用。
validationQuery = SELECT 1
# 申请连接时执行validationQuery检测连接是否有效,做了这个配置会降低性能。
testOnBorrow = true
# 归还连接时执行validationQuery检测连接是否有效,做了这个配置会降低性能
testOnReturn = false
# 建议配置为true,不影响性能,并且保证安全性。 申请连接的时候检测,如果空闲时间大于 timeBetweenEvictionRunsMillis,执行validationQuery检测连接是否有效。
testWhileIdle = false
# 有两个含义: 1) Destroy线程会检测连接的间隔时间 2) testWhileIdle的判断依据,详细看testWhileIdle属性的说明
timeBetweenEvictionRunsMillis = 60000
# 物理连接初始化的时候执行的sql
connectionInitSqls = SELECT 1
# 属性类型是字符串,通过别名的方式配置扩展插件, 常用的插件有: 监控统计用的filter:stat 日志用的filter:log4j 防御sql注入的filter:wall
# filters = stat
# 类型是List<com.alibaba.druid.filter.Filter>, 如果同时配置了filters和proxyFilters, 是组合关系,并非替换关系
# proxyFilters =
工具类代码:
package cn.cloud9.chinese.mazu;
import cn.hutool.core.date.DateUtil;
import cn.hutool.db.Db;
import cn.hutool.db.Entity;
import cn.hutool.http.HttpResponse;
import cn.hutool.http.HttpUtil;
import jdk.nashorn.api.scripting.ScriptObjectMirror;
import lombok.Builder;
import lombok.Data;
import lombok.SneakyThrows;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import java.time.LocalDateTime;
import java.util.Collections;
import java.util.Date;
import java.util.LinkedHashMap;
import java.util.Map;
/**
 * Crawls the Macao street directory: reads the districts listed on the index page, reads
 * each district's streets from the JavaScript variables embedded in that page, downloads
 * every street's description and picture, and stores all of it in a freshly created table.
 */
public class MaZuRegionUtil {

    /** Hutool handle for the default data source. */
    private static final Db db = Db.use();

    /** Back-quoted name of the table created by {@link #initialTableSpace()}; empty until then. */
    private static String currentTableName = "";

    /* Index page */
    private static final String INDEX_PAGE = "https://macaostreets.iam.gov.mo/zh_mo/freguesiaindex.html";

    /* Root address of the official site */
    private static final String ROOT_PATH = "https://macaostreets.iam.gov.mo";

    /** Position, among the index page's script tags, of the script that declares the street variables. */
    private static final int STREET_SCRIPT_INDEX = 9;

    /**
     * Gap between two consecutive district ids. Street ids are {@code districtId + 1, + 2, ...},
     * so a district with this many streets or more would run into the next district's id.
     */
    private static final int SECTOR_ID_STEP = 100;

    /** One row of the region table: either a district (level 1) or a street (level 2). */
    @Data
    @Builder
    public static final class MaZuRegionPO {
        private Integer id;
        private Integer parentId;
        private String name;
        private Integer level;
        private String link;
        private String fullPath;
        private String description;
        private byte[] image;
        private LocalDateTime genTime;
    }

    /* Maps the id of a district element on the page to the JS variable holding its streets */
    private static final Map<String, String> REGION_MAP = buildRegionMap();

    /** Builds the read-only district-id to JS-variable-name lookup, keeping page order. */
    private static Map<String, String> buildRegionMap() {
        Map<String, String> map = new LinkedHashMap<>();
        map.put("fatima", "fatimaStreets");
        map.put("lourenco", "lourencoStreets");
        map.put("lazaro", "lazaroStreets");
        map.put("carno", "carnoStreets");
        map.put("se", "seStreets");
        /* "antionio" is how the page's own script spells this variable. */
        map.put("antonio", "antionioStreets");
        map.put("xavier", "xavierStreets");
        return Collections.unmodifiableMap(map);
    }

    /**
     * Inserts one region row into the table created by {@link #initialTableSpace()}.
     *
     * @param po the row to store; its {@code genTime} is used when set, otherwise the current time
     * @throws IllegalStateException if no table has been created yet
     */
    @SneakyThrows
    private static void writeToDB(MaZuRegionPO po) {
        if (currentTableName.isEmpty()) {
            throw new IllegalStateException("initialTableSpace() must be called before writing rows");
        }
        LocalDateTime genTime = po.getGenTime() != null ? po.getGenTime() : LocalDateTime.now();
        db.insert(
                Entity.create(currentTableName)
                        .set("id", po.getId())
                        .set("parent_id", po.getParentId())
                        .set("full_path", po.getFullPath())
                        .set("name", po.getName())
                        .set("link", po.getLink())
                        .set("level", po.getLevel())
                        .set("description", po.getDescription())
                        .set("image", po.getImage())
                        .set("gen_time", genTime)
        );
    }

    /**
     * Creates a new region table whose name carries the current timestamp and makes it the
     * target of all subsequent inserts.
     */
    @SneakyThrows
    public static void initialTableSpace() {
        String format = "`macao-region-" + DateUtil.format(new Date(), "yyyyMMddHHmmss") + "`";
        /* NOTE(review): `level` is declared varchar although MaZuRegionPO.level is an Integer. */
        final String SQL =
                "CREATE TABLE IF NOT EXISTS " + format + " (" +
                "  `id` int NOT NULL AUTO_INCREMENT COMMENT '主键'," +
                "  `parent_id` int NOT NULL COMMENT '上级id'," +
                "  `name` varchar(64) COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '名称'," +
                "  `link` varchar(192) COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '链接'," +
                "  `full_path` varchar(128) COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '完整路径'," +
                "  `level` varchar(12) COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '层级'," +
                "  `description` varchar(2048) COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '描述'," +
                "  `image` MEDIUMBLOB DEFAULT NULL COMMENT '图片'," +
                "  `gen_time` datetime DEFAULT NULL COMMENT '记录创建时间'," +
                "  PRIMARY KEY (`id`)" +
                ") ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci COMMENT='澳门区域表';";
        /* The statement has no placeholders, so no parameter array is passed. */
        db.execute(SQL);
        currentTableName = format;
    }

    /**
     * Evaluates the variable declarations of an inline script in a new Nashorn engine.
     *
     * @param targetScript source text of the script tag
     * @return the engine holding the evaluated variables
     * @throws IllegalStateException if the running JVM does not provide the Nashorn engine
     */
    @SneakyThrows
    private static ScriptEngine readJavaScriptData(String targetScript) {
        /* Keep only the leading variable declarations: everything before the first "function".
           When the script contains no function at all, the whole text is evaluated. */
        int functionStart = targetScript.indexOf("function");
        String declarations = functionStart >= 0 ? targetScript.substring(0, functionStart) : targetScript;

        ScriptEngine engine = new ScriptEngineManager().getEngineByName("Nashorn");
        if (engine == null) {
            /* getEngineByName returns null instead of throwing when no engine matches. */
            throw new IllegalStateException(
                    "Nashorn script engine not found; it is bundled with JDK 8 through 14 only");
        }
        engine.eval(declarations);
        return engine;
    }

    /**
     * Runs the whole crawl: creates the table, then stores every district of the index page
     * followed by that district's streets.
     *
     * @throws IllegalStateException if the index page has fewer script tags than expected
     */
    public static void goRead() {
        initialTableSpace();
        Document document = Jsoup.parse(HttpUtil.get(INDEX_PAGE));

        /* Evaluate the JS variables first; the streets of every district are read from them. */
        Elements scriptList = document.getElementsByTag("script");
        if (scriptList.size() <= STREET_SCRIPT_INDEX) {
            throw new IllegalStateException("Expected at least " + (STREET_SCRIPT_INDEX + 1)
                    + " script tags on " + INDEX_PAGE + " but found " + scriptList.size());
        }
        ScriptEngine scriptEngine = readJavaScriptData(scriptList.get(STREET_SCRIPT_INDEX).html());

        /* Each child of the district section that carries an id is one district. */
        int sectorId = SECTOR_ID_STEP;
        for (Element eachSector : document.select("#FreguesiaSectionText > [id]")) {
            MaZuRegionPO sector = MaZuRegionPO.builder()
                    .id(sectorId)
                    .name(eachSector.ownText())
                    .fullPath(eachSector.ownText())
                    .level(1)
                    .link(INDEX_PAGE)
                    .parentId(0)
                    .build();
            writeToDB(sector);
            writeStreets(scriptEngine, eachSector.id(), sector);
            sectorId += SECTOR_ID_STEP;
        }
    }

    /** Stores every street of one district; districts without street data are reported and skipped. */
    private static void writeStreets(ScriptEngine scriptEngine, String sectorKey, MaZuRegionPO sector) {
        String jsDataKey = REGION_MAP.get(sectorKey);
        Object streetData = jsDataKey == null ? null : scriptEngine.get(jsDataKey);
        if (!(streetData instanceof ScriptObjectMirror)) {
            System.err.println("No street data found for district element '" + sectorKey + "', skipping its streets");
            return;
        }

        ScriptObjectMirror streetsList = (ScriptObjectMirror) streetData;
        int offset = 1;
        for (String key : streetsList.getOwnKeys(false)) {
            ScriptObjectMirror jsObject = (ScriptObjectMirror) streetsList.get(key);
            String name = jsObject.get("name").toString();
            /* The JS data holds site-relative links. */
            String link = ROOT_PATH + jsObject.get("link").toString();

            Document detailDoc = Jsoup.parse(HttpUtil.get(link));
            /* Description and image columns are nullable, so a detail page lacking
               either one still yields a row instead of aborting the crawl. */
            Element descElement = detailDoc.select(".SpotInfo > .SpotInfoText").first();
            String description = descElement == null ? null : descElement.html();

            MaZuRegionPO street = MaZuRegionPO.builder()
                    .id(sector.getId() + offset)
                    .parentId(sector.getId())
                    .name(name)
                    .fullPath(sector.getFullPath() + " -> " + name)
                    .level(2)
                    .description(description)
                    .image(downloadImage(detailDoc, name))
                    .link(link)
                    .build();
            writeToDB(street);
            offset += 1;
        }
    }

    /**
     * Downloads the picture linked by the anchor whose {@code data-caption} equals the street name.
     * Returns {@code null} when the page has no such anchor or the download does not succeed.
     */
    private static byte[] downloadImage(Document detailDoc, String caption) {
        Element anchor = findAnchorByCaption(detailDoc, caption);
        if (anchor == null) {
            return null;
        }
        String href = ROOT_PATH + anchor.attr("href");
        try (HttpResponse response = HttpUtil.createGet(href).execute()) {
            /* Only a 2xx body is an image; an error page must not be stored as one. */
            return response.isOk() ? response.bodyBytes() : null;
        }
    }

    /**
     * Finds the first anchor whose {@code data-caption} matches the caption. The comparison is
     * done in Java so that quotes inside a street name cannot break a CSS selector.
     */
    private static Element findAnchorByCaption(Document detailDoc, String caption) {
        String wanted = caption.trim();
        for (Element anchor : detailDoc.select("a[data-caption]")) {
            if (wanted.equalsIgnoreCase(anchor.attr("data-caption").trim())) {
                return anchor;
            }
        }
        return null;
    }

    /**
     * Entry point: runs the crawl once.
     *
     * @param args unused
     */
    @SneakyThrows
    public static void main(String[] args) {
        goRead();
    }
}

浙公网安备 33010602011771号