java+phantomjs实现动态网页抓取
1.PhantomJS下载地址:http://phantomjs.org/download.html
2.java代码
/**
 * Runs the PhantomJS script against {@code url} and stores everything the
 * script writes to standard output in the {@code HTML} field.
 *
 * <p>{@code HTML} is reset to the empty string first, so it stays empty when
 * the URL is missing, the process cannot be started, or reading fails.
 * Output lines are joined without any separator.
 *
 * @param url address of the page to fetch; passed to the script as its
 *            single command-line argument
 */
public void getHtml(String url)
{
    HTML = "";
    String jsPath = "C:\\phantomjs\\examples\\myjs.js";
    String exePath = "C:\\phantomjs\\bin\\phantomjs.exe";
    System.out.println(jsPath);
    System.out.println(exePath);

    // Nothing sensible can be fetched without an address.
    if (url == null || url.trim().isEmpty()) {
        return;
    }

    // Pass the arguments as a list: Runtime.exec(String) splits the command
    // on whitespace, which breaks URLs (or paths) that contain spaces.
    ProcessBuilder builder = new ProcessBuilder(exePath, jsPath, url);
    // Let the child's stderr go to this process's stderr so an unread pipe
    // can never fill up and block the child.
    builder.redirectError(ProcessBuilder.Redirect.INHERIT);

    Process p = null;
    try {
        p = builder.start();
        StringBuilder collected = new StringBuilder();
        // PhantomJS writes UTF-8 by default; decode explicitly instead of
        // relying on the platform default charset.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(p.getInputStream(), "UTF-8"))) {
            String line;
            while ((line = br.readLine()) != null) {
                collected.append(line);
            }
        }
        HTML = collected.toString();
        // stdout reached EOF; wait for the child so it is properly reaped.
        p.waitFor();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (InterruptedException e) {
        // Preserve the interrupt for callers further up the stack.
        Thread.currentThread().interrupt();
    } finally {
        if (p != null) {
            // No-op if the process already exited; otherwise avoids leaving
            // a stray phantomjs.exe behind after an error.
            p.destroy();
        }
    }
}
3.js代码(保存为 C:\phantomjs\examples\myjs.js,即上面 Java 代码中的 jsPath)
// PhantomJS script: loads the URL given as the first command-line argument
// and prints the resulting page HTML to standard output.
//
// Usage: phantomjs <this-script> <url>
//
// NOTE: PhantomJS runs ES5 only, so `var`, function expressions and
// `indexOf` are used on purpose instead of ES2015+ equivalents.
var page = require('webpage').create();
var system = require('system');

// Do not download images; this speeds up page loading.
page.settings.loadImages = false;
// Give up on any single resource after 10 seconds.
page.settings.resourceTimeout = 10000;
// Viewport size only matters when rendering screenshots; it has no effect on
// the HTML dump below.
page.viewportSize = { width: 1280, height: 800 };

// Requests whose URL contains any of these substrings are aborted to save
// time (e.g. slow third-party ad resources).
var blockedUrlParts = ['baidu.com'];

page.onResourceRequested = function (requestData, request) {
  for (var i = 0; i < blockedUrlParts.length; i++) {
    if (requestData.url.indexOf(blockedUrlParts[i]) !== -1) {
      request.abort();
      return;
    }
  }
};

var address = system.args[1];

if (!address) {
  // Without a URL there is nothing to open; exit instead of hanging.
  console.log('Usage: phantomjs ' + system.args[0] + ' <url>');
  phantom.exit(1);
} else {
  page.open(address, function (status) {
    if (status !== 'success') {
      console.log('FAIL to load the address');
      // Non-zero exit status lets the caller detect the failure.
      phantom.exit(1);
      return;
    }
    console.log(page.content);
    // The content has already been written, so exit exactly once, right away.
    phantom.exit();
  });
}
浙公网安备 33010602011771号