无头浏览器-puppeteer
Puppeteer
Puppeteer 是一个 Node 库,提供了一组用来操纵 Chrome 的 API(默认以 headless 模式运行,即没有界面的 Chrome,也可以配置为带界面运行)
有点类似于PhantomJS,但Puppeteer是Chrome官方团队进行维护的,前景更好。
使用Puppeteer,相当于同时具有Linux和Chrome的能力,应用场景会非常多。就爬虫领域来说,远比一般的爬虫工具功能更丰富,性能分析、自动化测试也不在话下。
简单用法
// 依赖
const puppeteer = require('puppeteer')
const request = require('request')
const { createWriteStream } = require('fs')
/**
 * Searches Baidu Images for `val`, then visits every result link and saves
 * the picture shown on each detail page via `downloadImg`.
 *
 * @param {string} val - Search keyword typed into the search box.
 * @returns {Promise<void>} Resolves once every result has been processed and
 *   the browser is closed; rejects with the first error raised by any step.
 */
async function run(val) {
  const browser = await puppeteer.launch({
    headless: false
  })
  console.log('打开浏览器')
  // Close the browser on every exit path so a failed step does not leave a
  // Chrome process running.
  try {
    const page = await browser.newPage()
    console.log('新建页面')
    await page.goto('http://image.baidu.com/')
    console.log('进入一个网站')
    await page.waitForSelector('html')
    console.log('等待元素加载')
    await page.type('#kw', val)
    console.log('输入成功')
    await page.click('#homeSearchForm > span.s_search')
    console.log('点击搜索')
    await page.waitForSelector('.imgbox > a')
    console.log('等待加载')
    const urls = await page.$$eval('.imgbox > a', as => as.map(a => a.href))
    console.log('1')
    // Detail pages are visited one at a time because they share a single tab.
    for (const [index, url] of urls.entries()) {
      await page.goto(url)
      console.log('进入图片页面')
      await downloadImg(page, index)
    }
  } finally {
    await browser.close()
  }
}
/**
 * Derives a file extension (including the leading dot) from an image URL.
 *
 * Only the path part of the URL is inspected, so query strings and fragments
 * cannot leak into the file name, and extensions of any length up to five
 * characters (".jpeg", ".webp") are kept whole.
 *
 * @param {string} src - Image URL.
 * @returns {string} The extension found in the path, or ".jpg" when the path
 *   has none (a fallback guess; the real image type is not checked).
 */
function getImageExtension(src) {
  let pathname
  try {
    pathname = new URL(src).pathname
  } catch (err) {
    // Not an absolute URL: drop the query string / fragment by hand.
    pathname = src.split(/[?#]/)[0]
  }
  const match = /\.([A-Za-z0-9]{1,5})$/.exec(pathname)
  return match ? `.${match[1]}` : '.jpg'
}

/**
 * Saves the image displayed on the current detail page to
 * `./images/<index><extension>`.
 *
 * @param {object} page - Puppeteer page already navigated to a detail page.
 * @param {number} index - Position of the image in the result list; used as
 *   the file name.
 * @returns {Promise<void>} Resolves when `download` has finished.
 */
async function downloadImg(page, index) {
  await page.waitForSelector('.currentImg')
  console.log('页面加载完毕')
  const src = await page.evaluate(function () {
    const img = document.querySelector('.currentImg')
    return img.src
  })
  console.log(`图片链接:${src}`)
  await download(src, `./images/${index}${getImageExtension(src)}`)
  console.log('下载完成')
}
/**
 * Streams the resource at `path` into the local file `name`.
 *
 * @param {string} path - URL of the resource to fetch.
 * @param {string} name - Destination file path; its directory must already
 *   exist.
 * @returns {Promise<void>} Resolves once the file has been fully written;
 *   rejects if either the HTTP request or the file write fails.
 */
function download (path, name) {
  return new Promise((resolve, reject) => {
    const ws = createWriteStream(name)
    ws.on('finish', () => resolve())
    ws.on('error', reject)
    const source = request({
      url: path,
      headers: {
        // NOTE(review): this value is a Referrer-Policy token, not a URL —
        // confirm the intended Referer header with the site's requirements.
        'Referer': 'no-referrer-when-downgrade',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
      }
    })
    // pipe() does not forward source errors to the destination, so without
    // this handler a network failure would leave the promise pending forever.
    source.on('error', (err) => {
      ws.destroy()
      reject(err)
    })
    source.pipe(ws)
  })
}
// Entry point: crawl images for the keyword "壁纸" (wallpaper). Failures are
// logged and reflected in the exit code rather than left as an unhandled
// promise rejection.
run('壁纸').catch((err) => {
  console.error(err)
  process.exitCode = 1
})

浙公网安备 33010602011771号