node cheerio爬虫
环境:node----v14.5.0
vscode----2019
依赖库
参考:老陈打码bilibili
const cheerio = require("cheerio")
const axios = require("axios")
const fs = require("fs")
const url = require("url")
const path = require("path")
// Entry URL: page 1 of the article list; getNum() fetches it to read the pagination bar.
const httpUrl = "https://www.doutula.com/article/list/?page=1";
// Promise-based delay helper built on setTimeout.
/**
 * Waits for the given number of milliseconds.
 * @param {number} milliseconds - Delay passed to setTimeout.
 * @returns {Promise<string>} Resolves with a status message that ends with the delay value.
 */
async function lcwait(milliseconds) {
    const message = await new Promise((done) => {
        setTimeout(() => {
            done("成功执行延迟函数,延迟" + milliseconds);
        }, milliseconds);
    });
    return message;
}
// Main entry point.
/**
 * Crawls every list page in order, pausing before each request.
 *
 * @param {number} [pageDelayMs=3000] - Pause before each list page, in milliseconds.
 * @returns {Promise<void>} Resolves after the last list page has been processed.
 */
async function spider(pageDelayMs = 3000) {
    // getNum() returns the page count as text scraped from the pagination bar.
    const allPageNum = Number.parseInt(await getNum(), 10);
    if (!Number.isInteger(allPageNum) || allPageNum < 1) {
        console.error("无法解析总页数,停止爬取");
        return;
    }
    for (let i = 1; i <= allPageNum; i++) {
        // Constant pause: each wait is awaited, so scaling it by `i` would make
        // the total run time grow quadratically with the page count.
        await lcwait(pageDelayMs);
        try {
            await getListPage(i);
        } catch (err) {
            // One failing page must not abort the crawl or surface as an unhandled rejection.
            console.error("第" + i + "页爬取失败", err);
        }
    }
}
// Get the total number of list pages.
/**
 * Fetches `httpUrl` and reads the page count from its pagination bar.
 *
 * @returns {Promise<string>} Text of the link inside the second-to-last
 *   `.pagination li` item (empty string when no such link is found).
 */
async function getNum() {
    // Declared locally: the original bare `res = ...` created an implicit global
    // and throws a ReferenceError in strict mode.
    const res = await axios.get(httpUrl);
    const $ = cheerio.load(res.data);
    const pageItems = $('.pagination li');
    // NOTE(review): presumably the last item is a "next" button, which is why the
    // second-to-last one is read as the highest page number — confirm against the site markup.
    return pageItems.eq(pageItems.length - 2).find('a').text();
}
// Create one folder per article on a list page, then crawl each article.
/**
 * Fetches one list page, creates `./img/<title>` for every article link on it
 * and hands each article to parsePage().
 *
 * @param {number} pageNum - 1-based list page number appended to the list URL.
 * @returns {Promise<void>} Resolves once every article on the page has been
 *   processed (failures of single articles are logged, not thrown).
 */
async function getListPage(pageNum) {
    const listUrl = "https://www.doutula.com/article/list/?page=" + pageNum;
    const res = await axios.get(listUrl);
    const $ = cheerio.load(res.data);
    // Work on a plain array so every async task can be awaited; async callbacks
    // passed to `.each` would be left as floating promises.
    const links = $('#home .col-sm-9>a').toArray();
    await Promise.all(links.map(async (element, i) => {
        const href = $(element).attr("href");
        const rawTitle = $(element).find(".random_title").text();
        // Folder name is the part of the title before its first digit; fall back
        // to the whole title when there is no digit or the prefix is empty.
        const match = /(.*?)\d/s.exec(rawTitle);
        const prefix = match ? match[1] : "";
        // The title is scraped text used as a path segment: replace characters
        // that are path separators or invalid in file names.
        const title = (prefix || rawTitle).replace(/[\\/:*?"<>|]/g, "_");
        if (!href || !title) {
            return;
        }
        const dir = './img/' + title;
        try {
            // `recursive` also creates ./img itself and does not fail when the
            // folder already exists; the result is undefined when nothing was created.
            const created = await fs.promises.mkdir(dir, { recursive: true });
            if (created !== undefined) {
                console.log("成功创建目录" + dir);
            }
            // Stagger the article requests that start concurrently for this page.
            await lcwait(50 * i);
            // Resolve against the list URL so relative hrefs work as well.
            await parsePage(new URL(href, listUrl).href, title);
        } catch (err) {
            console.error("处理文章失败 " + href, err);
        }
    }));
}
// Crawl one article page and download every image it contains.
/**
 * Fetches an article page and saves each `.pic-content img` image as
 * `./img/<title>/<title>-<index><ext>`.
 *
 * @param {string} pageUrl - URL of the article page.
 * @param {string} title - Folder name under ./img and file-name prefix.
 * @returns {Promise<void>} Resolves when every image has been written or has
 *   failed (failures of single images are logged, not thrown).
 */
async function parsePage(pageUrl, title) {
    // Pipes a readable stream into a file and settles when the file is fully written.
    function saveStream(source, filePath) {
        return new Promise((resolve, reject) => {
            const sink = fs.createWriteStream(filePath);
            sink.on('finish', resolve);
            sink.on('error', reject);
            source.on('error', (err) => {
                // pipe() does not close the destination when the source fails.
                sink.destroy();
                reject(err);
            });
            source.pipe(sink);
        });
    }

    const res = await axios.get(pageUrl);
    const $ = cheerio.load(res.data);
    const images = $('.pic-content img').toArray();
    await Promise.all(images.map(async (element, i) => {
        const src = $(element).attr('src');
        if (!src) {
            // No src attribute: nothing to download for this element.
            return;
        }
        try {
            const imgUrl = new URL(src, pageUrl);
            // Block-scoped on purpose: as a shared implicit global this value was
            // overwritten by the other concurrent downloads during the wait below.
            // Using the URL pathname keeps query strings out of the extension.
            const extName = path.extname(imgUrl.pathname);
            await lcwait(50);
            const imgPath = `./img/${title}/${title}-${i}${extName}`;
            // Request first and open the file only afterwards, so a failed
            // request does not leave an empty file behind.
            const response = await axios.get(imgUrl.href, { responseType: 'stream' });
            await saveStream(response.data, imgPath);
            console.log("图片加载完成" + imgPath);
        } catch (err) {
            console.error("图片下载失败 " + src, err);
        }
    }));
}
// Start the crawl; report a top-level failure instead of leaving an unhandled promise rejection.
spider().catch(function(err) {
    console.error("爬虫运行失败", err);
    process.exitCode = 1;
});

浙公网安备 33010602011771号