node cheerio爬虫

环境:node----v14.5.0

           vscode----2019

           依赖库

参考:老陈打码(bilibili)

const cheerio = require("cheerio")
const axios = require("axios")
const fs = require("fs")
const url = require("url")
const path = require("path")

// First page of the article list. getNum() fetches this URL to read the
// pagination bar; getListPage() builds its own per-page URL in a local
// variable of the same name.
let httpUrl = "https://www.doutula.com/article/list/?page=1"

// Promise wrapper around setTimeout so callers can `await` a pause.
// Resolves (never rejects) with a status string once `milliseconds` have
// elapsed.
async function lcwait(milliseconds) {
    const timer = new Promise((done) => {
        // Build the message when the timer fires, then settle the promise.
        const finish = () => done("成功执行延迟函数,延迟" + milliseconds);
        setTimeout(finish, milliseconds);
    });
    return timer;
}
// Main entry point: reads the total page count from getNum(), then crawls
// the list pages one after another via getListPage().
//
// Rejects only if getNum() rejects; a failure on an individual list page is
// logged and the crawl continues with the next page.
async function spider() {
    // Fixed pause between consecutive list pages. The loop is sequential, so
    // scaling the pause by the page index would make the total waiting time
    // grow quadratically with the number of pages.
    const pageDelayMs = 3000;

    // getNum() returns the page count as text scraped from the page.
    const allPageNum = Number.parseInt(await getNum(), 10);
    if (Number.isNaN(allPageNum)) {
        console.error("无法解析总页数,爬虫终止");
        return;
    }

    for (let i = 1; i <= allPageNum; i++) {
        await lcwait(pageDelayMs);
        try {
            // Awaited so a rejection is handled here instead of becoming an
            // unhandled promise rejection.
            await getListPage(i);
        } catch (err) {
            console.error("列表页抓取失败: page=" + i, err);
        }
    }
}
// Fetch the first list page (httpUrl) and read the page count from its
// pagination bar.
//
// Returns the text of the link inside the second-to-last `.pagination li`
// element, as a string. Presumably the last item is a "next page" button and
// the one before it holds the highest page number -- confirm against the
// live markup. The string is empty when no such element exists.
// Rejects if the HTTP request fails.
async function getNum() {
    // Declared locally: a bare `res = ...` assignment would create an
    // implicit global (and throw a ReferenceError in strict mode).
    const res = await axios.get(httpUrl);
    const $ = cheerio.load(res.data);
    const pageItems = $(".pagination li");
    return pageItems.eq(pageItems.length - 2).find("a").text();
}
// Crawl one list page: for every gallery link on the page, create the
// gallery's directory under ./img/ and download its images via parsePage().
//
// pageNum - page number appended to the list URL.
//
// Resolves once every gallery on the page has been processed. A failure in
// a single gallery (directory creation or parsePage) is logged and does not
// affect the others. Rejects only if the list page itself cannot be fetched.
async function getListPage(pageNum) {
    const httpUrl = "https://www.doutula.com/article/list/?page=" + pageNum;
    const res = await axios.get(httpUrl);
    const $ = cheerio.load(res.data);

    // Collect the links first and map over them: an async callback passed to
    // `.each` returns promises that nobody awaits, so errors would surface as
    // unhandled rejections.
    const links = $('#home .col-sm-9>a').toArray();

    const tasks = links.map(async (element, i) => {
        const pageUrl = $(element).attr("href");
        if (!pageUrl) {
            return;
        }

        // Directory name is the part of the title before its first digit;
        // when the title contains no digit the whole title is used instead
        // of crashing on a null match.
        const rawTitle = $(element).find(".random_title").text();
        const match = /(.*?)\d/is.exec(rawTitle);
        // The title comes from a remote page and becomes a path segment, so
        // characters that are path separators or invalid in file names are
        // replaced.
        const title = (match ? match[1] : rawTitle).replace(/[\\/:*?"<>|]/g, "_");
        const dir = './img/' + title;

        try {
            // `recursive` also creates ./img when missing and does not fail
            // when the directory already exists; the promise fulfills with
            // the first directory created, or undefined if nothing was.
            const created = await fs.promises.mkdir(dir, { recursive: true });
            if (created !== undefined) {
                console.log("成功创建目录" + dir);
            }
            // Stagger the galleries so they do not all start at once.
            await lcwait(50 * i);
            await parsePage(pageUrl, title);
        } catch (err) {
            console.error("图集抓取失败: " + pageUrl, err);
        }
    });

    await Promise.all(tasks);
}
// Download every image of one gallery page into ./img/<title>/.
//
// pageUrl - absolute URL of the gallery page.
// title   - gallery name; used as the directory name and file-name prefix.
//           The directory ./img/<title> is expected to exist already.
//
// Resolves once all downloads have finished or failed; a failed image is
// logged and skipped. Rejects only if the gallery page cannot be fetched.
async function parsePage(pageUrl, title) {
    const res = await axios.get(pageUrl);
    const $ = cheerio.load(res.data);
    const images = $('.pic-content img').toArray();

    const downloads = images.map(async (element, i) => {
        const src = $(element).attr('src');
        if (!src) {
            return;
        }

        try {
            // Resolve relative sources against the page, and take the
            // extension from the path only so a query string is not included.
            const imgUrl = new URL(src, pageUrl);
            // Must be a local: a shared variable written before the `await`
            // below would hold the last image's extension by the time each
            // callback resumes.
            const extName = path.extname(imgUrl.pathname);
            const imgPath = `./img/${title}/${title}-${i}${extName}`;

            await lcwait(50);
            const imgRes = await axios.get(imgUrl.href, { responseType: 'stream' });

            // Open the file only after the request succeeded, so a failed
            // request does not leave an empty file behind.
            await new Promise((resolve, reject) => {
                const ws = fs.createWriteStream(imgPath);
                imgRes.data.on('error', (err) => {
                    ws.destroy();
                    reject(err);
                });
                ws.on('error', (err) => {
                    imgRes.data.destroy();
                    reject(err);
                });
                // 'finish' fires once all data has been flushed to the file.
                ws.on('finish', resolve);
                imgRes.data.pipe(ws);
            });

            console.log("图片加载完成" + imgPath);
        } catch (err) {
            console.error("图片下载失败: " + src, err);
        }
    });

    await Promise.all(downloads);
}
// Start the crawl. A rejection from spider() is reported and reflected in
// the exit code instead of being left as an unhandled rejection.
spider().catch(function(err) {
    console.error("爬虫运行失败:", err);
    process.exitCode = 1;
});
posted @ 2021-01-20 16:50  abcdefgab  阅读(113)  评论(0)    收藏  举报