nodejs 爬虫简单实现

前两天用 Node.js 写了个简单的定时爬虫,这里记录一下。

const path = require('path')
const request = require('request')
const cheerio = require('cheerio')
const schedule = require('node-schedule')
const fs = require('fs')
// Base URL of the course pages; a course id taken from `menu` is appended to it.
const url = 'http://www.imooc.com/learn/'
// Course ids to crawl; one is picked at random on every run.
const menu = ['1297', '1298', '1299', '1300', '1301', '1302', '1303', '1304']
// NOTE(review): not referenced anywhere in the visible code — confirm whether it is still needed.
const rule = 30
// Every scraped video record is appended here; the list is never cleared between runs.
const videoList = []

// Running count of records read so far; only used for the progress log.
let item = 0

/**
 * Serialises `ob` as JSON into `<__dirname>/<dir>/icon.json`.
 *
 * Any existing icon.json is replaced. If `<__dirname>/<dir>` exists but is not
 * a directory, 'read error' is logged and nothing is written.
 *
 * @param {*} ob - Value to serialise with JSON.stringify.
 * @param {string} dir - Directory name, resolved relative to this script.
 * @returns {Promise<void>} Settles once the file is on disk; rejects when the
 *   directory is missing or the write fails.
 */
async function fsWriteJson(ob, dir) {
  // Resolve once against __dirname so the stat and the write target the same
  // location regardless of the process working directory.
  const targetDir = path.join(__dirname, dir)
  const stats = fs.statSync(targetDir)
  if (!stats.isDirectory()) {
    console.log('read error')
    return
  }
  // writeFile replaces an existing file, so no prior unlink is needed; an
  // asynchronous unlink could even remove the freshly written file.
  await fs.promises.writeFile(path.join(targetDir, 'icon.json'), JSON.stringify(ob))
}

/**
 * Fetches a course page and reports every video link found on it.
 *
 * For each `.video li a` element whose href contains `video/<digits>`, an
 * item `{ title, url, name, duration, id }` is built and passed to `callback`.
 *
 * @param {string} mathUrl - Page URL to fetch.
 * @param {function(Object): void} callback - Invoked once per video item.
 * @returns {Promise<Array>} Resolves with the shared `videoList` after the
 *   page has been processed; rejects when the request fails, the response has
 *   no body, or parsing throws.
 */
function AsyncRequest(mathUrl, callback) {
  return new Promise(function (resolve, reject) {
    request(mathUrl, function (err, res) {
      if (err) {
        reject(err)
        return
      }
      if (!res || res.body == null) {
        reject(new Error(`empty response from ${mathUrl}`))
        return
      }
      // Runs inside an async callback: without this try/catch a parse error
      // would be uncaught and the promise would never settle.
      try {
        const $ = cheerio.load(res.body.toString())
        $('.video li a').each(function () {
          const link = $(this)
          const videoUrl = 'http://www.imooc.com' + link.attr('href')
          // Filter first so links that are not videos are skipped before their
          // text is parsed for a duration.
          const idMatch = asyncId({ url: videoUrl })
          if (!Array.isArray(idMatch)) {
            return
          }
          const title = asyncTitle(link)
          const text = asyncText(link)
          const time = asyncTime(text[1])
          callback({
            title: title[0],
            url: videoUrl,
            name: text[0],
            duration: time[1],
            id: idMatch[1]
          })
        })
        resolve(videoList)
      } catch (parseError) {
        reject(parseError)
      }
    })
  })
}

/**
 * Crawls one randomly chosen course page and writes the collected records
 * to the `markdir` directory.
 *
 * @param {string} url - Base URL; a course id from `menu` is appended.
 * @param {function(Object): void} callback - Forwarded to AsyncRequest and
 *   invoked once per scraped video item.
 * @returns {Promise<*>} Resolves with whatever fsWriteJson resolves with.
 */
async function iconTraiee(url, callback) {
  // Bound the random index by the list itself so changing `menu` can never
  // produce an out-of-range (undefined) course id.
  const courseId = menu[Math.floor(Math.random() * menu.length)]
  const mathUrl = `${url}${courseId}`
  const immocData = await AsyncRequest(mathUrl, callback)
  const immocWrite = await fsWriteJson(immocData, 'markdir')

  return immocWrite
}

// Six-field cron expression: fires at second 30 of every minute.
schedule.scheduleJob('30 * * * * *', async function () {
  try {
    // Await the crawl so a rejection lands in the catch below instead of
    // becoming an unhandled promise rejection.
    await iconTraiee(url, function (data) {
      videoList.push(data)
      item++
      console.log(`第${item}条记录读取成功`)
    })
  } catch (error) {
    console.log('error', error)
  }
});

/**
 * Reads the text of the element three levels above the given link and
 * returns it split into lines.
 *
 * @param {Object} str - Wrapped link element exposing parent() and text().
 * @returns {string[]} Trimmed ancestor text split on '\n'.
 */
function asyncTitle(str) {
    const ancestor = str.parent().parent().parent()
    const lines = ancestor.text().trim().split('\n')
    return lines
}
/**
 * Returns the link's own text, trimmed and split into lines.
 *
 * @param {Object} str - Wrapped link element exposing text().
 * @returns {string[]} Trimmed text split on '\n'.
 */
function asyncText(str) {
    const lines = str.text().trim().split('\n')
    return lines
}
/**
 * Extracts a parenthesised `digits:digits` duration, e.g. "(03:25)".
 *
 * @param {string} str - Text that may contain the duration.
 * @returns {?Array} The match (index 1 holds the duration without
 *   parentheses), or null when there is no match or `str` is not a string.
 */
function asyncTime(str) {
    // Link text with a single line yields `undefined` here; report "no match"
    // instead of throwing a TypeError.
    if (typeof str !== 'string') {
        return null
    }
    return str.match(/\((\d+:\d+)\)/)
}
/**
 * Extracts the numeric video id from an item's URL.
 *
 * @param {{url: string}} str - Item whose `url` is searched for `video/<digits>`.
 * @returns {?Array} The match (index 1 holds the id), or null when the URL
 *   contains no video id.
 */
function asyncId(str) {
    const address = str.url
    const found = address.match(/video\/(\d+)/)
    return found
}

执行 `node server.js` 运行,测试没有问题。

posted @ 2021-05-26 09:21  叫我汤先森  阅读(203)  评论(0)    收藏  举报