nodejs 爬虫简单实现
前两天用 Node.js 写了一个简单的定时爬虫,这里记录一下。
const path = require('path');
const request = require('request');
const cheerio = require('cheerio');
const schedule = require('node-schedule');
const fs = require('fs');

// Base URL of a course page; one id from `menu` is appended to it per run.
const url = 'http://www.imooc.com/learn/';
// Course ids the scheduled job samples from.
const menu = ['1297', '1298', '1299', '1300', '1301', '1302', '1303', '1304'];
// Value placed in the first field of the cron expression used below.
const rule = 30;
// Every record collected so far; it is never cleared, so it grows across runs.
const videoList = [];
// Number of records pushed into `videoList`.
let item = 0;

/**
 * Serialize `ob` as JSON into `<__dirname>/<dir>/icon.json`.
 *
 * The directory check and the write both use the same absolute path, so the
 * result no longer depends on the process working directory. An existing
 * icon.json is simply overwritten by `fs.writeFile`; no separate unlink is
 * issued, which removes the race between an in-flight unlink and the write.
 *
 * @param {*} ob - Value passed to `JSON.stringify`.
 * @param {string} dir - Directory name, resolved relative to this file.
 * @returns {Promise<void>} Resolves once the file is written, or after logging
 *   'read error' when `dir` exists but is not a directory. Rejects when `dir`
 *   cannot be stat'ed or the write fails.
 */
function fsWriteJson(ob, dir) {
  const targetDir = path.join(__dirname, dir);
  const targetFile = path.join(targetDir, 'icon.json');

  return new Promise(function (resolve, reject) {
    let stats;
    try {
      stats = fs.statSync(targetDir);
    } catch (err) {
      // Missing/inaccessible directory: report through the promise so the
      // caller decides what to do, instead of throwing out of the job.
      reject(err);
      return;
    }

    if (!stats.isDirectory()) {
      console.log('read error');
      resolve();
      return;
    }

    fs.writeFile(targetFile, JSON.stringify(ob), function (err) {
      if (err) {
        // Throwing inside this callback could not be caught by anyone.
        reject(err);
        return;
      }
      resolve();
    });
  });
}

/**
 * Fetch `mathUrl`, read every `.video li a` link in the returned HTML and
 * hand each record whose URL contains a video id to `callback`.
 *
 * @param {string} mathUrl - Page to download.
 * @param {function(Object): void} callback - Called once per accepted record
 *   with `{ title, url, name, duration, id }`.
 * @returns {Promise<Array>} Resolves with the module-level `videoList` after
 *   all links were processed; rejects when the request fails or the response
 *   cannot be processed.
 */
function AsyncRequest(mathUrl, callback) {
  return new Promise(function (resolve, reject) {
    request(mathUrl, function (err, res) {
      if (err) {
        // Without this guard `res` is undefined and the process would crash.
        reject(err);
        return;
      }

      try {
        const $ = cheerio.load(res.body.toString());

        $('.video li a').each(function () {
          const link = $(this);
          const title = asyncTitle(link);
          const text = asyncText(link);
          const time = asyncTime(text[1]);

          const record = {
            title: title[0],
            url: `http://www.imooc.com${link.attr('href')}`,
            name: text[0],
            // Links without a "(mm:ss)" part get a null duration.
            duration: time ? time[1] : null
          };

          // Keep only links whose URL carries a numeric video id.
          const idMatch = asyncId(record);
          if (Array.isArray(idMatch)) {
            record.id = idMatch[1];
            callback(record);
          }
        });

        resolve(videoList);
      } catch (parseErr) {
        reject(parseErr);
      }
    });
  });
}

/**
 * Scrape one randomly chosen course page and write the collected records to
 * `markdir/icon.json`.
 *
 * @param {string} url - Base URL; a random entry of `menu` is appended.
 * @param {function(Object): void} callback - Forwarded to `AsyncRequest`.
 * @returns {Promise<void>} Settles when the JSON file has been handled.
 */
async function iconTraiee(url, callback) {
  // Use the real list length so every entry of `menu` stays reachable when
  // the list is edited.
  const mathUrl = `${url}${menu[Math.floor(Math.random() * menu.length)]}`;
  const immocData = await AsyncRequest(mathUrl, callback);
  return fsWriteJson(immocData, 'markdir');
}

// Six-field cron expression; `rule` fills the first field.
schedule.scheduleJob(`${rule} * * * * *`, function () {
  iconTraiee(url, function (data) {
    videoList.push(data);
    item++;
    console.log(`第${item}条记录读取成功`);
  }).catch(function (error) {
    // A synchronous try/catch cannot observe a rejected promise, so failures
    // are handled here.
    console.log('error', error);
  });
});

/**
 * Lines of the trimmed text of the link's great-grandparent element.
 *
 * @param {Object} str - Cheerio selection for one link.
 * @returns {string[]} The trimmed text split on '\n'.
 */
function asyncTitle(str) {
  const title = str.parent().parent().parent().text().trim();
  return title.split('\n');
}

/**
 * Lines of the trimmed text of the link itself.
 *
 * @param {Object} str - Cheerio selection for one link.
 * @returns {string[]} The trimmed text split on '\n'.
 */
function asyncText(str) {
  const text = str.text().trim();
  return text.split('\n');
}

/**
 * Find a "(digits:digits)" group in `str`.
 *
 * @param {string} [str] - Text to search; may be undefined when the link text
 *   has a single line.
 * @returns {?Array} The match (index 1 holds "digits:digits"), or null when
 *   `str` is not a string or nothing matches.
 */
function asyncTime(str) {
  if (typeof str !== 'string') {
    return null;
  }
  return str.match(/\((\d+\:\d+)\)/);
}

/**
 * Find the numeric id following "video/" in the record's URL.
 *
 * @param {{url: string}} str - Record whose `url` is inspected.
 * @returns {?Array} The match (index 1 holds the id), or null.
 */
function asyncId(str) {
  return str.url.match(/video\/(\d+)/);
}
用 node server.js 运行,测试没有问题。