nodejs 爬虫简单实现
前两天用 Node.js 写了一个简单的定时爬虫,这里记录一下实现代码:
const path = require('path')
const request = require('request')
const cheerio = require('cheerio')
const schedule = require('node-schedule')
const fs = require('fs')
// Base URL of a course page; a course id taken from `menu` is appended to it.
const url = 'http://www.imooc.com/learn/'
// Course ids that can be scraped; one of them is picked at random per run.
const menu = ['1297', '1298', '1299', '1300', '1301', '1302', '1303', '1304']
// NOTE(review): not referenced anywhere in the visible source — confirm whether it is still needed.
const rule = 30
// Module-level accumulator of scraped video records; it is never cleared, so it grows across runs.
const videoList = []
// Running count of records collected so far; only used in the progress log message.
let item = 0
/**
 * Serializes `ob` as JSON into `<__dirname>/<dir>/icon.json`, replacing any
 * previous file.
 *
 * The directory check and the write both use the same `__dirname`-based path
 * (previously the check was relative to `__dirname` while the write was
 * relative to the process working directory). No separate unlink is done:
 * `fs.writeFile` already truncates an existing file, and an asynchronous
 * unlink could otherwise remove the file right after it had been written.
 *
 * @param {*} ob - Any JSON-serializable value.
 * @param {string} dir - Directory, relative to this script, that receives `icon.json`.
 * @returns {Promise<void>} Resolves once the file is written (or immediately,
 *   after logging, when `dir` is not a directory); rejects on write failure.
 * @throws {Error} Synchronously, when `dir` does not exist (from `fs.statSync`).
 */
function fsWriteJson(ob, dir) {
  const targetDir = path.join(__dirname, dir);

  if (!fs.statSync(targetDir).isDirectory()) {
    console.log('read error');
    return Promise.resolve();
  }

  const targetFile = path.join(targetDir, 'icon.json');

  // Adapt the callback API to a promise so callers can actually await the write
  // and receive its error, instead of the error being thrown from a callback.
  return new Promise(function (resolve, reject) {
    fs.writeFile(targetFile, JSON.stringify(ob), function (err) {
      if (err) {
        reject(err);
        return;
      }
      resolve();
    });
  });
}
/**
 * Downloads a course page and extracts its video entries.
 *
 * Each `.video li a` link is turned into a record
 * `{ title, url, name, duration }`; when the record's url contains
 * `video/<digits>` the digits are stored as `id` and the record is passed to
 * `callback`. Links without such an id are skipped.
 *
 * @param {string} mathUrl - URL of the page to download.
 * @param {function(Object): void} callback - Invoked once per accepted record.
 * @returns {Promise<Array>} Resolves with the module-level `videoList` after
 *   the page has been processed; rejects when the request fails, the response
 *   has no body, or parsing/`callback` throws.
 */
function AsyncRequest(mathUrl, callback) {
  return new Promise(function (resolve, reject) {
    request(mathUrl, function (err, res) {
      // On a failed request `res` is undefined; reject instead of dereferencing it,
      // otherwise the promise would never settle.
      if (err) {
        reject(err);
        return;
      }
      if (!res || res.body == null) {
        reject(new Error(`empty response from ${mathUrl}`));
        return;
      }

      try {
        const $ = cheerio.load(res.body.toString());

        $('.video li a').each(function () {
          const link = $(this);
          const title = asyncTitle(link);
          const text = asyncText(link);
          // The second text line is where the "(mm:ss)" duration is looked up;
          // links without it get a null duration instead of crashing the run.
          const time = typeof text[1] === 'string' ? asyncTime(text[1]) : null;
          const item = {
            title: title[0],
            url: 'http://www.imooc.com' + link.attr('href'),
            name: text[0],
            duration: time ? time[1] : null
          };
          const s = asyncId(item);
          // Only links that point at a video page carry an id.
          if (Array.isArray(s)) {
            item.id = s[1];
            callback(item);
          }
        });

        resolve(videoList);
      } catch (parseError) {
        // Exceptions thrown inside the request callback cannot reach the caller
        // unless they are routed through the promise.
        reject(parseError);
      }
    });
  });
}
/**
 * Runs one scrape: picks a random course id from `menu`, fetches that course
 * page through `AsyncRequest`, then hands the collected data to `fsWriteJson`
 * with the `markdir` directory.
 *
 * @param {string} url - Base URL the course id is appended to.
 * @param {function(Object): void} callback - Forwarded to `AsyncRequest`; called per record.
 * @returns {Promise<*>} Resolves with whatever `fsWriteJson` yields.
 */
async function iconTraiee(url, callback) {
  // Derive the range from the list itself so that changing `menu` can never
  // produce an out-of-range index (and a ".../undefined" URL).
  const index = Math.floor(Math.random() * menu.length);
  const mathUrl = `${url}${menu[index]}`;

  const immocData = await AsyncRequest(mathUrl, callback);
  const immocWrite = await fsWriteJson(immocData, 'markdir');
  return immocWrite;
}
// Cron spec with a seconds field: fires when the seconds value is 30, i.e. once per minute.
// The handler is async and awaits the scrape so that rejections from
// `iconTraiee` land in the catch block; a plain try/catch around the
// un-awaited call would only see synchronous throws.
schedule.scheduleJob('30 * * * * *', async function () {
  try {
    await iconTraiee(url, function (data) {
      // Accumulate the record and report progress.
      videoList.push(data);
      item++;
      console.log(`第${item}条记录读取成功`);
    });
  } catch (error) {
    console.log('error', error);
  }
});
/**
 * Reads the text of the element three levels above the given node and splits
 * it into lines.
 *
 * @param {Object} str - Node exposing chainable `parent()` and `text()`.
 * @returns {string[]} The trimmed ancestor text, split on '\n'.
 */
function asyncTitle(str) {
  const ancestor = str.parent().parent().parent();
  const trimmed = ancestor.text().trim();
  return trimmed.split('\n');
}
/**
 * Splits the node's own trimmed text into lines.
 *
 * @param {Object} str - Node exposing `text()`.
 * @returns {string[]} The trimmed text, split on '\n'.
 */
function asyncText(str) {
  const lines = str.text().trim().split('\n');
  return lines;
}
/**
 * Looks for a parenthesized "digits:digits" duration such as "(03:25)".
 *
 * @param {string} str - Text to search.
 * @returns {?Array} The regex match (index 1 holds the duration without the
 *   parentheses), or null when there is no match or `str` is not a string.
 */
function asyncTime(str) {
  // Callers pass a line that may be missing; report "no match" rather than throwing.
  if (typeof str !== 'string') {
    return null;
  }
  return str.match(/\((\d+\:\d+)\)/)
}
/**
 * Extracts the numeric video id from a record's `url` property.
 *
 * @param {{url: string}} str - Record whose `url` is searched for `video/<digits>`.
 * @returns {?Array} The regex match (index 1 holds the digits), or null when absent.
 */
function asyncId(str) {
  const { url: link } = str;
  const found = link.match(/video\/(\d+)/);
  return found;
}
将以上代码保存为 server.js,执行 `node server.js` 运行,测试没有问题。

浙公网安备 33010602011771号