Node.js 爬虫
const cheerio = require('cheerio');
const superagent = require('superagent');
const request = require('request');
var fs = require('fs');
var _data = require('./public/data/frequency.json');
var result = require('./public/data/frequency-result.json');
// Module-level crawl state shared by the functions below.

// Prefix that each search term is appended to; empty in this file.
const url = '';
// Delay in milliseconds before the next request; re-rolled on every request.
let random = 1000;
// Position of the next search term to request.
let index = 0;
// Accumulated results, one list per category key.
const _obj = { jia: [], yi: [], bing: [] };
/**
 * Pulls the hit count out of a result-summary string.
 *
 * Returns the text between the first '共' and the next '个', trimmed
 * (for example '共 123 个' yields '123'). Slicing between the two markers
 * keeps working when other text precedes '共'.
 *
 * @param {string} summary - Text of the summary paragraph.
 * @returns {string} The count as text, or '' when either marker is missing.
 */
function extractCount(summary) {
  const start = summary.indexOf('共');
  const end = summary.indexOf('个', start + 1);
  if (start === -1 || end === -1) {
    return '';
  }
  return summary.slice(start + 1, end).trim();
}

/**
 * Requests the page for one search term, stores the count it reports in
 * `_obj.bing` as `{ [search]: count }`, writes `_obj` to
 * public/data/frequency-result.json, and then schedules the request for the
 * next term of `_data.bing` after a randomized delay.
 *
 * A failed request (error reported, or no response object) is logged and
 * skipped so the remaining terms are still processed.
 *
 * Reads and updates the module-level `timeout`, `random` and `index`.
 *
 * @param {string} search - Term that is URI-encoded and appended to `url`.
 */
var getData = function (search) {
  clearTimeout(timeout);
  // Re-roll the pause before the next request: a multiple of 1000 ms, at most 5000 ms.
  random = Math.ceil(Math.random() * 5) * 1000;

  // Moves on to the next term; never runs past the end of the term list
  // (and never past the first 500 terms).
  const scheduleNext = function () {
    ++index;
    const limit = Math.min(500, _data.bing.length);
    if (index < limit) {
      timeout = setTimeout(function () {
        getData(_data.bing[index]);
      }, random);
    }
  };

  superagent.get(url + encodeURI(search))
    .set('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8')
    .set('Accept-Encoding', 'gzip, deflate')
    .set('Accept-Language', 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7')
    .set('Cache-Control', 'max-age=0')
    .set('Connection', 'keep-alive')
    .set('Cookie', '')
    .set('Host', '')
    .set('Referer', '')
    .set('Upgrade-Insecure-Requests', '1')
    .set('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36')
    .end(function (err, res) {
      // Without a usable response there is nothing to parse; skip this term
      // instead of dereferencing an undefined `res`.
      if (err || !res) {
        console.log('err', index, search, err ? err.message : 'empty response');
        scheduleNext();
        return;
      }

      const $ = cheerio.load(res.text);
      // The second <p> under .panel-heading carries the result summary.
      const summary = $('.panel-heading p').eq(1).text();
      console.log(index, search, summary);

      const entry = {};
      entry[search] = extractCount(summary.trim());
      _obj.bing.push(entry);

      // Persist after every term so progress survives an interrupted run.
      fs.writeFile('public/data/frequency-result.json', JSON.stringify(_obj), function (writeErr) {
        if (writeErr) throw writeErr;
        console.log('It\'s saved!');
      });

      scheduleNext();
    });
};
// Start the crawl: after the initial delay, request the term at the current
// index. The handle is kept in `timeout` so getData can clear and replace it.
var timeout = setTimeout(() => {
  getData(_data.bing[index]);
}, random);
var processExcel = function(){
var xlsx=require('node-xlsx');
var _path = "./public/data/frequency.xlsx";
var obj = xlsx.parse(_path);
var path = require('path');
function filterData(data){
console.log(data[0][1]);
var _obj = {};
_obj.jia = [];
_obj.yi = [];
_obj.bing = [];
for(var i=1; i<data.length; i++){
if(data[i][1]){
_obj.jia.push(data[i][1])
}
if(data[i][5]){
_obj.yi.push(data[i][5])
}
if(data[i][9]){
_obj.bing.push(data[i][9])
}
}
console.log(_obj.jia.length, _obj.yi.length, _obj.bing.length);
fs.writeFile('public/data/frequency.json', JSON.stringify(_obj), function (err) {
if (err) throw err;
console.log('It\'s saved!');
});
}
filterData(obj[0].data);
}
// processExcel();
以上是一个简单的爬虫：使用 superagent 获取 HTML，用 cheerio 解析 HTML，然后以类似 jQuery 的方式选取元素。
浙公网安备 33010602011771号