var axios = require("axios")
var cheerio = require("cheerio")
var xlsx = require('node-xlsx');
var fs = require('fs');
// Pool of browser User-Agent strings. The crawl callbacks send one entry per
// request and step through the pool via `userAgentFlag`, wrapping at the end.
const userAgentPool = [
  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
  'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
  'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
  'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
  'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
  'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
  'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
  'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
  'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
  'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
  'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)',
  'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
  'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
];
// Index of the userAgentPool entry to send with the next request; the crawl
// callbacks reset it to 0 once it reaches the end of the pool.
let userAgentFlag = 0;

// Links collected from the listing pages; getItem later requests each of them.
const urls = [];

// Shared cursor. It starts as the listing page number (page 1); getDatas
// resets it to 0 and getItem then uses it while walking `urls`.
let num = 1;

// Workbook handed to xlsx.build: a single sheet whose first row is the header
// (sequence number column left untitled, then three text columns).
const data = [
  {
    name: 'sheet1',
    data: [['', '疾病描述', '疾病', '病历概要']],
  },
];
// Entry point of the script: announce the listing crawl, then let getUrls run
// once per second. `si1` holds the handle of whichever timer is currently
// active so the callbacks can stop it with clearInterval.
console.log('开始爬取病症链接');
let si1 = setInterval(getUrls, 1000);
// Number of listing-page requests that were sent and have not settled yet.
let pendingListRequests = 0;

/**
 * Timer callback for the listing phase: requests one listing page per tick
 * (pages `num`..100) and appends every link found on it to `urls`.
 *
 * Once page 100 has been requested, the callback keeps ticking until every
 * in-flight request has settled, then stops the timer and starts the detail
 * phase through getDatas(). Waiting matters because `urls` would otherwise be
 * handed over while late listing responses are still adding links to it.
 *
 * A failed page is logged and skipped; it never aborts the crawl.
 */
function getUrls() {
  if (userAgentFlag >= userAgentPool.length) userAgentFlag = 0;
  if (num > 100) {
    // Every page has been requested: wait for the outstanding responses.
    if (pendingListRequests > 0) return;
    clearInterval(si1);
    console.log('病症链接爬取完成,爬取数量:' + urls.length);
    getDatas();
    return;
  }
  // Capture the page number now. `num` keeps advancing while this request is
  // in flight, so reading it inside the response handler reports the wrong page.
  var page = num++;
  pendingListRequests++;
  axios.get("https://zixun.haodf.com/dispatched/45001000.htm?p=" + page,
    {
      headers: {
        'User-Agent': userAgentPool[userAgentFlag]
      },
      // Without a timeout a stalled request would keep the hand-over to
      // getDatas() waiting forever.
      timeout: 30000
    }).then(resp => {
      var $ = cheerio.load(resp.data);
      var lis = $('.clearfix li');
      for (var i = 0; i < lis.length; i++) {
        var href = lis.eq(i).find(".fl a").attr("href");
        // List items without a link are ignored.
        if (href) {
          urls.push(href);
        }
      }
      console.log('已爬取第', page, '页', '总爬取数量:', 'urls:', urls.length, '该页末位链接:', urls[urls.length - 1]);
    }).catch(err => {
      // Network and parsing failures end up here instead of becoming an
      // unhandled promise rejection.
      console.error('第', page, '页爬取失败:', err && err.message);
    }).then(() => {
      // Runs after success and after failure alike.
      pendingListRequests--;
    });
  userAgentFlag++;
}
/**
 * Switches the crawl from the listing phase to the detail phase.
 *
 * Rewinds the shared cursor `num` to 0 and installs a new timer in `si1`
 * that runs getItem every 100 ms.
 */
function getDatas() {
  var detailIntervalMs = 100;
  // `num` counted listing pages until now; from here on getItem uses it
  // while walking `urls`.
  num = 0;
  console.log('开始爬取具体数据');
  si1 = setInterval(getItem, detailIntervalMs);
}
// Number of detail-page requests that have settled (succeeded or failed).
// The script runs the detail phase once, so this is never reset.
let settledItemCount = 0;

/**
 * Timer callback for the detail phase: requests one entry of `urls` per tick,
 * extracts three text fields from the page and appends them as a row to
 * `data[0].data`.
 *
 * The cursor `num` is advanced when a request is SENT, not when its response
 * arrives. Advancing it in the response handler made the timer request the
 * same URL again on every tick until a response landed, recorded duplicate
 * rows, and could step past `urls.length` so the end condition never matched.
 *
 * After the last URL has been requested the callback keeps ticking until all
 * requests have settled, then stops the timer, sorts the rows by sequence
 * number and writes the workbook to ./res.xls. A failed page is logged and
 * left out of the sheet.
 */
function getItem() {
  if (userAgentFlag >= userAgentPool.length) userAgentFlag = 0;
  if (num >= urls.length) {
    // Everything has been requested: wait until every response is in.
    if (settledItemCount < urls.length) return;
    clearInterval(si1);
    // Responses arrive in arbitrary order; keep the header (row 0) first and
    // order the remaining rows by their sequence number in column 0.
    var rows = data[0].data;
    data[0].data = rows.slice(0, 1).concat(rows.slice(1).sort((a, b) => a[0] - b[0]));
    var buffer = xlsx.build(data);
    // NOTE(review): node-xlsx appears to produce the .xlsx format while the
    // file is named .xls; confirm the intended extension before changing it.
    fs.writeFile('./res.xls', buffer, function (err) {
      if (err)
        throw err;
      console.log('Write to xls has finished');
    });
    return;
  }
  // Claim this URL synchronously so the next tick moves on to the next one.
  var index = num++;
  axios.get(urls[index],
    {
      headers: {
        'User-Agent': userAgentPool[userAgentFlag]
      },
      // Without a timeout a stalled request would block the final write forever.
      timeout: 30000
    }).then(resp => {
      var $ = cheerio.load(resp.data);
      var section = $('.bccard section').eq(0).find('.info3-value p');
      data[0].data.push([
        // 1-based position of the URL in `urls`, independent of arrival order.
        index + 1,
        section.eq(0).text().trim(),
        section.eq(2).text().trim(),
        $('.suggestions-content .suggestions-text-value').text().trim()
      ]);
      console.log('爬取数据:', data[0].data[data[0].data.length - 1]);
    }).catch(err => {
      // Network and parsing failures end up here instead of becoming an
      // unhandled promise rejection.
      console.error('爬取失败:', urls[index], err && err.message);
    }).then(() => {
      // Runs after success and after failure alike.
      settledItemCount++;
    });
  userAgentFlag++;
}