[node] 对某网站的简单爬虫
# 异步流程控制方面目前仍有疑惑（下面的代码没有对并发请求做任何限制）
/*
 * Simple crawler for the school listing pages on www.ruyile.com.
 *
 * Starting from the entry page it follows province -> city -> district links
 * (taken from the first, second and third `.qylb` element of the respective
 * page) and appends one comma-separated row per `.xxlb .sk h4` entry of each
 * district page to OUTPUT_FILE.
 *
 * Every page is requested as soon as its link is discovered: there is no
 * concurrency limit and no ordering guarantee between requests or appended
 * rows.
 */
var fs = require("fs");
var request = require('request');
var cheerio = require('cheerio');

var requrl = 'http://www.ruyile.com/xxlb.aspx?id=1&t=2';

// Upper bound on the number of province links followed from the entry page.
var PROVINCE_LIMIT = 31;
// File that receives the crawled rows.
var OUTPUT_FILE = 'spider.txt';
// Value written for every field that cannot be found on the page.
var PLACEHOLDER = '暂无';

fetchPage(requrl, acquireData);

/**
 * Downloads a page and hands its body to `onBody`.
 * Transport errors and non-200 responses are logged instead of being
 * dropped silently; `onBody` is not called for them.
 *
 * @param {string} pageUrl - URL to request.
 * @param {function(string)} onBody - Receives the response body on HTTP 200.
 */
function fetchPage(pageUrl, onBody) {
    request(pageUrl, function(error, response, body) {
        if (error) {
            console.error('Request failed: ' + pageUrl, error);
            return;
        }
        if (response.statusCode !== 200) {
            console.error('Unexpected status ' + response.statusCode + ' for ' + pageUrl);
            return;
        }
        onBody(body);
    });
}

/**
 * Returns the child nodes of `listNode` that carry an `href` attribute.
 * Text nodes and other children without a link are skipped, and a missing
 * list yields an empty array, so callers never dereference `undefined`.
 *
 * @param {Object|undefined} listNode - DOM node as produced by cheerio's toArray().
 * @returns {Array<Object>} Link nodes, in document order.
 */
function linkChildren(listNode) {
    if (!listNode || !listNode.children) {
        return [];
    }
    return listNode.children.filter(function(node) {
        return Boolean(node.attribs && node.attribs.href);
    });
}

/**
 * Returns the text of a link's first child node, or a single space when the
 * link has no such text (so the column is still present in the output row).
 *
 * @param {Object} link - Link node.
 * @returns {string} Link text or ' '.
 */
function linkText(link) {
    var first = link.children && link.children[0];
    return (first && first.data) || ' ';
}

/**
 * Follows every link of `listNode`, passing on the row prefix extended by
 * the link's text.
 *
 * @param {Object|undefined} listNode - List element holding the links.
 * @param {string} prefix - Row prefix collected so far (ends with ',').
 * @param {function(string, string)} query - Called with (href, extended prefix).
 */
function followLinks(listNode, prefix, query) {
    linkChildren(listNode).forEach(function(link) {
        query(link.attribs.href, prefix + linkText(link) + ',');
    });
}

/**
 * Parses the entry page and queries at most PROVINCE_LIMIT province links
 * found in the first `.qylb` element.
 *
 * @param {string} data - HTML of the entry page.
 */
function acquireData(data) {
    var $ = cheerio.load(data);
    var lists = $('.qylb').toArray();
    // slice() caps the count without failing when the list is shorter.
    linkChildren(lists[0]).slice(0, PROVINCE_LIMIT).forEach(function(link) {
        provinceQuery(link.attribs.href, linkText(link) + ',');
    });
}

/**
 * Requests a province page and passes it to cityData().
 *
 * @param {string} provinceUrl - URL of the province page.
 * @param {string} provinceData - Row prefix ("province,").
 */
function provinceQuery(provinceUrl, provinceData) {
    fetchPage(provinceUrl, function(body) {
        cityData(body, provinceData);
    });
}

/**
 * Parses a province page and queries every link of its second `.qylb`
 * element.
 *
 * @param {string} data1 - HTML of the province page.
 * @param {string} cData - Row prefix ("province,").
 */
function cityData(data1, cData) {
    var $ = cheerio.load(data1);
    var lists = $('.qylb').toArray();
    followLinks(lists[1], cData, cityQuery);
}

/**
 * Requests a city page and passes it to districtData().
 *
 * @param {string} cityHref - URL of the city page.
 * @param {string} trans - Row prefix ("province,city,").
 */
function cityQuery(cityHref, trans) {
    fetchPage(cityHref, function(body) {
        districtData(body, trans);
    });
}

/**
 * Parses a city page and queries every link of its third `.qylb` element.
 * Pages without that element are ignored.
 *
 * @param {string} data2 - HTML of the city page.
 * @param {string} dData - Row prefix ("province,city,").
 */
function districtData(data2, dData) {
    var $ = cheerio.load(data2);
    var lists = $('.qylb').toArray();
    followLinks(lists[2], dData, districtQuery);
}

/**
 * Requests a district page and passes it to streetData().
 *
 * @param {string} dishref - URL of the district page.
 * @param {string} tran - Row prefix ("province,city,district,").
 */
function districtQuery(dishref, tran) {
    fetchPage(dishref, function(body) {
        streetData(body, tran);
    });
}

/**
 * Walks `steps` times along `.next`, stopping early when the chain ends.
 *
 * @param {Object|null} node - Start node.
 * @param {number} steps - Number of siblings to advance.
 * @returns {Object|null} The sibling reached, or null if the chain is shorter.
 */
function siblingAfter(node, steps) {
    var current = node;
    for (var i = 0; i < steps && current; i++) {
        current = current.next;
    }
    return current || null;
}

/**
 * Removes everything up to and including the last ':' of the first line.
 *
 * @param {string} text - Raw text such as "label:value".
 * @returns {string} Text after the label.
 */
function stripLabel(text) {
    return text.replace(/^.+:/, '');
}

/**
 * Extracts the fields of one entry from its `h4` heading and the sibling
 * nodes that follow it. Fields that cannot be found keep PLACEHOLDER.
 *
 * Layout read by this function: the heading's first child's first child
 * holds the name; the first sibling holds the phone line; the third sibling
 * holds either a line containing '邮编' (postal code), in which case the
 * fifth sibling holds the remaining detail line, or that detail line itself.
 *
 * @param {Object} heading - `h4` DOM node.
 * @returns {{name: string, tel: string, postal: string, dis: string}}
 */
function parseSchool(heading) {
    var school = {
        name: PLACEHOLDER,
        tel: PLACEHOLDER,
        postal: PLACEHOLDER,
        dis: PLACEHOLDER
    };
    var link = heading.children && heading.children[0];
    var nameNode = link && link.children && link.children[0];
    if (!nameNode) {
        return school;
    }
    if (nameNode.data) {
        school.name = nameNode.data;
    }

    var telNode = heading.next;
    if (telNode && telNode.data) {
        school.tel = stripLabel(telNode.data);
    }

    var third = siblingAfter(heading, 3);
    if (!third || !third.data) {
        return school;
    }
    if (third.data.indexOf('邮编') > -1) {
        school.postal = stripLabel(third.data);
        // The postal line may be the last sibling, so walk on safely.
        var fifth = siblingAfter(third, 2);
        if (fifth && fifth.data) {
            school.dis = stripLabel(fifth.data);
        }
    } else {
        school.dis = stripLabel(third.data);
    }
    return school;
}

/**
 * Parses a district page and appends one row per `.xxlb .sk h4` entry.
 *
 * @param {string} data3 - HTML of the district page.
 * @param {string} sData - Row prefix ("province,city,district,").
 */
function streetData(data3, sData) {
    var $ = cheerio.load(data3);
    $('.xxlb .sk h4').toArray().forEach(function(heading) {
        var school = parseSchool(heading);
        xxre(sData + school.name + ',' + school.tel + ',' + school.postal + ',' + school.dis + '\n');
    });
}

/**
 * Appends one row to OUTPUT_FILE; write errors are logged.
 *
 * @param {string} temp - Complete row including the trailing newline.
 */
function xxre(temp) {
    fs.appendFile(OUTPUT_FILE, temp, 'utf8', function(err) {
        if (err) {
            console.log(err);
        }
    });
}
# 第二版：跳过部分省份，并跟随分页抓取后续页面，结果写入 wx.txt
/*
 * Second version of the crawler for the school listing pages on
 * www.ruyile.com.
 *
 * Starting from the entry page it follows province -> city -> district links
 * (taken from the first, second and third `.qylb` element of the respective
 * page), skips the province positions listed in SKIPPED_PROVINCE_INDEXES,
 * follows the pagination of every district page and appends one
 * comma-separated row per `.xxlb .sk h4` entry to OUTPUT_FILE.
 *
 * Every page is requested as soon as its link is discovered: there is no
 * concurrency limit and no ordering guarantee between requests or appended
 * rows.
 */
var fs = require("fs");
var request = require('request');
var cheerio = require('cheerio');

var requrl = 'http://www.ruyile.com/xxlb.aspx?id=1&t=2';

// Upper bound on the number of province links considered on the entry page.
var PROVINCE_LIMIT = 31;
// Zero-based positions in the province list that are not crawled.
// NOTE(review): the reason for skipping exactly these is not visible here.
var SKIPPED_PROVINCE_INDEXES = [0, 1, 8, 21];
// File that receives the crawled rows.
var OUTPUT_FILE = 'wx.txt';
// Value written for every field that cannot be found on the page.
var PLACEHOLDER = '暂无';

fetchPage(requrl, acquireData);

/**
 * Downloads a page and hands its body to `onBody`.
 * Transport errors and non-200 responses are logged instead of being
 * dropped silently; `onBody` is not called for them.
 *
 * @param {string} pageUrl - URL to request.
 * @param {function(string)} onBody - Receives the response body on HTTP 200.
 */
function fetchPage(pageUrl, onBody) {
    request(pageUrl, function(error, response, body) {
        if (error) {
            console.error('Request failed: ' + pageUrl, error);
            return;
        }
        if (response.statusCode !== 200) {
            console.error('Unexpected status ' + response.statusCode + ' for ' + pageUrl);
            return;
        }
        onBody(body);
    });
}

/**
 * Returns the child nodes of `listNode` that carry an `href` attribute.
 * Text nodes and other children without a link are skipped, and a missing
 * list yields an empty array, so callers never dereference `undefined`.
 *
 * @param {Object|undefined} listNode - DOM node as produced by cheerio's toArray().
 * @returns {Array<Object>} Link nodes, in document order.
 */
function linkChildren(listNode) {
    if (!listNode || !listNode.children) {
        return [];
    }
    return listNode.children.filter(function(node) {
        return Boolean(node.attribs && node.attribs.href);
    });
}

/**
 * Returns the text of a link's first child node, or a single space when the
 * link has no such text (so the column is still present in the output row).
 *
 * @param {Object} link - Link node.
 * @returns {string} Link text or ' '.
 */
function linkText(link) {
    var first = link.children && link.children[0];
    return (first && first.data) || ' ';
}

/**
 * Follows every link of `listNode`, passing on the row prefix extended by
 * the link's text.
 *
 * @param {Object|undefined} listNode - List element holding the links.
 * @param {string} prefix - Row prefix collected so far (ends with ',').
 * @param {function(string, string)} query - Called with (href, extended prefix).
 */
function followLinks(listNode, prefix, query) {
    linkChildren(listNode).forEach(function(link) {
        query(link.attribs.href, prefix + linkText(link) + ',');
    });
}

/**
 * Parses the entry page and queries the province links found in the first
 * `.qylb` element, limited to PROVINCE_LIMIT entries and leaving out the
 * positions in SKIPPED_PROVINCE_INDEXES.
 *
 * @param {string} data - HTML of the entry page.
 */
function acquireData(data) {
    var $ = cheerio.load(data);
    var lists = $('.qylb').toArray();
    // slice() caps the count without failing when the list is shorter.
    linkChildren(lists[0]).slice(0, PROVINCE_LIMIT).forEach(function(link, index) {
        if (SKIPPED_PROVINCE_INDEXES.indexOf(index) > -1) {
            return;
        }
        provinceQuery(link.attribs.href, linkText(link) + ',');
    });
}

/**
 * Requests a province page and passes it to cityData().
 *
 * @param {string} provinceUrl - URL of the province page.
 * @param {string} provinceData - Row prefix ("province,").
 */
function provinceQuery(provinceUrl, provinceData) {
    fetchPage(provinceUrl, function(body) {
        cityData(body, provinceData);
    });
}

/**
 * Parses a province page and queries every link of its second `.qylb`
 * element.
 *
 * @param {string} data1 - HTML of the province page.
 * @param {string} cData - Row prefix ("province,").
 */
function cityData(data1, cData) {
    var $ = cheerio.load(data1);
    var lists = $('.qylb').toArray();
    followLinks(lists[1], cData, cityQuery);
}

/**
 * Requests a city page and passes it to districtData().
 *
 * @param {string} cityHref - URL of the city page.
 * @param {string} trans - Row prefix ("province,city,").
 */
function cityQuery(cityHref, trans) {
    fetchPage(cityHref, function(body) {
        districtData(body, trans);
    });
}

/**
 * Parses a city page and queries every link of its third `.qylb` element.
 * Pages without that element are ignored.
 *
 * @param {string} data2 - HTML of the city page.
 * @param {string} dData - Row prefix ("province,city,").
 */
function districtData(data2, dData) {
    var $ = cheerio.load(data2);
    var lists = $('.qylb').toArray();
    followLinks(lists[2], dData, districtQuery);
}

/**
 * Requests a district page (or one of its follow-up pages) and passes it to
 * streetData() together with the URL it was loaded from.
 *
 * @param {string} dishref - URL of the district page.
 * @param {string} tran - Row prefix ("province,city,district,").
 */
function districtQuery(dishref, tran) {
    fetchPage(dishref, function(body) {
        streetData(body, tran, dishref);
    });
}

/**
 * Determines the URL of the page following the one loaded from `shref`.
 *
 * Reads the first `.zys` element: the text node before it is parsed as the
 * current page number and its first child as the total page count. A page
 * without that element, or with unparsable numbers, has no next page.
 *
 * @param {Function} $ - cheerio instance of the current page.
 * @param {string} shref - URL the current page was loaded from.
 * @returns {string|null} URL of the next page, or null when there is none.
 */
function nextPageUrl($, shref) {
    var marker = $('.zys').toArray()[0];
    if (!marker) {
        return null;
    }
    var currentText = marker.prev && marker.prev.data;
    var totalText = marker.children && marker.children[0] && marker.children[0].data;
    var nowPage = parseInt(currentText, 10);
    var totalPage = parseInt(totalText, 10);
    // NaN on either side makes the comparison false, i.e. "no next page".
    if (!(nowPage < totalPage)) {
        return null;
    }
    // Replace a trailing page parameter if present, otherwise just append one.
    return shref.replace(/&p=\d+$/, '') + '&p=' + (nowPage + 1);
}

/**
 * Walks `steps` times along `.next`, stopping early when the chain ends.
 *
 * @param {Object|null} node - Start node.
 * @param {number} steps - Number of siblings to advance.
 * @returns {Object|null} The sibling reached, or null if the chain is shorter.
 */
function siblingAfter(node, steps) {
    var current = node;
    for (var i = 0; i < steps && current; i++) {
        current = current.next;
    }
    return current || null;
}

/**
 * Removes everything up to and including the last ':' of the first line.
 *
 * @param {string} text - Raw text such as "label:value".
 * @returns {string} Text after the label.
 */
function stripLabel(text) {
    return text.replace(/^.+:/, '');
}

/**
 * Extracts the fields of one entry from its `h4` heading and the sibling
 * nodes that follow it. Fields that cannot be found keep PLACEHOLDER.
 *
 * Layout read by this function: the heading's first child's first child
 * holds the name; the first sibling holds the phone line; the third sibling
 * holds either a line containing '邮编' (postal code), in which case the
 * fifth sibling holds the remaining detail line, or that detail line itself.
 *
 * @param {Object} heading - `h4` DOM node.
 * @returns {{name: string, tel: string, postal: string, dis: string}}
 */
function parseSchool(heading) {
    var school = {
        name: PLACEHOLDER,
        tel: PLACEHOLDER,
        postal: PLACEHOLDER,
        dis: PLACEHOLDER
    };
    var link = heading.children && heading.children[0];
    var nameNode = link && link.children && link.children[0];
    if (!nameNode) {
        return school;
    }
    if (nameNode.data) {
        school.name = nameNode.data;
    }

    var telNode = heading.next;
    if (telNode && telNode.data) {
        school.tel = stripLabel(telNode.data);
    }

    var third = siblingAfter(heading, 3);
    if (!third || !third.data) {
        return school;
    }
    if (third.data.indexOf('邮编') > -1) {
        school.postal = stripLabel(third.data);
        // The postal line may be the last sibling, so walk on safely.
        var fifth = siblingAfter(third, 2);
        if (fifth && fifth.data) {
            school.dis = stripLabel(fifth.data);
        }
    } else {
        school.dis = stripLabel(third.data);
    }
    return school;
}

/**
 * Parses a district page: logs the row prefix as a progress message, queues
 * the next page of the same district if there is one, and appends one row
 * per `.xxlb .sk h4` entry.
 *
 * @param {string} data3 - HTML of the district page.
 * @param {string} sData - Row prefix ("province,city,district,").
 * @param {string} shref - URL this page was loaded from.
 */
function streetData(data3, sData, shref) {
    var $ = cheerio.load(data3);
    console.log(sData);

    var followUp = nextPageUrl($, shref);
    if (followUp) {
        districtQuery(followUp, sData);
    }

    $('.xxlb .sk h4').toArray().forEach(function(heading) {
        var school = parseSchool(heading);
        xxre(sData + school.name + ',' + school.tel + ',' + school.postal + ',' + school.dis + '\n');
    });
}

/**
 * Appends one row to OUTPUT_FILE; write errors are logged.
 *
 * @param {string} temp - Complete row including the trailing newline.
 */
function xxre(temp) {
    fs.appendFile(OUTPUT_FILE, temp, 'utf8', function(err) {
        if (err) {
            console.log(err);
        }
    });
}
浙公网安备 33010602011771号