晴明的博客园 GitHub      CodePen      CodeWars     

[node] 对某网站的简单爬虫

#异步流程控制目前看还有疑惑

var fs = require("fs");
var request = require('request');
var cheerio = require('cheerio');
// Entry page of the crawl; its HTML is handed to acquireData().
var requrl = 'http://www.ruyile.com/xxlb.aspx?id=1&t=2';
// Module-level scratch string; streetData() stores the record line it last built here.
var temp = '';
// Fetch the entry page and start the crawl. Failures are reported instead of
// being dropped, because a silent failure here means the crawl never starts.
request(requrl, function(error, response, body) {
    if (error) {
        console.error('entry request failed: ' + requrl, error);
        return;
    }
    if (response.statusCode !== 200) {
        console.error('entry request got status ' + response.statusCode + ': ' + requrl);
        return;
    }
    acquireData(body);
});

/**
 * Parses the entry page and starts one provinceQuery() per link found in
 * the first `.qylb` element.
 *
 * At most the first 31 child nodes are visited. For each link, its text
 * followed by a comma becomes the record prefix and its href is the next
 * URL to fetch.
 *
 * @param {string} data - HTML of the entry page.
 */
function acquireData(data) {
    var $ = cheerio.load(data);
    var province = $('.qylb').toArray();
    if (!province[0] || !province[0].children) {
        console.error('acquireData: no .qylb block found on the entry page');
        return;
    }
    var links = province[0].children;
    // Clamp to the real child count so a shorter list does not throw.
    var limit = Math.min(31, links.length);
    for (var i = 0; i < limit; i++) {
        var link = links[i];
        // Skip anything that is not a link with a text child (e.g. whitespace text nodes).
        if (!link.attribs || !link.attribs.href || !link.children || !link.children[0]) {
            continue;
        }
        var provinceData = link.children[0].data + ',';
        var provinceUrl = link.attribs.href;
        provinceQuery(provinceUrl, provinceData);
    }
}

/**
 * Downloads one page and forwards its HTML to cityData().
 *
 * @param {string} provinceUrl - URL to fetch.
 * @param {string} provinceData - Record prefix, passed through unchanged.
 */
function provinceQuery(provinceUrl, provinceData) {
    request(provinceUrl, function(error, response, body) {
        // Report failures: a dropped request loses every record below this page.
        if (error) {
            console.error('provinceQuery failed: ' + provinceUrl, error);
            return;
        }
        if (response.statusCode !== 200) {
            console.error('provinceQuery got status ' + response.statusCode + ': ' + provinceUrl);
            return;
        }
        cityData(body, provinceData);
    });
}

/**
 * Parses a page fetched by provinceQuery() and starts one cityQuery() per
 * link found in the second `.qylb` element.
 *
 * @param {string} data1 - HTML of the page.
 * @param {string} cData - Record prefix built so far; the link text and a
 *     comma are appended to it.
 */
function cityData(data1, cData) {
    var $ = cheerio.load(data1);
    var city = $('.qylb').toArray();
    // Guard the block the same way districtData() guards city[2]; without
    // this a page lacking a second `.qylb` throws inside the request callback.
    if (!city[1] || !city[1].children) {
        return;
    }
    var links = city[1].children;
    for (var i = 0; i < links.length; i++) {
        var link = links[i];
        // Skip anything that is not a link with a text child.
        if (!link.attribs || !link.attribs.href || !link.children || !link.children[0]) {
            continue;
        }
        var cityHref = link.attribs.href;
        var transform = cData + link.children[0].data + ',';
        cityQuery(cityHref, transform);
    }
}

/**
 * Downloads one page and forwards its HTML to districtData().
 *
 * @param {string} cityHref - URL to fetch.
 * @param {string} trans - Record prefix, passed through unchanged.
 */
function cityQuery(cityHref, trans) {
    request(cityHref, function(error, response, body) {
        // Report failures: a dropped request loses every record below this page.
        if (error) {
            console.error('cityQuery failed: ' + cityHref, error);
            return;
        }
        if (response.statusCode !== 200) {
            console.error('cityQuery got status ' + response.statusCode + ': ' + cityHref);
            return;
        }
        districtData(body, trans);
    });
}

/**
 * Parses a page fetched by cityQuery() and starts one districtQuery() per
 * link found in the third `.qylb` element. Pages without a third block
 * are ignored.
 *
 * @param {string} data2 - HTML of the page.
 * @param {string} dData - Record prefix built so far; the link text (or a
 *     single space when the link has no text) and a comma are appended.
 */
function districtData(data2, dData) {
    var $ = cheerio.load(data2);
    var city = $('.qylb').toArray();
    if (!city[2] || !city[2].children) {
        return;
    }
    var links = city[2].children;
    for (var j = 0; j < links.length; j++) {
        var link = links[j];
        // Skip anything that is not a link (e.g. whitespace text nodes).
        if (!link.attribs || !link.attribs.href) {
            continue;
        }
        var label = link.children && link.children[0] && link.children[0].data;
        // The fallback must apply to the label alone: `label + ',' || ' ,'`
        // is always truthy, so it produced "undefined," instead of " ,".
        var district = (label || ' ') + ',';
        var transf = dData + district;
        var districtHref = link.attribs.href;
        districtQuery(districtHref, transf);
    }
}

/**
 * Downloads one page and forwards its HTML to streetData().
 *
 * @param {string} dishref - URL to fetch.
 * @param {string} tran - Record prefix, passed through unchanged.
 */
function districtQuery(dishref, tran) {
    request(dishref, function(error, response, body) {
        // Report failures: a dropped request loses every record on this page.
        if (error) {
            console.error('districtQuery failed: ' + dishref, error);
            return;
        }
        if (response.statusCode !== 200) {
            console.error('districtQuery got status ' + response.statusCode + ': ' + dishref);
            return;
        }
        streetData(body, tran);
    });
}

/**
 * Parses a listing page and appends one comma-separated line per
 * `.xxlb .sk h4` heading via xxre().
 *
 * Each line is `sData + name,tel,postal,address\n`. A field that cannot be
 * found is written as '暂无'. The fields are read from the heading's
 * following siblings: the 1st holds the phone text, the 3rd holds either
 * the postal code (when it contains '邮编') or the address, and the 5th
 * holds the address when the 3rd was the postal code.
 *
 * @param {string} data3 - HTML of the listing page.
 * @param {string} sData - Record prefix built by the earlier crawl stages.
 */
function streetData(data3, sData) {
    // Returns the node `steps` siblings after `node`, or null when the
    // sibling chain ends early (the unguarded chain used to throw here).
    function sibling(node, steps) {
        var current = node;
        for (var k = 0; current && k < steps; k++) {
            current = current.next;
        }
        return current || null;
    }

    // Removes the leading "label:" part: everything from the start of the
    // text through the last colon found before any line break.
    function stripLabel(text) {
        return text.replace(/^.+:/, '');
    }

    var $ = cheerio.load(data3);
    var add = $('.xxlb .sk h4').toArray();
    for (var i = 0; i < add.length; i++) {
        var schoolName = '暂无';
        var tel = '暂无';
        var postal = '暂无';
        var dis = '暂无';

        var heading = add[i];
        var anchor = heading.children && heading.children[0];
        var nameNode = anchor && anchor.children && anchor.children[0];
        if (nameNode) {
            schoolName = nameNode.data;
            var first = sibling(heading, 1);
            var third = sibling(heading, 3);
            var fifth = sibling(heading, 5);
            if (first && first.data) {
                tel = stripLabel(first.data);
            }
            if (third && third.data) {
                if (third.data.indexOf('邮编') > -1) {
                    postal = stripLabel(third.data);
                    if (fifth && fifth.data) {
                        dis = stripLabel(fifth.data);
                    }
                } else {
                    dis = stripLabel(third.data);
                }
            }
        }
        // Kept in the module-level `temp` as before, then written out.
        temp = sData + schoolName + ',' + tel + ',' + postal + ',' + dis + '\n';
        xxre(temp);
    }
}

/**
 * Appends one record to `spider.txt` as UTF-8 text. A write error is
 * printed with console.log and otherwise ignored.
 *
 * @param {string} temp - Text to append (callers pass a full line ending in a newline).
 */
function xxre(temp) {
    // Note: a commented-out variant that sent each record to an HTTP
    // endpoint used to sit here; only the file sink is active.
    var reportFailure = function(err) {
        if (!err) {
            return;
        }
        console.log(err);
    };
    fs.appendFile('spider.txt', temp, 'utf8', reportFailure);
}

# 第二版：跳过部分省份，并按分页继续抓取，结果写入 wx.txt

var fs = require("fs");
var request = require('request');
var cheerio = require('cheerio');
// Entry page of the crawl; its HTML is handed to acquireData().
var requrl = 'http://www.ruyile.com/xxlb.aspx?id=1&t=2';
// Module-level scratch string; streetData() stores the record line it last built here.
var temp = '';
// Fetch the entry page and start the crawl. Failures are reported instead of
// being dropped, because a silent failure here means the crawl never starts.
request(requrl, function(error, response, body) {
    if (error) {
        console.error('entry request failed: ' + requrl, error);
        return;
    }
    if (response.statusCode !== 200) {
        console.error('entry request got status ' + response.statusCode + ': ' + requrl);
        return;
    }
    acquireData(body);
});

/**
 * Parses the entry page and starts one provinceQuery() per link found in
 * the first `.qylb` element.
 *
 * At most the first 31 child nodes are visited, and child indexes 0, 1, 8
 * and 21 are skipped. For each remaining link, its text followed by a
 * comma becomes the record prefix and its href is the next URL to fetch.
 *
 * @param {string} data - HTML of the entry page.
 */
function acquireData(data) {
    var $ = cheerio.load(data);
    var province = $('.qylb').toArray();
    if (!province[0] || !province[0].children) {
        console.error('acquireData: no .qylb block found on the entry page');
        return;
    }
    // Child indexes left out of this run. NOTE(review): the reason is not
    // recorded in the source; confirm before changing the list.
    var skipped = { 0: true, 1: true, 8: true, 21: true };
    var links = province[0].children;
    // Clamp to the real child count so a shorter list does not throw.
    var limit = Math.min(31, links.length);
    for (var i = 0; i < limit; i++) {
        if (skipped[i] === true) {
            continue;
        }
        var link = links[i];
        // Skip anything that is not a link with a text child (e.g. whitespace text nodes).
        if (!link.attribs || !link.attribs.href || !link.children || !link.children[0]) {
            continue;
        }
        var provinceData = link.children[0].data + ',';
        var provinceUrl = link.attribs.href;
        provinceQuery(provinceUrl, provinceData);
    }
}

/**
 * Downloads one page and forwards its HTML to cityData().
 *
 * @param {string} provinceUrl - URL to fetch.
 * @param {string} provinceData - Record prefix, passed through unchanged.
 */
function provinceQuery(provinceUrl, provinceData) {
    request(provinceUrl, function(error, response, body) {
        // Report failures: a dropped request loses every record below this page.
        if (error) {
            console.error('provinceQuery failed: ' + provinceUrl, error);
            return;
        }
        if (response.statusCode !== 200) {
            console.error('provinceQuery got status ' + response.statusCode + ': ' + provinceUrl);
            return;
        }
        cityData(body, provinceData);
    });
}

/**
 * Parses a page fetched by provinceQuery() and starts one cityQuery() per
 * link found in the second `.qylb` element.
 *
 * @param {string} data1 - HTML of the page.
 * @param {string} cData - Record prefix built so far; the link text and a
 *     comma are appended to it.
 */
function cityData(data1, cData) {
    var $ = cheerio.load(data1);
    var city = $('.qylb').toArray();
    // Guard the block the same way districtData() guards city[2]; without
    // this a page lacking a second `.qylb` throws inside the request callback.
    if (!city[1] || !city[1].children) {
        return;
    }
    var links = city[1].children;
    for (var i = 0; i < links.length; i++) {
        var link = links[i];
        // Skip anything that is not a link with a text child.
        if (!link.attribs || !link.attribs.href || !link.children || !link.children[0]) {
            continue;
        }
        var cityHref = link.attribs.href;
        var transform = cData + link.children[0].data + ',';
        cityQuery(cityHref, transform);
    }
}

/**
 * Downloads one page and forwards its HTML to districtData().
 *
 * @param {string} cityHref - URL to fetch.
 * @param {string} trans - Record prefix, passed through unchanged.
 */
function cityQuery(cityHref, trans) {
    request(cityHref, function(error, response, body) {
        // Report failures: a dropped request loses every record below this page.
        if (error) {
            console.error('cityQuery failed: ' + cityHref, error);
            return;
        }
        if (response.statusCode !== 200) {
            console.error('cityQuery got status ' + response.statusCode + ': ' + cityHref);
            return;
        }
        districtData(body, trans);
    });
}

/**
 * Parses a page fetched by cityQuery() and starts one districtQuery() per
 * link found in the third `.qylb` element. Pages without a third block
 * are ignored.
 *
 * @param {string} data2 - HTML of the page.
 * @param {string} dData - Record prefix built so far; the link text (or a
 *     single space when the link has no text) and a comma are appended.
 */
function districtData(data2, dData) {
    var $ = cheerio.load(data2);
    var city = $('.qylb').toArray();
    if (!city[2] || !city[2].children) {
        return;
    }
    var links = city[2].children;
    for (var i = 0; i < links.length; i++) {
        var link = links[i];
        // Skip anything that is not a link (e.g. whitespace text nodes).
        if (!link.attribs || !link.attribs.href) {
            continue;
        }
        var label = link.children && link.children[0] && link.children[0].data;
        // The fallback must apply to the label alone: `label + ',' || ' ,'`
        // is always truthy, so it produced "undefined," instead of " ,".
        var district = (label || ' ') + ',';
        var transf = dData + district;
        var districtHref = link.attribs.href;
        districtQuery(districtHref, transf);
    }
}

/**
 * Downloads one listing page and forwards its HTML to streetData(),
 * together with the URL it came from so streetData() can build the
 * next-page URL.
 *
 * @param {string} dishref - URL to fetch.
 * @param {string} tran - Record prefix, passed through unchanged.
 */
function districtQuery(dishref, tran) {
    request(dishref, function(error, response, body) {
        // Report failures: a dropped request loses this page and every page after it.
        if (error) {
            console.error('districtQuery failed: ' + dishref, error);
            return;
        }
        if (response.statusCode !== 200) {
            console.error('districtQuery got status ' + response.statusCode + ': ' + dishref);
            return;
        }
        streetData(body, tran, dishref);
    });
}

/**
 * Parses one listing page: queues the next page when the pager says there
 * is one, then appends one comma-separated line per `.xxlb .sk h4` heading
 * via xxre().
 *
 * Pagination is read from the first `.zys` element: the text of its
 * previous sibling is parsed as the current page number and the text of
 * its first child as the total page count. A page without that element is
 * treated as having no further pages.
 *
 * Each line is `sData + name,tel,postal,address\n`. A field that cannot be
 * found is written as '暂无'. The fields are read from the heading's
 * following siblings: the 1st holds the phone text, the 3rd holds either
 * the postal code (when it contains '邮编') or the address, and the 5th
 * holds the address when the 3rd was the postal code.
 *
 * @param {string} data3 - HTML of the listing page.
 * @param {string} sData - Record prefix built by the earlier crawl stages.
 * @param {string} shref - URL this page was fetched from.
 */
function streetData(data3, sData, shref) {
    // Returns the node `steps` siblings after `node`, or null when the
    // sibling chain ends early (the unguarded chain used to throw here).
    function sibling(node, steps) {
        var current = node;
        for (var k = 0; current && k < steps; k++) {
            current = current.next;
        }
        return current || null;
    }

    // Removes the leading "label:" part: everything from the start of the
    // text through the last colon found before any line break.
    function stripLabel(text) {
        return text.replace(/^.+:/, '');
    }

    var $ = cheerio.load(data3);
    var add = $('.xxlb .sk h4').toArray();
    var p = $('.zys').toArray();
    console.log(sData);

    // Missing pager pieces give NaN, which makes the comparison below false,
    // so the records on this page are still written instead of being lost
    // to a TypeError.
    var pager = p[0];
    var nowPage = pager && pager.prev ? parseInt(pager.prev.data, 10) : NaN;
    var totalPage = pager && pager.children && pager.children[0] ? parseInt(pager.children[0].data, 10) : NaN;
    if (nowPage < totalPage) {
        // Replace a trailing "&p=N" (if any) with the next page number.
        var rhref = shref.replace(/&p=\d+$/, '') + '&p=' + (nowPage + 1);
        districtQuery(rhref, sData);
    }

    for (var i = 0; i < add.length; i++) {
        var schoolName = '暂无';
        var tel = '暂无';
        var postal = '暂无';
        var dis = '暂无';

        var heading = add[i];
        var anchor = heading.children && heading.children[0];
        var nameNode = anchor && anchor.children && anchor.children[0];
        if (nameNode) {
            schoolName = nameNode.data;
            var first = sibling(heading, 1);
            var third = sibling(heading, 3);
            var fifth = sibling(heading, 5);
            if (first && first.data) {
                tel = stripLabel(first.data);
            }
            if (third && third.data) {
                if (third.data.indexOf('邮编') > -1) {
                    postal = stripLabel(third.data);
                    if (fifth && fifth.data) {
                        dis = stripLabel(fifth.data);
                    }
                } else {
                    dis = stripLabel(third.data);
                }
            }
        }
        // Kept in the module-level `temp` as before, then written out.
        temp = sData + schoolName + ',' + tel + ',' + postal + ',' + dis + '\n';
        xxre(temp);
    }
}

/**
 * Appends one record to `wx.txt` as UTF-8 text. A write error is printed
 * with console.log and otherwise ignored.
 *
 * @param {string} temp - Text to append (callers pass a full line ending in a newline).
 */
function xxre(temp) {
    var reportFailure = function(err) {
        if (!err) {
            return;
        }
        console.log(err);
    };
    fs.appendFile('wx.txt', temp, 'utf8', reportFailure);
}

 

posted @ 2016-03-29 19:08  晴明桑  阅读(189)  评论(0)    收藏  举报