node爬虫进阶版

手写了一个方便爬虫的小库：

const url = require('url')
const glib = require('zlib')

// Default request headers; the exported request function merges
// caller-supplied headers on top of these.
const _default_headers = {
    "Accept-Encoding": ["gzip", "deflate", "br"].join(", "),
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
}

//options(url,method,header)--http头部信息 isDebug--是否开启调试状态
module.exports = function(options, isDebug) {
    if(typeof options === "string") {
        options = {
            url: options,
            method: 'GET',
            headers: {}
        }
    } else {
        options = options || {}
        options.method = options.method || 'GET'
        options.headers = options.headers || {}
    }
    options.headers = Object.assign(_default_headers, options.headers)
    
    
    function debug(msg) {
        if(isDebug) {
            console.log(msg)
        }
    }

    return new Promise((resolve, reject) => {
        req(options)

        function req(options) {
            //判断是http还是https
            let urlObj = url.parse(options.url)
            let mod = null
            port = 0

            if(urlObj.protocol == 'https:') {
                mod = require('https')
                port = 443
            } else {
                mod = require('http')
                port = 80
            }

            let _req_options = {
                hostname: urlObj.hostname,
                port,
                path: urlObj.path,
                method: options.method,
                headers: options.headers
            }
            //开始模拟，爬取信息
            let req_obj = mod.request(_req_options, (res) => {
                if(res.statusCode!==200) {
                    //如果是重定向则重新在请求
                    if(res.statusCode == 301 || res.statusCode === 302) {
                        options.url = res.headers.location
                        debug('重定向: '+res.headers.location)
                        req(options)
                    } else {
                        reject(res.statusCode)
                    }
                } else {
                    //statusCode是200时接受data buffer
                    let data = []
                    res.on('data', buffer => {
                        data.push(buffer)
                    })
                    res.on('end', () =>{
                        let buffer = Buffer.concat(data)
                        //判断是否传输有误
                        if (res.headers['content-length'] != buffer.length) {
                            debug('收到数据有误，正在重新获取')
                            req(options)
                        }
                        //判断是否有用gzip
                        else if (res.headers['content-encoding'] && res.headers['content-encoding'].includes('gzip')) {
                           buffer = glib.gunzip(buffer, (err,data) => {
                               debug('gzip解压完成并成功返回')
                               resolve(data)
                           })
                        } else {
                            debug('成功返回')
                            resolve(buffer)                     
                        }
                    })
                }
            })
            req_obj.on('error', err => {
                debug('爬虫失败')
                reject(err)
            })
            req_obj.end()
        }
    })
}

将该模块 require 进来，传入 url 字符串或 options 对象，即可得到一个 Promise，其 resolve 的值就是抓取到的响应内容（Buffer）。

举个例子：

我要爬个bilibili的视频：

const url = require('url')
const fs = require('fs')

/**
 * Download the resource at a URL into a file next to this script.
 *
 * @param {string|{url: string, method?: string, headers?: Object, timeout?: number}} options
 *        Either the URL itself or an options object. `method` defaults to
 *        'GET' and `timeout` (socket idle time in ms) to 2000. The
 *        caller's object is not modified.
 * @param {Object} [headers] Request headers; merged over `options.headers`,
 *        with these taking precedence.
 * @param {string} fileName Name of the output file, created inside
 *        `__dirname`. An existing file of that name is overwritten.
 * @returns {Promise<void>} Resolves once the whole body has been written
 *          to disk. Rejects with an Error on a non-2xx status, a network
 *          or file-system error, a socket timeout, or a connection that
 *          closes before the body is complete.
 */
function getVideo(options, headers, fileName) {
    // Work on a private copy so the caller's options object is left alone.
    const source = typeof options === "string" ? { url: options } : (options || {})
    const settings = {
        url: source.url,
        method: source.method || 'GET',
        headers: Object.assign({}, source.headers, headers),
        timeout: source.timeout || 2000
    }

    return new Promise((resolve, reject) => {
        // Choose http or https from the URL's protocol.
        const urlObj = url.parse(settings.url)
        const isHttps = urlObj.protocol === 'https:'
        const mod = isHttps ? require('https') : require('http')

        const _req_options = {
            hostname: urlObj.hostname,
            // Honour an explicit port in the URL; otherwise use the
            // protocol default.
            port: urlObj.port || (isHttps ? 443 : 80),
            path: urlObj.path,
            method: settings.method,
            headers: settings.headers,
            timeout: settings.timeout
        }

        const req_obj = mod.request(_req_options, (res) => {
            // Do not save an error page as if it were the video.
            if (res.statusCode < 200 || res.statusCode >= 300) {
                res.resume() // drain the unused body so the socket is released
                reject(new Error(`Download failed with HTTP status ${res.statusCode}`))
                return
            }

            // Output path of the video file.
            const filePath = `${__dirname}/${fileName}`
            // Opening for writing truncates any previous file of that name.
            const file = fs.createWriteStream(filePath)
            const total = Number(res.headers['content-length'])
            let received = 0

            res.on('data', chunk => {
                // Count bytes in memory instead of stat-ing the file per chunk.
                received += chunk.length
                const megabytes = (received / 1024 / 1024).toFixed(2)
                // The percentage is only meaningful when the server declared a length.
                const progress = total > 0 ? `,完成${(received / total * 100).toFixed(2)}%` : ''
                console.log(`已下载${megabytes}MB${progress}`)
            })
            res.on('error', err => {
                file.destroy()
                reject(err)
            })
            res.on('close', () => {
                // The connection went away before the full body arrived.
                if (!res.complete) {
                    file.destroy()
                    reject(new Error('Connection closed before the download completed'))
                }
            })
            file.on('error', err => {
                res.destroy()
                reject(err)
            })
            // Resolve only after every byte has been flushed to the file.
            file.on('finish', () => {
                resolve()
            })
            res.pipe(file)
        })
        // The `timeout` option only emits this event; the request has to be
        // torn down explicitly.
        req_obj.on('timeout', () => {
            const err = new Error(`Request timed out after ${settings.timeout}ms of inactivity`)
            req_obj.destroy(err)
            reject(err)
        })
        req_obj.on('error', err => {
            console.error('爬虫失败')
            reject(err)
        })
        req_obj.end()
    })
}

// Name of the output file.
const fileName = '1.flv'
// Address of the video stream.
const videoUrl = 'https://cn-sdyt-cu-v-05.acgvideo.com/upgcxcode/66/83/34548366/34548366-1-64.flv?expires=1545405600&platform=pc&ssig=ElhY4A2e-U4R2m8EI1eiGQ&oi=1928611810&nfa=uTIiNt+AQjcYULykM2EttA==&dynamic=1&hfa=2116953847&hfb=Yjk5ZmZjM2M1YzY4ZjAwYTMzMTIzYmIyNWY4ODJkNWI=&trid=45c5fdc464354b71bf599c224b7df8ea&nfb=maPYqpoel5MI3qOUX6YpRA==&nfc=1';
// Request headers sent along with the download.
const header = {
    'Origin': 'https://www.bilibili.com',
    'Referer': 'https://www.bilibili.com/video/av21061574',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

getVideo(videoUrl, header, fileName).then(() => {
    console.log('写入成功');
}).catch(err => {
    // Report the failure instead of leaving an unhandled rejection.
    console.error('写入失败', err);
})

posted @ 2018-03-13 21:04 张啊咩 阅读(282) 评论(0) 收藏 举报

刷新页面返回顶部

张啊咩

node爬虫进阶版

公告