puppeteer爬虫服务

爬虫文件
baidu.js

const puppeteer = require("puppeteer");
const path = require('path');
const pathToExtension = path.join(__dirname, './chrome-mac/Chromium.app/Contents/MacOS/Chromium');
var exec = require('child_process').execSync;
// Options object handed to puppeteer.launch() when a new browser has to be started.
const conf = {
    // Run with a visible window rather than headless.
    headless: false,
    // Use the Chromium binary resolved next to this file instead of a default one.
    executablePath: pathToExtension,
    // Viewport size applied to pages opened in the launched browser.
    defaultViewport: {width: 1300, height: 900},
};

/**
 * Open https://www.baidu.com/ in a (possibly shared) Chromium instance and
 * collect the text/href of the elements matched by `#u1`.
 *
 * @param {string} browserEndpoint - WebSocket endpoint of an already running
 *     browser to attach to. When empty (or missing) a new browser is launched
 *     with `conf` and left running so later calls can reuse it.
 * @returns {Promise<{result: Array<{title: string, url: string}>, browserEndpoint: string}>}
 *     The scraped items plus the endpoint of the browser that was used, so the
 *     caller can pass it back in on the next call.
 * @throws Rejects when launching/connecting, navigation, or the selector wait
 *     fails; the page opened by this call is closed before the error propagates.
 */
const run = async (browserEndpoint) => {
    let endpoint = browserEndpoint;
    let browser;
    // Tracks whether this call attached with puppeteer.connect(); only such
    // connections are released at the end. A browser launched here stays
    // attached because its endpoint is handed back to the caller for reuse.
    let attachedHere = false;

    if (!endpoint) {
        browser = await puppeteer.launch(conf);
        endpoint = browser.wsEndpoint();
        console.log("_browserEndpoint", endpoint);
    } else {
        browser = await puppeteer.connect({browserWSEndpoint: endpoint});
        attachedHere = true;
    }

    try {
        const page = await browser.newPage();
        try {
            await page.goto('https://www.baidu.com/', {waitUntil: 'networkidle2'});
            // addScriptTag has to come after goto; afterwards jQuery syntax is
            // available inside evaluate().
            await page.addScriptTag({
                url: 'https://code.jquery.com/jquery-3.2.1.min.js',
            });
            await page.waitForSelector('#u1');
            // Forward whatever is printed with console.* inside evaluate().
            page.on('console', msg => {
                msg.args().forEach((arg, i) => {
                    console.log(`${i}: ${arg}`);
                });
            });
            const result = await page.evaluate(() => {
                const data = [];
                // NOTE(review): "#u1" is an id selector, so this matches at most one
                // element and `href` is probably undefined on it; if the links
                // inside were intended, the selector likely needs to be "#u1 a" - confirm.
                const elements = $("#u1");
                for (const element of elements) {
                    const title = element.innerText;
                    const url = element.href;
                    data.push({title, url});
                }
                return data;
            });
            console.log(result);
            return {
                "result": result,
                "browserEndpoint": endpoint
            };
        } finally {
            // Always release the tab, even when navigation or evaluation failed.
            await page.close();
        }
    } finally {
        if (attachedHere) {
            // Drops only this connection; the browser process keeps running.
            await browser.disconnect();
        }
    }
};

// Public surface of this module: only the scraper entry point.
module.exports = {run: run};

服务文件
server.js

const http = require('http');

const run_spider = require("./baidu.js");
// WebSocket endpoint of the shared browser; empty until the first successful
// scrape has launched one. Updated from the value returned by run().
let browserEndpoint = "";

// Headers sent with every response.
const responseHeaders = {'Content-Type': 'text/plain;charset=UTF-8', 'Access-Control-Allow-origin': '*'};

/**
 * HTTP front end for the scraper: each POST triggers one run() and answers
 * with the scraped result serialized as JSON. Any other method gets 405.
 * Failures inside run() are logged and answered with 500.
 */
http.createServer(function (req, res) {
    console.log("in", browserEndpoint);

    if (req.method !== 'POST') {
        res.writeHead(405, Object.assign({'Allow': 'POST'}, responseHeaders));
        res.end('Method Not Allowed');
        return;
    }

    // The request body is not used; drain it so the 'end' event fires.
    req.resume();
    req.on('end', function () {
        // NOTE(review): requests arriving before the first run() resolves each
        // see an empty endpoint and will each launch their own browser.
        run_spider.run(browserEndpoint).then(function (result_dict) {
            browserEndpoint = result_dict.browserEndpoint;
            console.log("browserEndpoint", browserEndpoint);
            const body = JSON.stringify(result_dict.result);
            res.writeHead(200, responseHeaders);
            res.end(body);
        }).catch(function (err) {
            console.error("spider failed", err);
            if (!res.headersSent) {
                res.writeHead(500, responseHeaders);
            }
            res.end('spider failed');
        });
    });
}).listen(9001, function () {
    console.log('开启服务端口9001');
});
posted @ 2019-08-09 14:16  公众号python学习开发  阅读(426)  评论(0)  编辑  收藏  举报