首图 首都图书馆 国家图书馆 自动查借阅书籍 脚本puppeteer汇总

(15条消息) puppeteer_彭争杰的博客-CSDN博客

api: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md

examples: https://github.com/puppeteer/puppeteer/tree/main/examples

 阮一峰:js 异步操作 - JavaScript 教程 - 网道 (wangdoc.com) 

      Promise 对象 - ECMAScript 6入门 (ruanyifeng.com)

 Node: Puppeteer + 图像识别 实现百度指数爬虫 - 掘金 (juejin.cn)

Puppeteer 用来做爬虫太 Low 了!但用在这里很合适! - 掘金 (juejin.cn)

使用puppeteer控制浏览器中视频播放 - 掘金 (juejin.cn)

奶奶都能轻松入门的 Puppeteer 教程 - 掘金 (juejin.cn)

 

cnpm install puppeteer-core   默认不自带浏览器

cnpm install puppeteer -S

国家图书馆 借书查询 国图

const puppeteer = require('puppeteer');

/**
 * National Library of China (http://opac.nlc.cn/F) loan checker.
 *
 * Launches one browser, then for every configured reader card logs in,
 * reads the loan counter and, when it is positive, prints the title and due
 * date of each borrowed book.
 */
(async () => {
    // Password typed into the "bor_verification" field for every card.
    // Declared ahead of the helper that reads it.
    const secret = "808080";

    const browser = await puppeteer.launch({
        executablePath: "C:\\Program Files\\Google\\Chrome Dev\\Application\\chrome.exe", // or process.env.CHROME_PATH
        headless: true,
        args: ['--start-maximized'],
        defaultViewport: { width: 1800, height: 1000 },
        slowMo: 0
    });

    /**
     * Logs in with one reader card and prints its borrowed books.
     *
     * @param {string} card - Reader card number typed into the "bor_id" field.
     * @param {string} name - Label printed next to the card number.
     * @returns {Promise<Array<Object>>} One `{ "题名", "应还日期" }` object per
     *   borrowed book; an empty array when there is none or when an error
     *   was caught (the error is logged, not rethrown).
     */
    async function visitNationalLibrary(card, name) {
        // Link whose text is the number of current loans; clicking it opens the loan list.
        const loanCountLink = '#history > a:nth-child(1) > table > tbody > tr:nth-child(1) > td.td1 > a';
        const page = await browser.newPage();

        console.log('---------------------------');
        console.log(`${card} ${name}`);

        try {
            await page.goto("http://opac.nlc.cn/F", {
                waitUntil: 'load',
                timeout: 0 // 0 disables the navigation timeout
            });

            await page.waitForXPath("/html/body/form/center/div/table/tbody/tr/td[2]/input");
            await page.type("input[name=bor_id]", card, { delay: 100 });
            await page.type("input[name=bor_verification]", secret);
            // The attribute selector needs its closing bracket, otherwise
            // querySelector rejects it and the form is never submitted.
            await page.$eval('form[name=form1]', form => form.submit());

            await page.waitForXPath('//*[@id="history"]/a[1]/table/tbody/tr[1]/td[2]/a');

            // Read the loan counter inside the page; a missing link counts as zero.
            const loanCount = await page.evaluate((selector) => {
                const link = document.querySelector(selector);
                return link ? parseInt(link.textContent.trim(), 10) : 0;
            }, loanCountLink);

            // Written as a negation so a non-numeric counter (NaN) also means "no book".
            if (!(loanCount > 0)) {
                console.log("No Book.");
                return [];
            }

            await page.click(loanCountLink);
            await page.waitForSelector('#baseinfo > center > table:nth-child(6) > tbody > tr.tr1 > th:nth-child(5)');

            const loans = await page.evaluate(() => {
                // tr:nth-child(n+2) skips the first row of the table body.
                const rows = Array.from(document.querySelectorAll('#baseinfo > center > table:nth-child(6) > tbody > tr:nth-child(n+2)'));
                return rows
                    .map(tr => Array.from(tr.querySelectorAll('td')))
                    .filter(cells => cells.length > 0)
                    .map(cells => ({
                        "题名": cells[3]?.innerText.trim(),
                        "应还日期": cells[5]?.innerText.trim(),
                    }));
            });
            console.log(loans); // print the borrowed books
            return loans;
        } catch (err) {
            console.error(err);
            return [];
        } finally {
            // Runs on every path, including the early "No Book." return.
            await page.close();
        }
    }

    // Reader card number -> display name. Map keys are unique: setting the
    // same card number twice keeps only the last name, so use distinct numbers.
    const names = new Map();
    names.set("8888888888888888", "user1");
    names.set("8888888888888888", "user2");

    try {
        for (const [card, name] of names) {
            await visitNationalLibrary(card, name);
        }
    } finally {
        // Close the browser even when a visit throws unexpectedly.
        await browser.close();
    }
})().catch(e => { console.error(e) });

首都图书馆自动登录查询脚本

需要安装 tesseract.js

cnpm install tesseract.js (用淘宝国内源 cnpm)

const puppeteer = require('puppeteer');
const fs = require('fs');
const Tesseract = require('tesseract.js');

//browser path: "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe"
//              "C:\\Program Files\\Google\\Chrome Dev\\Application\\chrome.exe" 
const chrome_path = "C:\\Program Files\\Google\\Chrome Dev\\Application\\chrome.exe"

// 验证码处理
/**
 * Reads the login captcha with OCR and types the result into the login form,
 * retrying with a fresh captcha image for up to five attempts.
 *
 * @param {object} page - Puppeteer page currently showing the login form.
 * @returns {Promise<boolean>} true when, two seconds after typing the code,
 *   no `.error` element is present or its inline display is 'none';
 *   false when every attempt failed.
 */
async function handleCaptcha(page) {
    const maxAttempts = 5;
    const captchaImageSelector = '#loginform-verifycode-image';
    const captchaInputSelector = 'input[id=loginform-verifycode]';

    // Clicking the captcha image requests a new one; wait for it to load.
    const refreshCaptcha = async () => {
        await page.click(captchaImageSelector);
        await page.waitForTimeout(2000);
    };

    let loginSuccess = false;
    for (let attempt = 0; attempt < maxAttempts && !loginSuccess; attempt++) {
        const isLastAttempt = attempt === maxAttempts - 1;

        // Download the captcha image with fetch inside the page and hand the
        // bytes back to Node as a plain array (ArrayBuffers are not serializable).
        const captchaSrc = await page.$eval(captchaImageSelector, img => img.src);
        const bytes = await page.evaluate(async (src) => {
            const response = await fetch(src);
            const buffer = await response.arrayBuffer();
            return Array.from(new Uint8Array(buffer));
        }, captchaSrc);
        const imageBuffer = Buffer.from(bytes);

        // OCR the image with Tesseract.js using the English language data.
        const { data: { text } } = await Tesseract.recognize(imageBuffer, 'eng', {});

        console.log('识别结果:', text);
        // Whitespace-only output is as useless as an empty string. Re-reading
        // the same image would give the same result, so refresh it first.
        if (text.trim() === '') {
            console.log("识别结果为空,马上重试!");
            if (!isLastAttempt) {
                await refreshCaptcha();
            }
            continue;
        }

        // Clear the captcha field, then type the recognized text.
        await page.waitForXPath("/html/body/div/div[2]/div/form/div[3]/div/div[1]/input");
        await page.$eval(captchaInputSelector, input => { input.value = ''; });
        // NOTE(review): the OCR text is typed untrimmed on purpose. Nothing in
        // this function clicks the login button, so a trailing newline in the
        // OCR output may be what submits the form - confirm before trimming.
        await page.type(captchaInputSelector, text, { delay: 10 });

        // Give the page time to validate, then look for a visible error marker.
        await page.waitForTimeout(2000);
        const errorElement = await page.$('.error');
        let errorVisible = false;
        if (errorElement) {
            const display = await page.evaluate(el => el.style.display, errorElement);
            errorVisible = display !== 'none';
        }

        if (!errorVisible) {
            // No `.error` element (treated as: the page has navigated away)
            // or the element is hidden: consider the captcha accepted.
            loginSuccess = true;
            continue;
        }

        console.log('验证码不正确,请刷新后重试');
        // Only refresh and announce another try when one is actually left.
        if (!isLastAttempt) {
            await refreshCaptcha();
            console.log(`登录失败,已刷新。正在尝试第 ${attempt + 2} 次...`);
        }
    }

    console.log(`验证码验证结果:`, loginSuccess);
    return loginSuccess;
}

/**
 * Prints the loan table of the current page to the Node console.
 *
 * Every row matched by '#w0 table.table tbody tr' becomes an object whose
 * `title` is the text of the row's second cell and whose `dueDate` is the text
 * of its fourth cell; a missing cell yields null.
 *
 * @param {object} page - Puppeteer page (only `$$eval` is used).
 * @returns {Promise<void>}
 */
async function showTable(page) {
    const loans = await page.$$eval('#w0 table.table tbody tr', (trs) => {
        // This callback is evaluated by the page, so the helper lives inside it.
        const cellText = (tr, position) => {
            const cell = tr.querySelector(`td:nth-child(${position})`);
            return cell ? cell.innerText : null;
        };

        return trs.map((tr) => ({
            title: cellText(tr, 2),
            dueDate: cellText(tr, 4),
        }));
    });

    console.log(loans);
}

//----------------- main ------------------------------------
/**
 * Capital Library (www.clcn.net.cn) loan checker.
 *
 * Launches one browser, then for every configured reader card logs in
 * (solving the captcha through handleCaptcha), prints the loan table through
 * showTable and logs out again.
 */
(async () => {
    const browser = await puppeteer.launch({
        executablePath: chrome_path, // or process.env.CHROME_PATH
        headless: true,
        args: ['--start-maximized', '--no-sandbox', '--disable-setuid-sandbox'],
        defaultViewport: { width: 1800, height: 1000 },
        slowMo: 0,
        //dumpio: true,
        timeout: 60000,
    });

    /**
     * Logs in with one reader card, prints its loans and logs out.
     * Errors are logged and flagged through the process exit code so that the
     * remaining cards are still processed.
     *
     * @param {string} card - Reader card number typed into the username field.
     * @param {string} name - Label printed next to the card number.
     * @returns {Promise<void>}
     */
    async function visit(card, name) {
        const loginUrl = 'https://www.clcn.net.cn/user/auth/login';
        const viewLoansButton = "#borInfo > div.borrows > div > div:nth-child(1) > ul > button";
        const logoutButton = '#container > div.container.user > div > div.userinfo > div > div.col-lg-4.col-md-4.col-sm-4.col-xs-12.userinfo-btn > form > button';

        const page = await browser.newPage();
        try {
            console.log(`${card} ${name}`);
            await page.goto(loginUrl, {
                waitUntil: 'domcontentloaded',
                timeout: 10000
            });

            // Card number
            await page.waitForXPath("/html/body/div/div[2]/div/form/div[1]/input");
            const usernameInput = await page.$('input[name="LoginForm[username]"]');
            await usernameInput.type(card, { delay: 10 });

            // Password
            await page.waitForXPath("/html/body/div/div[2]/div/form/div[2]/div/input");
            await page.type("input[id=loginform-password]", '123456', { delay: 10 });

            const loginSuccess = await handleCaptcha(page);
            if (!loginSuccess) {
                // NOTE(review): presumably the logout button exists only on the
                // logged-in page, so nothing is clicked after a failed login.
                return;
            }

            // Open the loan list, wait for its table, then print it.
            await page.waitForSelector(viewLoansButton);
            await page.click(viewLoansButton);
            await page.waitForXPath("/html/body/div/div[2]/div/div[3]/div[2]/div/table");
            await showTable(page);

            await page.click(logoutButton);
        } catch (err) {
            console.error(err);
            // Keep signalling failure to the caller of the script without
            // aborting the cards that are still to be visited.
            process.exitCode = 1;
        } finally {
            await page.close();
        }
    }

    // Reader card number -> display name.
    const names = new Map();
    names.set("88888", "name0");
    names.set("9999", "name1");

    console.log();
    try {
        for (const [card, name] of names) {
            await visit(card, name);
        }
    } finally {
        // Close the browser even when a visit throws unexpectedly.
        await browser.close();
    }

})().catch(err => {
    console.error(err);
    process.exit(1);
});

https://datacadamia.com/web/dom/innerhtml

首都图书馆的老接口:

const puppeteer = require('puppeteer');
const fs = require('fs');
const Tesseract = require('tesseract.js');

//browser path: "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe"
//              "C:\\Program Files\\Google\\Chrome Dev\\Application\\chrome.exe" 
const chrome_path = "C:\\Program Files\\Google\\Chrome Dev\\Application\\chrome.exe"

/**
 * Opens the "My Account" view of the Primo page and prints the loans table.
 *
 * Each row of `#LoansTable` becomes an object keyed by the text of the table's
 * 3rd, 5th and 8th header cells (falling back to `title`, `dueDate` and
 * `location` when a header cell is missing or blank). The values are the text
 * of the row's elements whose id starts with `titleSTL`, `dueDateSTL` and
 * `locationSTL`, or null when such an element is absent.
 *
 * @param {object} page - Puppeteer page that is already logged in.
 * @returns {Promise<void>}
 */
async function showTable(page) {
    const accountLink = '#exlidMyAccount > a';
    await page.waitForSelector(accountLink);
    await page.click(accountLink);
    await page.waitForSelector('#LoansTable');

    const rowSelector = '#LoansTable > tbody > tr';
    // NOTE(review): this wait only resolves once the table has a row; if an
    // account without loans renders no row it ends in a timeout - confirm.
    await page.waitForFunction('document.querySelector("#LoansTable > tbody > tr") !== null');

    // The callback runs inside the page: its console output goes to the
    // browser console and Node breakpoints do not stop in it.
    const data = await page.$$eval(rowSelector, rows => {
        // NOTE(review): this "empty" marker selector looks like it belongs to
        // the www.clcn.net.cn page; verify that it exists on this page.
        if (document.querySelector('#w0 table.table tbody tr td div.empty')) {
            return [];
        }

        const headerRow = document.querySelector('#LoansTable > thead > tr');
        const headerCells = headerRow ? Array.from(headerRow.querySelectorAll('th')) : [];
        // Use the header text as the property name; fall back to a fixed
        // name so a missing header never discards the row data.
        const columnName = (index, fallback) => {
            const label = headerCells[index]?.innerText.trim();
            return label || fallback;
        };
        const titleKey = columnName(2, 'title');
        const dueDateKey = columnName(4, 'dueDate');
        const locationKey = columnName(7, 'location');

        const textOf = (row, selector) => {
            const node = row.querySelector(selector);
            return node ? node.innerText : null;
        };

        return rows.map(row => ({
            [titleKey]: textOf(row, '[id^="titleSTL"]'),
            [dueDateKey]: textOf(row, '[id^="dueDateSTL"]'),
            [locationKey]: textOf(row, '[id^="locationSTL"]'),
        }));
    });

    if (data.length === 0) {
        console.log("=== No table. You don't borrow the book. ===");
    } else {
        console.log(`total books:${data.length}`);
        console.log(data);
    }
}

//----------------- main ------------------------------------
// Kept outside the async main so the final cleanup step can reach the browser
// even when main fails part-way through.
let browser;
// Primo login page; targetURL sends the session back to the search page after login.
const url = 'https://primo.clcn.net.cn/primo_library/libweb/action/loginpage.do?targetURL=https%3a%2f%2fprimo.clcn.net.cn%2fprimo_library%2flibweb%2faction%2fsearch.do%3fvid%3dST%26amp%3bdscnt%3d0%26amp%3bdstmp%3d1705632515141%26amp%3binitializeIndex%3dtrue&isMobile=false';

/**
 * Capital Library, old Primo interface: for every configured reader card log
 * in, print the loans table through showTable and sign out.
 * The process exits with code 0 when main completed and 1 when it threw.
 */
(async () => {
    browser = await puppeteer.launch({
        executablePath: chrome_path, // or process.env.CHROME_PATH
        headless: false,
        args: ['--no-sandbox', '--disable-setuid-sandbox'], // '--start-maximized',
        defaultViewport: { width: 1800, height: 1000 },
        slowMo: 0,
        //dumpio: true,
        timeout: 60000,
    });

    /**
     * Logs in with one reader card, prints its loans and signs out.
     * Errors are logged and swallowed so the remaining cards are still visited.
     *
     * @param {string} card - Reader card number typed into the username field.
     * @param {string} name - Label printed next to the card number.
     * @returns {Promise<void>}
     */
    async function visit(card, name) {
        const loginButton = "body > div.EXLPRMLoginCard.layout-align-center-start.layout-row > div.EXLPRMLoginColumn.layout-column.flex-xs-100.flex-sm-100.flex-25 > form > md-card > div.md-actions.layout-align-end-center.layout-row > a.EXLPRMLoginButtonSubmit.md-button.md-ink-ripple";
        const page = await browser.newPage();
        try {
            console.log(`${card} ${name}`);

            await page.goto(url, {
                waitUntil: 'domcontentloaded',
                timeout: 10000
            });

            // Login form: user name, then password, then the submit link.
            await page.waitForXPath("/html/body/div[2]/div[2]/form/md-card/md-card-content/md-input-container[1]/input");
            const usernameInput = await page.$('input[name="username"]');
            await usernameInput.type(card, { delay: 10 });
            await page.type("input[id=input_1]", '888888', { delay: 10 });
            await page.click(loginButton);

            // Wait for the third entry of the page's top menu before reading the loans.
            await page.waitForXPath("/html/body/div[1]/div[2]/div[2]/ul/li[3]/a");

            await showTable(page);

            await page.click('#exlidSignOut > a');
        } catch (error) {
            console.error(error);
        } finally {
            await page.close();
        }
    }

    // Reader card number -> display name.
    const names = new Map();
    names.set("000000000000", "zhang san");
    names.set("888888888888", "li si");

    for (const [card, name] of names) {
        await visit(card, name);
    }

    console.log("https://www.clcn.net.cn/user/my/index");
})().then(
    () => 0,
    err => {
        console.error(err);
        return 1;
    }
).then(async exitCode => {
    // Single, awaited close. `browser` is still undefined when launch itself
    // failed, and a failing close must not hide the real exit code.
    if (browser) {
        await browser.close().catch(closeErr => console.error(closeErr));
    }
    process.exit(exitCode);
});

完全替换元素内容:

// Assigning to innerHTML (rather than appending with +=) discards the existing
// children of <body> and replaces them with the parsed fragment.
const htmlFragment = "<p>Replacing the whole body node content with a paragraph</p>";
document.body.innerHTML = htmlFragment;

通过 innerHTML 插入的 script 元素不会被执行

DOM - insertAdjacentElement / insertAdjacentHTML 比 appendChild 更灵活(可以指定插入位置);用 insertAdjacentElement 插入由 createElement 创建的 script 元素时,脚本会被执行

https://datacadamia.com/web/dom/insertadjacent

// Create a <script> element carrying inline code and insert it as the last
// child of <body> ('beforeend').
const bodySibling = Object.assign(document.createElement('script'), {
  text: 'console.log("Hello World !");',
});
document.body.insertAdjacentElement('beforeend', bodySibling);

插入element

// Create a <p> element and insert it as the first child of <body> ('afterbegin').
const pSibling = Object.assign(document.createElement('p'), {
  innerText: 'A paragraph',
});
document.body.insertAdjacentElement('afterbegin', pSibling);

插入html

// Parse the markup and insert the result directly after the <body> element
// itself ('afterend'), i.e. as its next sibling.
const siblingMarkup = '<p>Body Sibling HTML</p>';
document.body.insertAdjacentHTML('afterend', siblingMarkup);

注入js

https://www.tabnine.com/code/javascript/functions/puppeteer/Page/%2524eval

/**
 * Opens https://web.whatsapp.com in a visible browser that keeps its profile
 * in ./ChromeSession, waits until an element matching `*[data-icon=chat]`
 * exists, then injects WAPI.js and inject.js (both resolved next to this file)
 * into the page. The browser is intentionally left open.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const sessionDir = path.join(process.cwd(), "ChromeSession");
  const browser = await puppeteer.launch({ headless: false, userDataDir: sessionDir });

  const page = await browser.newPage();
  const navigationOptions = { waitUntil: 'networkidle0', timeout: 0 };
  await page.goto('https://web.whatsapp.com', navigationOptions);

  // timeout: 0 means wait indefinitely for the selector to appear.
  const waitOptions = { polling: 1000, timeout: 0 };
  await page.waitForSelector('*[data-icon=chat]', waitOptions);

  console.log("Logged in!")

  // Inject the two scripts one after the other, in this order.
  for (const scriptName of ["WAPI.js", "inject.js"]) {
    const scriptPath = path.join(__dirname, scriptName);
    await page.addScriptTag({ path: require.resolve(scriptPath) });
  }

  //await browser.close();
}

获取元素用 page.$ 或 page.evaluate

https://helloworldmaster.com/article/get-a-dom-element-using-puppeteer

posted @ 2022-04-11 18:11  Bigben  阅读(198)  评论(0)    收藏  举报