爬虫学习

爬虫

定义爬取目标(URL)

在浏览器网络部分能让我们看见真正需要的url

image-20260118144033035

#定义url
    url = f"https://www.mashangpa.com/api/problem-detail/1/data/?page={i}"

但大多数情况我们需要携带请求头

# Request headers sent with every request (the cookies are passed separately below).
headers = {
    "accept": "*/*",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "priority": "u=1, i",
    "referer": "https://www.mashangpa.com/problem-detail/1/",
    "sec-ch-ua": "\"Microsoft Edge\";v=\"143\", \"Chromium\";v=\"143\", \"Not A(Brand\";v=\"24\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36 Edg/143.0.0.0"
}
# Cookies sent with every request.
# NOTE(review): "sessionid" looks like a live login session token — TODO confirm;
# substitute your own value and avoid publishing real session ids.
cookies = {
    "sessionid": "766fuslyt5ngd3nr6sfyluiili3hmfxn",
    "Hm_lvt_0d2227abf9548feda3b9cb6fddee26c0": "1768642776,1768713959",
    "HMACCOUNT": "48E38AF4922BDFCB",
    "Hm_lpvt_0d2227abf9548feda3b9cb6fddee26c0": "1768714024"
}

这一步可以让工具帮我们

https://spidertools.cn/#/curl2Request

image-20260118144251485

在浏览器网络面板中右键该请求,选择"复制为 cURL (bash)",粘贴到工具里面即可拿到请求头格式

接下来就是发送请求了

#发起get请求
    response = requests.get(url, headers=headers, cookies=cookies)
    sum_data += sum(response.json()["current_array"])

码上爬第一题exp

import requests

# Request headers sent with every request (the cookies are passed separately below).
headers = {
    "accept": "*/*",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "priority": "u=1, i",
    "referer": "https://www.mashangpa.com/problem-detail/1/",
    "sec-ch-ua": "\"Microsoft Edge\";v=\"143\", \"Chromium\";v=\"143\", \"Not A(Brand\";v=\"24\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36 Edg/143.0.0.0"
}
# Cookies sent with every request.
# NOTE(review): "sessionid" looks like a live login session token — TODO confirm;
# substitute your own value and avoid publishing real session ids.
cookies = {
    "sessionid": "766fuslyt5ngd3nr6sfyluiili3hmfxn",
    "Hm_lvt_0d2227abf9548feda3b9cb6fddee26c0": "1768642776,1768713959",
    "HMACCOUNT": "48E38AF4922BDFCB",
    "Hm_lpvt_0d2227abf9548feda3b9cb6fddee26c0": "1768714024"
}

# Running total of every number returned across all pages.
sum_data = 0

# The data for this problem is spread over pages 1..20.
for i in range(1, 21):
    # Endpoint for the current page.
    url = f"https://www.mashangpa.com/api/problem-detail/1/data/?page={i}"
    # requests has no default timeout; without one a stalled connection
    # would block the script forever.
    response = requests.get(url, headers=headers, cookies=cookies, timeout=10)
    # Fail loudly on 4xx/5xx (e.g. an expired session cookie) instead of
    # crashing later with an opaque JSON/KeyError.
    response.raise_for_status()
    sum_data += sum(response.json()["current_array"])
    # Show the running total; the value printed after page 20 is the answer.
    print(sum_data)

第二题一样的

浏览器调试被禁用(无限debugger) lv3

image-20260118145148680

打开开发者模式发现被禁用,观察该文件明白是无限debugger

(function anonymous(
) {
debugger
})

现在我们需要解决他

image-20260118145449996

法一 一律不暂停

右键 debugger 所在行(第 3 行)的行号,选择"一律不暂停"

image-20260118145605188

然后刷新发现没有成功,依旧空白,尝试法2

法二 本地替换js文件

image-20260118150006564

我们跟进调用堆栈,看见了该debugger代码

image-20260118150047600

而这块代码会让我们在触发反爬后进入空白页面:

window.location.href = "about:blank";
!function () {
    if (window.outerWidth - window.innerWidth > 210 || window.outerHeight - window.innerHeight > 210) {
        document.getElementsByTagName("body")[0].innerHTML = '检测到非法调试, 请关闭调试终端后刷新本页面重试!<br/>Welcome for People, Not Welcome for Machine!<br/>';
        window.location.href = "about:blank";
    }

那我们直接把反爬代码删掉就行了,我们要修改本地js文件

edge浏览器的替换方式

image-20260118150850371

谷歌浏览器直接右键js代码,会有替换键

image-20260118150914951

然后替换自己想要的一个文件夹,我这里创建了一个新文件夹叫js文件替换,然后右键要替换的文件,选择修改

image-20260118151310966

这时候就进入了本地修改,有紫色标识,现在进行注释,并保存

image-20260118151556177

刷新界面,成功越过

image-20260118151638265

之后就是爬取文件和数字了

md5加密逆向分析 lv4

image-20260118152249549

提示是sign加密了,我们查找sign跟进发现了这个文件

image-20260118152519799

function loadPage(pageNumber) {
    const timestamp = new Date().getTime()
    window.token = window.md5("tuling" + timestamp + pageNumber)
    const params = {
        page: pageNumber,
        sign: window.token,
        _ts: timestamp,
    };

对sign打上断点,点击第二页查看我们找对没

image-20260118152709011

image-20260118152820758

md5值对上了,证明这段加密我们没找错

window.token = window.md5("tuling" + timestamp + pageNumber)

第一个值 "tuling" 是固定的加盐值,第二个值是当前时间戳(毫秒),第三个值是页码

const timestamp = new Date().getTime()

至此我们的加密逻辑出来了,我们自己组装一下

// Intermediate check: rebuild the sign for one page and print it.
// Relies on window.md5 being defined by the site's MD5 script.
function loadPage(pageNumber) {
    const timestamp = Date.now();
    // Same concatenation order as the site: salt, timestamp, page number.
    const payload = "tuling" + timestamp + pageNumber;
    const sign = window.md5(payload);
    console.log(sign);
}

loadPage(1);

现在有个问题,window里面没有md5,所以我们还需要去找他写的加密逻辑

window = this, function (n) {
    // Add two 32-bit integers with wrap-around. The low 16 bits are summed
    // separately so their carry can be folded into the high 16 bits by hand.
    function r(n, r) {
        var t = (65535 & n) + (65535 & r);
        return (n >> 16) + (r >> 16) + (t >> 16) << 16 | 65535 & t
    }

    // Shared core of the four round helpers (o, u, e, c): sums t, n, u and c
    // with r(), rotates the 32-bit result left by e bits, then adds o.
    // The inner function is called with a single argument; its second
    // parameter is unused.
    function t(n, t, o, u, e, c) {
        return r(function (n, r) {
            return n << e | n >>> 32 - e
        }(r(r(t, n), r(u, c))), o)
    }

    // Round helper used for the first 16 steps of each block: mixes the state
    // words as (r & o) | (~r & u) (MD5's "F" function) and forwards to t().
    function o(n, r, o, u, e, c, f) {
        return t(r & o | ~r & u, n, r, e, c, f)
    }

    // Round helper used for steps 17-32 of each block: mixes the state words
    // as (r & u) | (o & ~u) (MD5's "G" function) and forwards to t().
    function u(n, r, o, u, e, c, f) {
        return t(r & u | o & ~u, n, r, e, c, f)
    }

    // Round helper used for steps 33-48 of each block: mixes the state words
    // as r ^ o ^ u (MD5's "H" function) and forwards to t().
    function e(n, r, o, u, e, c, f) {
        return t(r ^ o ^ u, n, r, e, c, f)
    }

    // Round helper used for steps 49-64 of each block: mixes the state words
    // as o ^ (r | ~u) (MD5's "I" function) and forwards to t().
    function c(n, r, o, u, e, c, f) {
        return t(o ^ (r | ~u), n, r, e, c, f)
    }

    // Core MD5 transform.
    //   n: message as an array of 32-bit words (padded in place)
    //   t: message length in bits (callers pass 8 * byte length)
    // Returns the four 32-bit state words [l, d, v, C].
    function f(n, t) {
        var f, i, a, h, g;
        // Padding: set the 0x80 marker bit right after the message and store
        // the bit length in word 14 of the final 16-word block.
        n[t >> 5] |= 128 << t % 32, n[14 + (t + 64 >>> 9 << 4)] = t;
        // Initial state words.
        var l = 1732584193, d = -271733879, v = -1732584194, C = 271733878;
        // One iteration per 16-word block: i/a/h/g remember the state from
        // before the block and are added back (via r) after the 64 steps.
        for (f = 0; f < n.length; f += 16) d = c(d = c(d = c(d = c(d = e(d = e(d = e(d = e(d = u(d = u(d = u(d = u(d = o(d = o(d = o(d = o(a = d, v = o(h = v, C = o(g = C, l = o(i = l, d, v, C, n[f], 7, -680876936), d, v, n[f + 1], 12, -389564586), l, d, n[f + 2], 17, 606105819), C, l, n[f + 3], 22, -1044525330), v = o(v, C = o(C, l = o(l, d, v, C, n[f + 4], 7, -176418897), d, v, n[f + 5], 12, 1200080426), l, d, n[f + 6], 17, -1473231341), C, l, n[f + 7], 22, -45705983), v = o(v, C = o(C, l = o(l, d, v, C, n[f + 8], 7, 1770035416), d, v, n[f + 9], 12, -1958414417), l, d, n[f + 10], 17, -42063), C, l, n[f + 11], 22, -1990404162), v = o(v, C = o(C, l = o(l, d, v, C, n[f + 12], 7, 1804603682), d, v, n[f + 13], 12, -40341101), l, d, n[f + 14], 17, -1502002290), C, l, n[f + 15], 22, 1236535329), v = u(v, C = u(C, l = u(l, d, v, C, n[f + 1], 5, -165796510), d, v, n[f + 6], 9, -1069501632), l, d, n[f + 11], 14, 643717713), C, l, n[f], 20, -373897302), v = u(v, C = u(C, l = u(l, d, v, C, n[f + 5], 5, -701558691), d, v, n[f + 10], 9, 38016083), l, d, n[f + 15], 14, -660478335), C, l, n[f + 4], 20, -405537848), v = u(v, C = u(C, l = u(l, d, v, C, n[f + 9], 5, 568446438), d, v, n[f + 14], 9, -1019803690), l, d, n[f + 3], 14, -187363961), C, l, n[f + 8], 20, 1163531501), v = u(v, C = u(C, l = u(l, d, v, C, n[f + 13], 5, -1444681467), d, v, n[f + 2], 9, -51403784), l, d, n[f + 7], 14, 1735328473), C, l, n[f + 12], 20, -1926607734), v = e(v, C = e(C, l = e(l, d, v, C, n[f + 5], 4, -378558), d, v, n[f + 8], 11, -2022574463), l, d, n[f + 11], 16, 1839030562), C, l, n[f + 14], 23, -35309556), v = e(v, C = e(C, l = e(l, d, v, C, n[f + 1], 4, -1530992060), d, v, n[f + 4], 11, 1272893353), l, d, n[f + 7], 16, -155497632), C, l, n[f + 10], 23, -1094730640), v = e(v, C = e(C, l = e(l, d, v, C, n[f + 13], 4, 681279174), d, v, n[f], 11, -358537222), l, d, n[f + 3], 16, -722521979), C, l, n[f + 6], 23, 76029189), v = e(v, C = e(C, l = e(l, d, v, C, n[f + 9], 4, -640364487), d, v, n[f + 
12], 11, -421815835), l, d, n[f + 15], 16, 530742520), C, l, n[f + 2], 23, -995338651), v = c(v, C = c(C, l = c(l, d, v, C, n[f], 6, -198630844), d, v, n[f + 7], 10, 1126891415), l, d, n[f + 14], 15, -1416354905), C, l, n[f + 5], 21, -57434055), v = c(v, C = c(C, l = c(l, d, v, C, n[f + 12], 6, 1700485571), d, v, n[f + 3], 10, -1894986606), l, d, n[f + 10], 15, -1051523), C, l, n[f + 1], 21, -2054922799), v = c(v, C = c(C, l = c(l, d, v, C, n[f + 8], 6, 1873313359), d, v, n[f + 15], 10, -30611744), l, d, n[f + 6], 15, -1560198380), C, l, n[f + 13], 21, 1309151649), v = c(v, C = c(C, l = c(l, d, v, C, n[f + 4], 6, -145523070), d, v, n[f + 11], 10, -1120210379), l, d, n[f + 2], 15, 718787259), C, l, n[f + 9], 21, -343485551), l = r(l, i), d = r(d, a), v = r(v, h), C = r(C, g);
        return [l, d, v, C]
    }

    // Unpack an array of 32-bit words into a binary string: one character per
    // byte, lowest byte of each word first.
    function i(n) {
        var r, t = "", o = 32 * n.length;
        for (r = 0; r < o; r += 8) t += String.fromCharCode(n[r >> 5] >>> r % 32 & 255);
        return t
    }

    // Pack a binary string into 32-bit words: four characters per word, first
    // character in the lowest byte. Only the low 8 bits of each char code are
    // used.
    function a(n) {
        var r, t = [];
        // Size the array to n.length >> 2 words and zero-fill it.
        for (t[(n.length >> 2) - 1] = void 0, r = 0; r < t.length; r += 1) t[r] = 0;
        var o = 8 * n.length;
        for (r = 0; r < o; r += 8) t[r >> 5] |= (255 & n.charCodeAt(r / 8)) << r % 32;
        return t
    }

    // Hex-encode a binary string: two lowercase hex digits per character
    // (high nibble first), using only the low 8 bits of each char code.
    function h(n) {
        var r, t, o = "0123456789abcdef", u = "";
        for (t = 0; t < n.length; t += 1) r = n.charCodeAt(t), u += o.charAt(r >>> 4 & 15) + o.charAt(15 & r);
        return u
    }

    // Re-encode a JS string as UTF-8 with one character per byte:
    // encodeURIComponent percent-encodes the UTF-8 bytes and unescape turns
    // each %XX back into a single character.
    function g(n) {
        return unescape(encodeURIComponent(n))
    }

    // Raw MD5 digest of string n, returned as a 16-character binary string:
    // UTF-8 encode (g), pack into words (a), hash (f), unpack (i).
    function l(n) {
        return function (n) {
            return i(f(a(n), 8 * n.length))
        }(g(n))
    }

    // Keyed (HMAC-style) MD5 with key n and message r, returned as a raw
    // binary string. Both inputs are UTF-8 encoded with g() first.
    function d(n, r) {
        return function (n, r) {
            var t, o, u = a(n), e = [], c = [];
            // Keys longer than 16 words are hashed first. The key is then
            // XORed with 909522486 (0x36363636) for the inner pad and
            // 1549556828 (0x5c5c5c5c) for the outer pad.
            for (e[15] = c[15] = void 0, 16 < u.length && (u = f(u, 8 * n.length)), t = 0; t < 16; t += 1) e[t] = 909522486 ^ u[t], c[t] = 1549556828 ^ u[t];
            // Inner hash over (inner pad + message), then outer hash over
            // (outer pad + inner digest); 640 = 512 + 128 bits.
            return o = f(e.concat(a(r)), 512 + 8 * r.length), i(f(c.concat(o), 640))
        }(g(n), g(r))
    }

    // Public entry point, exposed as window.md5(n, r, t).
    //   n: message string
    //   r: optional key; when truthy the keyed digest d(r, n) is computed
    //   t: when truthy return the raw binary string, otherwise lowercase hex
    window.md5 = function (n, r, t) {
        return r ? t ? d(r, n) : function (n, r) {
            return h(d(n, r))
        }(r, n) : t ? l(n) : function (n) {
            return h(l(n))
        }(n)
    }
}();

我要验牌,牌没有问题

image-20260118160316669

至此加密逆向完成

然后让python拿到我们的js加密的内容(sign 和时间戳),带入请求参数访问,然后爬虫就行了

# Load the JavaScript source that implements the signing logic.
with open("demo.js", encoding="utf-8", mode="r") as f:
    js_code = f.read()
# Compile the JS source once into an execution context; functions defined in
# demo.js can then be invoked from Python through ctx.call(...).
ctx = execjs.compile(js_code)

**********

# For each page 1..20, ask the compiled JS for a sign/timestamp pair and build
# the query parameters from it.
for i in range(1, 21):
    json_data =ctx.call("loadPage", i,)
    # ctx.call invokes, from Python, a JavaScript function defined in the
    # compiled JS code (js_code).
    params = {
        "page": i,
        "sign": json_data["sign"],
        "_ts": json_data["ts"],
    }

贴源码

demo.js

window = this, function (n) {
    // Add two 32-bit integers with wrap-around. The low 16 bits are summed
    // separately so their carry can be folded into the high 16 bits by hand.
    function r(n, r) {
        var t = (65535 & n) + (65535 & r);
        return (n >> 16) + (r >> 16) + (t >> 16) << 16 | 65535 & t
    }

    // Shared core of the four round helpers (o, u, e, c): sums t, n, u and c
    // with r(), rotates the 32-bit result left by e bits, then adds o.
    // The inner function is called with a single argument; its second
    // parameter is unused.
    function t(n, t, o, u, e, c) {
        return r(function (n, r) {
            return n << e | n >>> 32 - e
        }(r(r(t, n), r(u, c))), o)
    }

    // Round helper used for the first 16 steps of each block: mixes the state
    // words as (r & o) | (~r & u) (MD5's "F" function) and forwards to t().
    function o(n, r, o, u, e, c, f) {
        return t(r & o | ~r & u, n, r, e, c, f)
    }

    // Round helper used for steps 17-32 of each block: mixes the state words
    // as (r & u) | (o & ~u) (MD5's "G" function) and forwards to t().
    function u(n, r, o, u, e, c, f) {
        return t(r & u | o & ~u, n, r, e, c, f)
    }

    // Round helper used for steps 33-48 of each block: mixes the state words
    // as r ^ o ^ u (MD5's "H" function) and forwards to t().
    function e(n, r, o, u, e, c, f) {
        return t(r ^ o ^ u, n, r, e, c, f)
    }

    // Round helper used for steps 49-64 of each block: mixes the state words
    // as o ^ (r | ~u) (MD5's "I" function) and forwards to t().
    function c(n, r, o, u, e, c, f) {
        return t(o ^ (r | ~u), n, r, e, c, f)
    }

    // Core MD5 transform.
    //   n: message as an array of 32-bit words (padded in place)
    //   t: message length in bits (callers pass 8 * byte length)
    // Returns the four 32-bit state words [l, d, v, C].
    function f(n, t) {
        var f, i, a, h, g;
        // Padding: set the 0x80 marker bit right after the message and store
        // the bit length in word 14 of the final 16-word block.
        n[t >> 5] |= 128 << t % 32, n[14 + (t + 64 >>> 9 << 4)] = t;
        // Initial state words.
        var l = 1732584193, d = -271733879, v = -1732584194, C = 271733878;
        // One iteration per 16-word block: i/a/h/g remember the state from
        // before the block and are added back (via r) after the 64 steps.
        for (f = 0; f < n.length; f += 16) d = c(d = c(d = c(d = c(d = e(d = e(d = e(d = e(d = u(d = u(d = u(d = u(d = o(d = o(d = o(d = o(a = d, v = o(h = v, C = o(g = C, l = o(i = l, d, v, C, n[f], 7, -680876936), d, v, n[f + 1], 12, -389564586), l, d, n[f + 2], 17, 606105819), C, l, n[f + 3], 22, -1044525330), v = o(v, C = o(C, l = o(l, d, v, C, n[f + 4], 7, -176418897), d, v, n[f + 5], 12, 1200080426), l, d, n[f + 6], 17, -1473231341), C, l, n[f + 7], 22, -45705983), v = o(v, C = o(C, l = o(l, d, v, C, n[f + 8], 7, 1770035416), d, v, n[f + 9], 12, -1958414417), l, d, n[f + 10], 17, -42063), C, l, n[f + 11], 22, -1990404162), v = o(v, C = o(C, l = o(l, d, v, C, n[f + 12], 7, 1804603682), d, v, n[f + 13], 12, -40341101), l, d, n[f + 14], 17, -1502002290), C, l, n[f + 15], 22, 1236535329), v = u(v, C = u(C, l = u(l, d, v, C, n[f + 1], 5, -165796510), d, v, n[f + 6], 9, -1069501632), l, d, n[f + 11], 14, 643717713), C, l, n[f], 20, -373897302), v = u(v, C = u(C, l = u(l, d, v, C, n[f + 5], 5, -701558691), d, v, n[f + 10], 9, 38016083), l, d, n[f + 15], 14, -660478335), C, l, n[f + 4], 20, -405537848), v = u(v, C = u(C, l = u(l, d, v, C, n[f + 9], 5, 568446438), d, v, n[f + 14], 9, -1019803690), l, d, n[f + 3], 14, -187363961), C, l, n[f + 8], 20, 1163531501), v = u(v, C = u(C, l = u(l, d, v, C, n[f + 13], 5, -1444681467), d, v, n[f + 2], 9, -51403784), l, d, n[f + 7], 14, 1735328473), C, l, n[f + 12], 20, -1926607734), v = e(v, C = e(C, l = e(l, d, v, C, n[f + 5], 4, -378558), d, v, n[f + 8], 11, -2022574463), l, d, n[f + 11], 16, 1839030562), C, l, n[f + 14], 23, -35309556), v = e(v, C = e(C, l = e(l, d, v, C, n[f + 1], 4, -1530992060), d, v, n[f + 4], 11, 1272893353), l, d, n[f + 7], 16, -155497632), C, l, n[f + 10], 23, -1094730640), v = e(v, C = e(C, l = e(l, d, v, C, n[f + 13], 4, 681279174), d, v, n[f], 11, -358537222), l, d, n[f + 3], 16, -722521979), C, l, n[f + 6], 23, 76029189), v = e(v, C = e(C, l = e(l, d, v, C, n[f + 9], 4, -640364487), d, v, n[f + 
12], 11, -421815835), l, d, n[f + 15], 16, 530742520), C, l, n[f + 2], 23, -995338651), v = c(v, C = c(C, l = c(l, d, v, C, n[f], 6, -198630844), d, v, n[f + 7], 10, 1126891415), l, d, n[f + 14], 15, -1416354905), C, l, n[f + 5], 21, -57434055), v = c(v, C = c(C, l = c(l, d, v, C, n[f + 12], 6, 1700485571), d, v, n[f + 3], 10, -1894986606), l, d, n[f + 10], 15, -1051523), C, l, n[f + 1], 21, -2054922799), v = c(v, C = c(C, l = c(l, d, v, C, n[f + 8], 6, 1873313359), d, v, n[f + 15], 10, -30611744), l, d, n[f + 6], 15, -1560198380), C, l, n[f + 13], 21, 1309151649), v = c(v, C = c(C, l = c(l, d, v, C, n[f + 4], 6, -145523070), d, v, n[f + 11], 10, -1120210379), l, d, n[f + 2], 15, 718787259), C, l, n[f + 9], 21, -343485551), l = r(l, i), d = r(d, a), v = r(v, h), C = r(C, g);
        return [l, d, v, C]
    }

    // Unpack an array of 32-bit words into a binary string: one character per
    // byte, lowest byte of each word first.
    function i(n) {
        var r, t = "", o = 32 * n.length;
        for (r = 0; r < o; r += 8) t += String.fromCharCode(n[r >> 5] >>> r % 32 & 255);
        return t
    }

    // Pack a binary string into 32-bit words: four characters per word, first
    // character in the lowest byte. Only the low 8 bits of each char code are
    // used.
    function a(n) {
        var r, t = [];
        // Size the array to n.length >> 2 words and zero-fill it.
        for (t[(n.length >> 2) - 1] = void 0, r = 0; r < t.length; r += 1) t[r] = 0;
        var o = 8 * n.length;
        for (r = 0; r < o; r += 8) t[r >> 5] |= (255 & n.charCodeAt(r / 8)) << r % 32;
        return t
    }

    // Hex-encode a binary string: two lowercase hex digits per character
    // (high nibble first), using only the low 8 bits of each char code.
    function h(n) {
        var r, t, o = "0123456789abcdef", u = "";
        for (t = 0; t < n.length; t += 1) r = n.charCodeAt(t), u += o.charAt(r >>> 4 & 15) + o.charAt(15 & r);
        return u
    }

    // Re-encode a JS string as UTF-8 with one character per byte:
    // encodeURIComponent percent-encodes the UTF-8 bytes and unescape turns
    // each %XX back into a single character.
    function g(n) {
        return unescape(encodeURIComponent(n))
    }

    // Raw MD5 digest of string n, returned as a 16-character binary string:
    // UTF-8 encode (g), pack into words (a), hash (f), unpack (i).
    function l(n) {
        return function (n) {
            return i(f(a(n), 8 * n.length))
        }(g(n))
    }

    // Keyed (HMAC-style) MD5 with key n and message r, returned as a raw
    // binary string. Both inputs are UTF-8 encoded with g() first.
    function d(n, r) {
        return function (n, r) {
            var t, o, u = a(n), e = [], c = [];
            // Keys longer than 16 words are hashed first. The key is then
            // XORed with 909522486 (0x36363636) for the inner pad and
            // 1549556828 (0x5c5c5c5c) for the outer pad.
            for (e[15] = c[15] = void 0, 16 < u.length && (u = f(u, 8 * n.length)), t = 0; t < 16; t += 1) e[t] = 909522486 ^ u[t], c[t] = 1549556828 ^ u[t];
            // Inner hash over (inner pad + message), then outer hash over
            // (outer pad + inner digest); 640 = 512 + 128 bits.
            return o = f(e.concat(a(r)), 512 + 8 * r.length), i(f(c.concat(o), 640))
        }(g(n), g(r))
    }

    // Public entry point, exposed as window.md5(n, r, t).
    //   n: message string
    //   r: optional key; when truthy the keyed digest d(r, n) is computed
    //   t: when truthy return the raw binary string, otherwise lowercase hex
    window.md5 = function (n, r, t) {
        return r ? t ? d(r, n) : function (n, r) {
            return h(d(n, r))
        }(r, n) : t ? l(n) : function (n) {
            return h(l(n))
        }(n)
    }
}();

// Build the request signature for one page.
// Returns { sign, ts }: ts is the current time in milliseconds and sign is
// window.md5("tuling" + ts + pageNumber).
function loadPage(pageNumber) {
    const ts = Date.now();
    // Same concatenation order as the site: salt, timestamp, page number.
    const payload = "tuling" + ts + pageNumber;
    return { "sign": window.md5(payload), "ts": ts };
}

lv4.py

import requests
import execjs

# Load the JavaScript source that implements the signing logic.
with open("demo.js", encoding="utf-8", mode="r") as f:
    js_code = f.read()
# Compile the JS source once into an execution context; functions defined in
# demo.js can then be invoked from Python through ctx.call(...).
ctx = execjs.compile(js_code)
# Request headers sent with every request (the cookies are passed separately below).
headers = {
    "accept": "*/*",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "cache-control": "no-cache",
    "pragma": "no-cache",
    "priority": "u=1, i",
    "referer": "https://www.mashangpa.com/problem-detail/4/",
    "sec-ch-ua": "\"Microsoft Edge\";v=\"143\", \"Chromium\";v=\"143\", \"Not A(Brand\";v=\"24\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36 Edg/143.0.0.0"
}
# Cookies sent with every request.
# NOTE(review): "sessionid" looks like a live login session token — TODO confirm;
# substitute your own value and avoid publishing real session ids.
cookies = {
    "sessionid": "766fuslyt5ngd3nr6sfyluiili3hmfxn",
    "Hm_lvt_0d2227abf9548feda3b9cb6fddee26c0": "1768642776,1768713959",
    "HMACCOUNT": "48E38AF4922BDFCB",
    "Hm_lpvt_0d2227abf9548feda3b9cb6fddee26c0": "1768720771"
}
# Data endpoint for problem 4; page, sign and _ts are supplied as query parameters.
url = "https://www.mashangpa.com/api/problem-detail/4/data/"

# Running total of every number returned across all pages.
sum_data = 0

# Walk pages 1..20; every request needs a freshly computed sign/timestamp pair.
for i in range(1, 21):
    # Run loadPage(i) from demo.js; it returns {"sign": ..., "ts": ...}.
    json_data = ctx.call("loadPage", i)

    params = {
        "page": i,
        "sign": json_data["sign"],
        "_ts": json_data["ts"],
    }
    # requests has no default timeout; without one a stalled connection
    # would block the script forever.
    response = requests.get(
        url, headers=headers, cookies=cookies, params=params, timeout=10
    )
    # Fail loudly on 4xx/5xx (e.g. a rejected sign or expired session) instead
    # of crashing later with an opaque JSON/KeyError.
    response.raise_for_status()
    sum_data += sum(response.json()["current_array"])
print(sum_data)

posted @ 2026-01-18 16:30  paaai  阅读(1)  评论(0)    收藏  举报