爬虫终阶上

1.执行JS代码

假如在逆向分析时,发现某个js加密算法比较繁琐,用Python还原同样的算法比较费劲。此时,可以不必使用Python还原,而是利用Python去直接调用JavaScript中定义的功能。

想实现Python调用JavaScript代码,需如下步骤:

  • 在电脑上安装node.js(软件)
  • 安装Python的第三方模块pyexecjs
  • 利用 pyexecjs 调用 nodejs 去执行JavaScript代码

方式1

/**
 * Append the fixed suffix 'i666' to the given value.
 */
function func(arg) {
    return arg + 'i666';
}
// process.argv[0] is the node executable and process.argv[1] is the script
// path; the first user-supplied argument (`node demo.js "zbb"`) is at index 2.
var a1 = process.argv[2];
var data = func(a1);
console.log(data);
import subprocess

# Run demo.js under node and capture its stdout. The command is given as an
# argument list (no shell), so the argument needs no manual quoting.
res = subprocess.check_output(['node', 'demo.js', 'zbb'])
data_string = res.decode('utf-8')
print(data_string)

方式2

pip3.11 install pyexecjs
import execjs

# Compile the JavaScript source once, then call its function by name.
context = execjs.compile(
    """
function func(arg) {
    return arg + '666';
}
"""
)

print(context.call("func", "zbb"))  # zbb666

2.案例xx

import execjs
import requests
import ddddocr
from bs4 import BeautifulSoup
from lxml import etree

# 1. Request the home page.
# All requests go through one session object so cookies do not have to be
# collected by hand.
s=requests.session()
res = s.get(url="https://xuexi.chinabett.com/")

# 2. Extract the captcha image URL from the page
tree = etree.HTML(res.text)
image_tag2 = tree.xpath('//*[@id="imgVerifity"]/@src')
code_src = image_tag2[0]

# 3. Download the captcha image and recognize it with OCR
res = s.get(url=f"https://xuexi.chinabett.com{code_src}")
ocr = ddddocr.DdddOcr(show_ad=False)
code = ocr.classification(res.content)

# 4. Encode the username and password
# The JavaScript below provides three functions:
#   base64encode(str)    - hand-written Base64 encoder
#   s1()                 - returns one random character from [0-9A-Za-z]
#   encryptPwd(password) - inserts an s1() character after every character
#                          of `password` except the last one
js_string = """
function base64encode(str) {
    var base64EncodeChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    var base64DecodeChars = new Array(
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
    -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
    -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1);
    var out, i, len;
    var c1, c2, c3;
    len = str.length;
    i = 0;
    out = "";
    while (i < len) {
        c1 = str.charCodeAt(i++) & 0xff;
        if (i == len) {
            out += base64EncodeChars.charAt(c1 >> 2);
            out += base64EncodeChars.charAt((c1 & 0x3) << 4);
            out += "==";
            break;
        }
        c2 = str.charCodeAt(i++);
        if (i == len) {
            out += base64EncodeChars.charAt(c1 >> 2);
            out += base64EncodeChars.charAt(((c1 & 0x3) << 4) | ((c2 & 0xF0) >> 4));
            out += base64EncodeChars.charAt((c2 & 0xF) << 2);
            out += "=";
            break;
        }
        c3 = str.charCodeAt(i++);
        out += base64EncodeChars.charAt(c1 >> 2);
        out += base64EncodeChars.charAt(((c1 & 0x3) << 4) | ((c2 & 0xF0) >> 4));
        out += base64EncodeChars.charAt(((c2 & 0xF) << 2) | ((c3 & 0xC0) >> 6));
        out += base64EncodeChars.charAt(c3 & 0x3F);
    }
    return out;
};

function s1() {
    var data = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"];
    var r = Math.floor(Math.random() * 62);
    return data[r];
}

function encryptPwd(password){
    //base64编码的密码每隔1位插入一个随机数 最后一位后面不插入
    var newPwd = [];
    var pwdlength = password.length;
    for (i = 0; i < pwdlength; i++) {
        newPwd.push(password[i]);
        if (i < pwdlength - 1)
            newPwd.push(s1());

    }
    var res = newPwd.join('');
    return res;
}
"""
JS = execjs.compile(js_string)

# Username: Base64 of the account string
username = JS.call("base64encode", "21321323")
# Password: Base64 first, then random characters interleaved by encryptPwd
temp = JS.call("base64encode", "123")
password = JS.call("encryptPwd", temp)

# 5. Log in, submitting the encoded credentials and the recognized captcha
res = s.post(
    url="https://xuexi.chinabett.com/Login/Entry",
    data={
        "userAccount": username,
        "password": password,
        "returnUrl": "/PersonalCenter",
        "proVing": code,
    },
)
print(res.text)

3.浏览器环境

在使用pyexecjs执行JavaScript代码时,如果代码中读取了浏览器环境(例如 document、window),执行会失败。例如:

import execjs

# This JavaScript reads `document` and `window`. The article presents it as
# an example that fails under pyexecjs because no browser environment exists.
js_string = """
function func(arg) {
    return arg + '666' + document.location.hostname + window.navigator.userAgent;
}
"""
JS = execjs.compile(js_string)

# Expected to fail here, per the surrounding text.
sign = JS.call("func", "zzz")
print(sign)

此时,就需要创造浏览器环境然后再执行JavaScript代码。

npm config set registry https://registry.npmmirror.com
npm install -g jsdom       【主要】
npm install -g node-gyp 
npm install -g canvas

查看安装

npm  root -g 

C:\nodejs\node_global\node_modules  # 把这个路径加入环境变量 NODE_PATH,否则 require("jsdom") 找不到全局安装的模块
import execjs

# JavaScript that creates a jsdom document, installs hand-written `location`
# and `navigator` objects on the Node global (aliased as `window`), and stubs
# XMLHttpRequest before defining func().
js_string = """
const jsdom = require("jsdom");
const {JSDOM} = jsdom;

const html = `<!DOCTYPE html><p>Hello world</p>`;
const dom = new JSDOM(html, {
    url: "https://user.qunar.com/passport/login.jsp",
    referrer: "https://www.qunar.com/",
    contentType: "text/html"
});
document = dom.window.document;

window = global;
Object.assign(global, {
    location: {
        hash: "",
        host: "user.qunar.com",
        hostname: "user.qunar.com",
        href: "https://user.qunar.com/passport/login.jsp",
        origin: "https://user.qunar.com",
        pathname: "/passport/login.jsp",
        port: "",
        protocol: "https:",
        search: "",
    },
    navigator: {
        appCodeName: "Mozilla",
        appName: "Netscape",
        appVersion: "5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
        cookieEnabled: true,
        deviceMemory: 8,
        doNotTrack: null,
        hardwareConcurrency: 4,
        language: "zh-CN",
        languages: ["zh-CN", "zh"],
        maxTouchPoints: 0,
        onLine: true,
        platform: "MacIntel",
        product: "Gecko",
        productSub: "20030107",
        userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
        vendor: "Google Inc.",
        vendorSub: "",
        webdriver: false
    }
});
location = window.location;
XMLHttpRequest = function(){};
function func(arg) {
    var xhr = new XMLHttpRequest();
    return arg + '666' + document.location.hostname + window.navigator.userAgent;
}
"""
JS = execjs.compile(js_string)

# With the environment above in place, the browser-dependent function runs.
sign = JS.call("func", "zzz")
print(sign)

关于XMLHttpRequest

// Stub XMLHttpRequest: the methods called by func() exist but do nothing,
// so no network request is made.
XMLHttpRequest = function () {
    return {
        open:function(){},
        setRequestHeader:function(){},
        send:function(){},
    }
}


// Exercises the stub, then builds a string from the argument and values
// read from the (emulated) browser environment.
function func(arg) {
    var xhr = new XMLHttpRequest();
    xhr.open('POST', "/test/", true);
    // The charset parameter is written `charset=...`; the original
    // `charset-UTF-8` was a malformed header value.
    xhr.setRequestHeader('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8');
    xhr.send('n1=1;n2=2;');

    return arg + "666" + location.href + window.navigator.userAgent;
}

4.AES加密

import base64
import binascii
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad

# Sample JSON payload to encrypt.
data_string='{"openTime":1710319912672,"startTime":1710319913852,"endTime":1710319914153,"userAgent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36","uid":"0000ee00306c5d354d3029a9","track":["14010;499.00;553.00;0.00","14031;517.00;558.00;18.00","14056;568.00;564.00;69.00","14079;656.00;566.00;157.00","14102;752.00;566.00;253.00","14126;841.00;566.00;342.00","14150;918.00;566.00;419.00"],"acc":[],"ori":[],"deviceMotion":[{"isTrusted":true},{"isTrusted":true},{"isTrusted":true},{"isTrusted":true},{"isTrusted":true},{"isTrusted":true},{"isTrusted":true},{"isTrusted":true},{"isTrusted":true},{"isTrusted":true},{"isTrusted":true},{"isTrusted":true}]}'

# The key is given as hex; it decodes to the ASCII bytes "227V2xYeHTARSh1R".
key = binascii.a2b_hex("32323756327859654854415253683152")
cipher = AES.new(key=key, mode=AES.MODE_ECB)

# Pad to the 16-byte block size, encrypt, then Base64-encode the ciphertext.
padded = pad(data_string.encode('utf-8'), 16)
res = base64.b64encode(cipher.encrypt(padded)).decode('utf-8')
print(res)

5.X哪了案例

逆向轨迹snapshot

import json
import random
import time
import base64
import binascii

import requests
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad

# Either generate the track yourself (commented-out helper below) or copy one
# recorded in a browser; note that browser window sizes differ per machine.
# def get_slider_list():
#     slider_list = []
#     client_x = 300
#     client_y = 500
#     start_time = int(int(time.time() * 1000) % 1e5)
#     width = random.randint(419, 431)
#     for slice_distance in range(3, width, 26):
#         if width - slice_distance <= 26:
#             slice_distance = width
#         start_time += random.randint(10, 1000)
#         i = start_time
#         o = f"{client_x + slice_distance}.00"
#         u = f"{client_y + random.randint(-5, 5)}.00"
#         a = f"{slice_distance}.00"
#         f = f"{i};{o};{u};{a}"
#         slider_list.append(f)
#     return slider_list
# Hard-coded track; each entry has the same "time;x;y;distance" layout that
# the helper above produces.
slider_list = ["14010;499.00;553.00;0.00", "14031;517.00;558.00;18.00", "14056;568.00;564.00;69.00",
               "14079;656.00;566.00;157.00", "14102;752.00;566.00;253.00", "14126;841.00;566.00;342.00",
               "14150;918.00;566.00;419.00"]


# 1. Encryption helper
def aes_encrypt(data_string):
    """Encrypt text with AES in ECB mode and return the result as Base64 text.

    The UTF-8 bytes of ``data_string`` are padded to a 16-byte block size
    before encryption. The key is hard-coded as hex and decodes to the ASCII
    bytes ``227V2xYeHTARSh1R``.
    """
    key = binascii.a2b_hex("32323756327859654854415253683152")
    cipher = AES.new(key=key, mode=AES.MODE_ECB)
    padded = pad(data_string.encode('utf-8'), 16)
    return base64.b64encode(cipher.encrypt(padded)).decode('utf-8')


def run():
    """Post an encrypted slider payload to the captcha snapshot endpoint.

    Loads the login page to obtain cookies, builds the slider description
    from the module-level ``slider_list``, encrypts it with ``aes_encrypt``
    and posts it. Prints the request body and the response text; returns
    nothing.
    """
    login_page = requests.get("https://user.qunar.com/passport/login.jsp")
    cookie_dict = login_page.cookies.get_dict()
    uid = cookie_dict['QN1']

    # slider_list = get_slider_list()  # enable when generating the track yourself
    # Timestamps are in milliseconds, each offset back from "now".
    open_time = int((time.time() - random.randint(500, 3000)) * 1000)
    start_time = int((time.time() - random.uniform(2, 4)) * 1000)
    end_time = int((time.time() - random.uniform(0, 1)) * 1000)
    motion_events = [{"isTrusted": True} for _ in range(random.randint(10, 100))]

    slider_info = {
        "openTime": open_time,
        "startTime": start_time,
        "endTime": end_time,
        "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "uid": uid,
        "track": slider_list,
        "acc": [],
        "ori": [],
        "deviceMotion": motion_events,
    }

    # Compact separators: the JSON is serialized without any whitespace.
    encrypted = aes_encrypt(json.dumps(slider_info, separators=(',', ':')))
    payload = {
        "appCode": "register_pc",
        "cs": "pc",
        "data": encrypted,
        "orca": 2,
    }
    print(payload)

    snapshot = requests.post(
        url="https://vercode.qunar.com/inner/captcha/snapshot",
        json=payload,
        cookies=cookie_dict,
    )
    print(snapshot.text)

if __name__ == '__main__':
    run()

逆向提交sendLoginCode

看到这个 window.Bella 后就应该想到一个开发的潜规则:加载某个js文件,在其内部将函数赋值给window,后续其他文件中就可以调用此方法。

html验证

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>

<!-- Load sdk.js first; the inline script below relies on it having set window.Bella. -->
<script src="sdk.js"></script>
<script>
    // Call Bella with a sample slideToken and log the returned value.
    var res = window.Bella(
        {slideToken: "15cf502c3128593b1a3237e5c484d6c9"},
        {v: 2}
    )
    console.log(res);
</script>
</body>
</html>

补环境报错

还得加上上面的基础环境

// Minimal XMLHttpRequest stand-in: every member is an empty function.
XMLHttpRequest = function () {
    var stub = {};
    stub.open = function () {};
    stub.send = function () {};
    stub.onreadystatechange = function () {};
    return stub;
};

// Expose the stub on window as well.
window.XMLHttpRequest = XMLHttpRequest;

补环境后脚本一直卡住不退出:拿到结果后直接调用 process.exit() 退出JS

        window['Bella'] = _0x47bb39;
        var _0x6bf389 = Date[_0x5a69('0x324')]();
        var _0x194046 = _0x6bf389 - _0x6ffe8a;
        _0x51d2f1[_0x5a69('0x57')]('quinn_qlogj', _0x194046);
    }
		// The slide token is the first command-line argument:
		// `node v1.js "<token>"` puts it at process.argv[2].
		var bella = window.Bella(
			{slideToken: process.argv[2]},
			{v: 2}
		);
		// The Python caller reads this script's stdout, so print the result
		// before exiting explicitly.
		console.log(bella);
		process.exit();

代码整合

import json
import random
import time
import base64
import binascii

import requests
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad


# Hard-coded slider track; each entry is four ';'-separated numeric fields.
slider_list = ["14010;499.00;553.00;0.00", "14031;517.00;558.00;18.00", "14056;568.00;564.00;69.00",
               "14079;656.00;566.00;157.00", "14102;752.00;566.00;253.00", "14126;841.00;566.00;342.00",
               "14150;918.00;566.00;419.00"]


# 1. Encryption helper
def aes_encrypt(data_string):
    """Encrypt text with AES in ECB mode and return the result as Base64 text.

    The UTF-8 bytes of ``data_string`` are padded to a 16-byte block size
    before encryption. The key is hard-coded as hex and decodes to the ASCII
    bytes ``227V2xYeHTARSh1R``.
    """
    key = binascii.a2b_hex("32323756327859654854415253683152")
    cipher = AES.new(key=key, mode=AES.MODE_ECB)
    padded = pad(data_string.encode('utf-8'), 16)
    return base64.b64encode(cipher.encrypt(padded)).decode('utf-8')


def run():
    """Pass the slider check, compute ``bella`` via Node and request an SMS code.

    Steps: load the login page to obtain cookies, post the AES-encrypted
    slider payload to the snapshot endpoint, run ``v1.js`` with the returned
    token, then post to ``sendLoginCode``. Prints the snapshot request body
    and the final response text; returns nothing.
    """
    import subprocess

    res = requests.get("https://user.qunar.com/passport/login.jsp")
    cookie_dict = res.cookies.get_dict()
    cookie_qn1 = cookie_dict['QN1']

    # slider_list = get_slider_list()  # enable when generating the track yourself
    slider_info = {
        "openTime": int((time.time() - random.randint(500, 3000)) * 1000),
        "startTime": int((time.time() - random.uniform(2, 4)) * 1000),
        "endTime": int((time.time() - random.uniform(0, 1)) * 1000),
        "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "uid": cookie_qn1,
        "track": slider_list,
        "acc": [],
        "ori": [],
        "deviceMotion": [{"isTrusted": True} for _ in range(random.randint(10, 100))]
    }

    # Compact separators: the JSON is serialized without any whitespace.
    data_string = json.dumps(slider_info, separators=(',', ':'))
    data = aes_encrypt(data_string)

    r = {
        "appCode": "register_pc",
        "cs": "pc",
        "data": data,
        "orca": 2
    }
    print(r)
    res = requests.post(
        url="https://vercode.qunar.com/inner/captcha/snapshot",
        json=r,
        cookies=cookie_dict
    )
    # This line was tab-indented in the original, which Python 3 rejects
    # inside a space-indented body.
    res_dict = res.json()
    slide_token = res_dict['data']["cst"]
    cookie_dict.update(res.cookies.get_dict())

    # The token comes from a server response, so pass it as a separate
    # argument instead of interpolating it into a shell command line.
    res = subprocess.check_output(['node', 'v1.js', slide_token])
    bella_string = res.decode('utf-8').strip()

    res = requests.post(
        url="https://user.qunar.com/weblogin/sendLoginCode",
        data={
            "usersource": "",
            "source": "",
            "ret": "",
            "ref": "",
            "business": "",
            "pid": "",
            "originChannel": "",
            "activityCode": "",
            "origin": "",
            "mobile": "自己的手机号",
            "prenum": "86",
            "loginSource": "1",
            "slideToken": slide_token,
            "smsType": "0",
            "appcode": "register_pc",
            "bella": bella_string,
            "captchaType": ""
        },
        cookies=cookie_dict
    )
    print(res.text)


if __name__ == '__main__':
    run()

短信登录

def run():
    """Interactive SMS login: slider check, send code, then verify the code.

    Prompts for the mobile number and, later, the SMS code. Prints the
    ``sendLoginCode`` response, the ``verifyMobileVcode`` response and the
    accumulated cookie dict; returns nothing.
    """
    import subprocess

    mobile = input("请输入手机号:")

    res = requests.get("https://user.qunar.com/passport/login.jsp")
    cookie_dict = res.cookies.get_dict()
    cookie_qn1 = cookie_dict['QN1']

    # NOTE(review): get_slider_list is not defined in this snippet; it
    # appears only as a commented-out helper earlier in the article.
    slider_list = get_slider_list()
    slider_info = {
        "openTime": int((time.time() - random.randint(500, 3000)) * 1000),
        "startTime": int((time.time() - random.uniform(2, 4)) * 1000),
        "endTime": int((time.time() - random.uniform(0, 1)) * 1000),
        "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "uid": cookie_qn1,
        "track": slider_list,
        "acc": [],
        "ori": [],
        "deviceMotion": [{"isTrusted": True} for _ in range(random.randint(10, 100))]
    }

    # Compact separators: the JSON is serialized without any whitespace.
    data_string = json.dumps(slider_info, separators=(',', ':'))
    data = aes_encrypt(data_string)
    res = requests.post(
        url="https://vercode.qunar.com/inner/captcha/snapshot",
        json={
            "appCode": "register_pc",
            "cs": "pc",
            "data": data,
            "orca": 2
        },
        cookies=cookie_dict
    )
    res_dict = res.json()
    slide_token = res_dict['data']["cst"]
    cookie_dict.update(res.cookies.get_dict())

    # The token comes from a server response, so pass it as a separate
    # argument instead of interpolating it into a shell command line.
    res = subprocess.check_output(['node', 'v1.js', slide_token])
    bella_string = res.decode('utf-8').strip()

    res = requests.post(
        url="https://user.qunar.com/weblogin/sendLoginCode",
        data={
            "usersource": "",
            "source": "",
            "ret": "",
            "ref": "",
            "business": "",
            "pid": "",
            "originChannel": "",
            "activityCode": "",
            "origin": "",
            "mobile": mobile,
            "prenum": "86",
            "loginSource": "1",
            "slideToken": slide_token,
            "smsType": "0",
            "appcode": "register_pc",
            "bella": bella_string,
            "captchaType": ""
        },
        cookies=cookie_dict
    )
    print(res.text)
    cookie_dict.update(res.cookies.get_dict())

    sms_code = input("请输入短信验证码:")
    # NOTE(review): unlike the earlier requests, this one is sent without
    # cookies=cookie_dict; confirm whether the endpoint needs them.
    res = requests.post(
        url="https://user.qunar.com/weblogin/verifyMobileVcode",
        json={
            "piccoloT": "login_register_pc",
            "mobile": mobile,
            "prenum": "86",
            "vcode": sms_code,
            "type": "3",
            "slideToken": slide_token,
            "appcode": "register_pc",
            "loginSource": 1,
            "captchaType": "",
            "source": "",
            "usersource": "",
            "ret": "",
            "ref": "",
            "business": "",
            "pid": "",
            "originChannel": "",
            "activityCode": ""
        }
    )
    cookie_dict.update(res.cookies.get_dict())

    print(res.text)
    print(cookie_dict)

if __name__ == '__main__':
    run()

6.TLS指纹绕过

TLS指纹只存在于https请求中(http请求没有TLS握手)

curl_cffi

  • curl是一个可以发送网络请求的工具。
  • curl-impersonate是一个基于curl基础上进行开发的一个工具,可以完美的模拟主流的浏览器。
  • curl_cffi,是套壳curl-impersonate,让此工具可以更方便的应用在Python中。
pip install curl-cffi
from curl_cffi import requests

request_headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36',
}

# Alternative targets left in the article for experimentation:
#   https://ascii2d.net/
#   https://cn.investing.com/equities/amazon-com-inc-historical-data
res = requests.get(
    url="https://match.yuanrenxue.cn/api/match/19?page=1",
    headers=request_headers,
    # Browser profile that curl_cffi should imitate for this request.
    impersonate="chrome101",
)
print(res.text)
posted @ 2024-01-30 17:50  追梦nan  阅读(10)  评论(0)  编辑  收藏  举报