爬取今日头条文章
教程仅供技术研究学习使用,若有侵权,联系本人删除
以 https://www.toutiao.com/c/user/59672551416/#mid=1566273643580418 为例
1: 破解as、cp
使用万能的 alt+F

将js代码改写为python代码
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: zhibo.wang
# E-mail: gm.zhibo.wang@gmail.com
# Date  : 20/07/06 11:36:11
# Desc  :
"""
https://s3.pstatp.com/toutiao/resource/ntoutiao_web/static/js/common/lib_6b19209.js
i = window.byted_acrawler && window.byted_acrawler.sign ? window.byted_acrawler.sign(o) : ""
"""

import hashlib
import math
import re
import time


# Constants the site's JS returns when the hex timestamp is not 8 digits long.
_FALLBACK_AS = '479BB4B7254C150'
_FALLBACK_CP = '7E0AC8874BB0985'


def get_as_cp(timestamp=None):
    """Build the ``as`` / ``cp`` request parameters from a Unix timestamp.

    Python port of the JS generation rule for as/cp found in
    https://s3.pstatp.com/toutiao/resource/ntoutiao_web/page/profile/index_ae91792.js ::

        function(i) {
            var e = {};
            e.getHoney = function() {
                var i = Math.floor((new Date).getTime() / 1e3),
                    e = i.toString(16).toUpperCase(),
                    t = md5(i).toString().toUpperCase();
                if (8 != e.length) return {
                    as: "479BB4B7254C150",
                    cp: "7E0AC8874BB0985"
                };
                for (var o = t.slice(0, 5), n = t.slice(-5), a = "", s = 0; 5 > s; s++)
                    a += o[s] + e[s];
                for (var r = "", c = 0; 5 > c; c++)
                    r += e[c + 3] + n[c];
                return {
                    as: "A1" + a + e.slice(-3),
                    cp: e.slice(0, 3) + r + "E1"
                }
            },
            i.ascp = e
        }

    :param timestamp: Unix time in seconds (int or float). Defaults to the
        current time, which is what the original JS always uses; passing a
        value makes the result reproducible.
    :return: tuple ``(AS, CP)`` of two 15-character strings.
    """
    if timestamp is None:
        timestamp = time.time()
    seconds = int(math.floor(timestamp))

    # Upper-case hex of the timestamp without the "0x" prefix.
    hex_ts = '%X' % seconds
    # Upper-case MD5 hex digest of the decimal timestamp string.
    digest = hashlib.md5(str(seconds).encode('utf-8')).hexdigest().upper()

    if len(hex_ts) != 8:
        return _FALLBACK_AS, _FALLBACK_CP

    head = digest[:5]
    tail = digest[-5:]
    # as: first five digest chars interleaved with the first five hex chars.
    mixed_as = ''.join(head[k] + hex_ts[k] for k in range(5))
    # cp: last five hex chars interleaved with the last five digest chars.
    mixed_cp = ''.join(hex_ts[k + 3] + tail[k] for k in range(5))

    AS = 'A1' + mixed_as + hex_ts[-3:]
    CP = hex_ts[:3] + mixed_cp + 'E1'
    return AS, CP
max_behot_time 参数:第一页传 0;请求后续各页时,传上一次请求返回数据中的 max_behot_time 值。

_signature参数 也是最难处理的
全局搜索,打断点找到 生成的文件

https://s3.pstatp.com/toutiao/resource/ntoutiao_web/static/js/common/lib_6b19209.js i = window.byted_acrawler && window.byted_acrawler.sign ? window.byted_acrawler.sign(o) : "";

跟着断点一直走发现最终生成的文件是 VM621 也就是下面这张截图

将此文件内容全部拷贝 写入 sign.js中
// sign.js, part 1: build a minimal browser-like global environment in Node so
// that the signing script pasted further down can run outside a browser.
// Invocation: node sign.js <url> <cookies> <user-agent>
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
// Throwaway page; only its `document` object is used below.
const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`);
// Undeclared assignment: creates a global `window` that aliases Node's global
// object, so every property put on `window` is also reachable as a bare global.
// This relies on non-strict (sloppy) mode.
window = global;
// Command-line inputs, also created as implicit globals:
//   baseurl - the URL passed to byted_acrawler.sign() at the end of the script
//   cookies - a ";"-separated cookie string, split and stored on document.cookie later
//   ua      - the value exposed as navigator.userAgent below
baseurl = process.argv[2]
cookies = process.argv[3]
ua = process.argv[4]
var document = dom.window.document;
// Hard-coded stand-ins for window.location, window.navigator and window.screen.
var params = {
// NOTE(review): `hash` carries a different mid than `href`, and `pathname`
// contains the "#..." fragment, which a real browser location would not —
// confirm whether these values are intentional.
location:{
hash: "#mid=5954781019",
host: "www.toutiao.com",
hostname: "www.toutiao.com",
href: "https://www.toutiao.com/c/user/59672551416/#mid=1566273643580418",
origin: "https://www.toutiao.com",
pathname: "/c/user/59672551416/#mid=1566273643580418",
port: "",
protocol: "https:",
search: "",
},
// NOTE(review): appVersion/platform are fixed Windows/Chrome 83 values while
// userAgent comes from the command line, so the two can disagree — confirm.
navigator:{
appCodeName: "Mozilla",
appName: "Netscape",
appVersion: "5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
cookieEnabled: true,
deviceMemory: 8,
doNotTrack: null,
hardwareConcurrency: 4,
language: "zh-CN",
languages: ["zh-CN", "zh"],
maxTouchPoints: 0,
onLine: true,
platform: "Win32",
product: "Gecko",
productSub: "20030107",
userAgent: ua,
vendor: "Google Inc.",
vendorSub: "",
},
"screen":{
availHeight: 1040,
availLeft: 0,
availTop: 0,
availWidth: 1920,
colorDepth: 24,
height: 1080,
pixelDepth: 24,
width: 1920,
}
};
// Copy location / navigator / screen onto window (i.e. onto Node's global object).
Object.assign(window,params);
/**
 * Store one cookie on the global `document`.
 *
 * @param {string} name - Cookie name, written as-is.
 * @param {*} value - Cookie value; it is passed through `escape()` before being written.
 * @param {number} [seconds] - Lifetime in seconds. When falsy or loosely equal
 *   to 0, no `expires` attribute is emitted.
 */
function setCookie(name, value, seconds) {
  const lifetime = seconds || 0;
  let expiryAttr = "";
  // Loose comparison kept on purpose: values such as "0" must still count as zero.
  if (lifetime != 0) {
    const expiresAt = new Date(Date.now() + lifetime * 1000);
    // toUTCString is the standard name of the legacy toGMTString (same function).
    expiryAttr = `; expires=${expiresAt.toUTCString()}`;
  }
  document.cookie = `${name}=${escape(value)}${expiryAttr}; path=/`;
}
// Example of the expected `cookies` argument (a ";"-separated cookie string):
//cookies = "csrftoken=a6f078a275e9f39b0addfb9df37fd890; tt_webid=6856639657595241992; s_v_web_id=verify_kde4odjw_isOUm41W_VbRS_4WS0_BrtZ_Ch1KLo5pkNV5;tasessionId=ownu4mas91596435834562; ttcid=1de8f696daab43dc8eb818a02408bd6930; tt_scid=P.PuhA.5OslBUeRVIYUAYFS--vw9l9LTWpc4-b4r7prsBwQ2X6extVf1PCjkhCNWc102"

// Seed document.cookie from the command-line cookie string (30-minute lifetime).
// NOTE(review): setCookie() applies escape() to each value, so values that are
// already percent-encoded get encoded a second time — confirm this is intended.
for (const segment of String(cookies || "").split(";")) {
  const pair = segment.trim();
  // Split on the FIRST "=" only, so values that themselves contain "=" stay intact.
  const eq = pair.indexOf("=");
  if (eq <= 0) {
    continue; // empty segment, or no cookie name
  }
  setCookie(pair.slice(0, eq), pair.slice(eq + 1), 1800);
}
window.document = document;

// Paste the copied signing script here.

if (!window.byted_acrawler || typeof window.byted_acrawler.sign !== "function") {
  throw new Error(
    "window.byted_acrawler.sign is not defined: paste the copied signing script into sign.js at the marked position"
  );
}
window.byted_acrawler.init({
  aid: 24,
  dfp: true,
});

// Example call with a literal URL:
//sign = window.byted_acrawler.sign({url:"https://www.toutiao.com/api/pc/media_hot/?media_id=1566273643580418&user_id=59672551416"});
const signature = window.byted_acrawler.sign({ url: baseurl });
// The caller reads the signature from stdout.
console.log(signature);
请先安装 jsdom:在 sign.js 所在目录执行 npm i jsdom(若使用 npm i -g jsdom 全局安装,还需设置 NODE_PATH,否则 require("jsdom") 找不到模块)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import time
import math
import hashlib
import subprocess


def getHoney(timestamp=None):
    """Build the ``as`` / ``cp`` query parameters from a Unix timestamp.

    :param timestamp: Unix time in seconds (int or float). Defaults to the
        current time; passing a value makes the result reproducible.
    :return: dict with the keys ``'as'`` and ``'cp'``.
    """
    if timestamp is None:
        timestamp = time.time()
    seconds = int(math.floor(timestamp))

    # Upper-case hex of the timestamp and upper-case MD5 of its decimal form.
    e = '%X' % seconds
    t = hashlib.md5(str(seconds).encode('utf-8')).hexdigest().upper()

    if 8 != len(e):
        # Fixed pair used whenever the hex timestamp is not 8 digits long.
        return {
            'as': "479BB4B7254C150",
            'cp': "7E0AC8874BB0985"
        }

    o = t[0:5]
    n = t[-5:]
    a = ''.join(o[k] + e[k] for k in range(5))
    r = ''.join(e[k + 3] + n[k] for k in range(5))
    return {
        'as': "A1" + a + e[-3:],
        'cp': e[0:3] + r + "E1"
    }


def get_signature(url, cookies, ua):
    """Run ``node sign.js`` and return the ``&_signature=...`` query fragment.

    The arguments are passed as an argv list rather than through a shell
    string: the URL contains ``&``, and the cookie / user-agent strings contain
    spaces, ``;`` and parentheses, all of which a shell would interpret.

    :param url: URL to sign.
    :param cookies: cookie string forwarded to sign.js.
    :param ua: User-Agent string forwarded to sign.js.
    :return: ``"&_signature=" + <signature>`` with surrounding whitespace
        (the newline added by ``console.log``) removed.
    :raises subprocess.CalledProcessError: if node exits with a non-zero status.
    """
    sign = subprocess.check_output(
        ['node', 'sign.js', url, cookies, ua],
        universal_newlines=True,
    )
    return "&_signature=" + sign.strip()


def main():
    """Fetch the first page of the example user's article list and print it."""
    # Imported here so the signing helpers above stay importable without the
    # third-party HTTP dependency.
    import requests

    # NOTE(review): 'authority', 'method', 'path' and 'scheme' are sent as
    # ordinary headers here; 'accept-encoding' advertises 'br', which the HTTP
    # client may not be able to decode — confirm both are intended.
    headers = {
        'Referer': 'https://www.toutiao.com/',
        'authority': 'www.toutiao.com',
        'method': 'GET',
        'path': '/c/user/59672551416/',
        'scheme': 'https',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'cookie': 'csrftoken=a6f078a275e9f39b0addfb9df37fd890; s_v_web_id=verify_kde4odjw_isOUm41W_VbRS_4WS0_BrtZ_Ch1KLo5pkNV5; ttcid=1de8f696daab43dc8eb818a02408bd6930; SLARDAR_WEB_ID=c7f55d5c-4dba-493d-a126-ce8e36b472bf; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6856656984092460551; tt_webid=6856656984092460551; __tasessionId=61hz1rirw1596442527425; tt_scid=UD3a5jP-6nL7yUaAawB2lLtCdtv430T-TJyynultVAGY6J4cY6KXTiH1QRWAYhb9e1f5',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }

    base_url = 'https://www.toutiao.com/toutiao'
    param = '/c/user/article/?page_type=1&user_id=59672551416&max_behot_time=0&count=20&as={as}&cp={cp}'.format(**getHoney())
    base_url += param

    # The signature is computed over the full URL, with the same cookie and UA
    # that are sent with the request.
    signature = get_signature(
        base_url, headers["cookie"], headers["user-agent"]
    )
    path = param + signature
    headers['path'] = path
    url = base_url + signature
    print(url)

    response = requests.get(url=url, headers=headers)
    print(response.text)


if __name__ == '__main__':
    main()
python test.py


浙公网安备 33010602011771号