# 综合爬虫网易解析

import requests
# a=input("aiqnf")
# url = f'https://www.baidu.com/s?wd={a}'
# headers = {
# 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36'
# }
# respons = requests.get(url,headers=headers).text
# print(respons)

# url = 'https://fanyi.baidu.com/sug'
# # s=input("请输入翻译的词:")
# # data = {
# # "kw":s,
# # "to":"zh",
# # "from":"en",
# # }
# # respons = requests.post(url,data=data)
# #
# # print(respons.json())

# url = 'https://movie.douban.com/j/chart/top_list'
# # url进行封装
# param={
# "type": "24",
# "interval_id": "100:90",
# "action":"",
# "start": 0,
# "limit": 1,
# }
# headers = {
# 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36'
# }
# response = requests.get(url,headers=headers,params=param)
# res=response.text
# print(res)
# response.close() # 关掉程序

# 安全验证 verify=False

# url = 'https://dytt89.com/'
# headers = {
# 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36'
# }
# respons = requests.get(url,headers=headers,verify=False).text
# print(respons)

###################################会话机制################

# session = requests.Session()
# data = {
# "usename":usename,
# "password":pwd
# }
# url='登陆的url'
# session.post(url,data=data)
# session.get('url')

#############################代理#########################
# proxies={
# "http":"http://183.213.26.12:3128"
# }
# headers = {
# 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36'
# }
# url="http://www.baidu.com/s?wd=ip&rsv_spt=1&rsv_iqid=0xbc13eded00004f33&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&rqlang=cn&tn=baiduhome_pg&rsv_dl=tb&rsv_enter=1&oq=python%2520join%25E7%259A%2584%25E7%2594%25A8%25E6%25B3%2595&rsv_btype=t&inputT=3954310&rsv_t=2b18dAzJzufOxce99zlXS8DG%2Bh%2BPZOlvBFX07CF%2FKxNyO29Cv4x4qmNGhoFQ91fTOikM&rsv_pq=96cde5ec0003247a&rsv_sug3=20&rsv_sug1=11&rsv_sug7=100&rsv_sug2=0&rsv_sug4=3955016"
# url="http://www.baidu.com"
# res=requests.get(url,headers=headers,proxies=proxies,verify=False)
# res.encoding='utf-8'
# pri=res.text
# print(pri)
# with open('ip.html','w',encoding='utf-8')as f:
# f.write(pri)

##########################网易评论抓取##############
from Crypto.Cipher import AES
from Crypto.PublicKey import RSA
import requests
import json
from base64 import b64encode
'''
网易云评论是通过 POST 请求获取的数据,请求时需要携带参数 params 和 encSecKey,但是这两个数值是加密过的,
所以需要先找到未加密的参数,然后参考网易云的逻辑对参数进行加密,再次请求网易,
最后拿到评论
-- 1. 先去 call stack 从下往上执行,
function() {
function a(a) { #随机的16位字符串
var d, e, b = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", c = "";
for (d = 0; a > d; d += 1) # 循环16次
e = Math.random() * b.length, # 随机数
e = Math.floor(e), #取整
c += b.charAt(e); # 取字符串中的某个位置
return c
}
function b(a, b) { // a是要加密的内容,b是秘钥
var c = CryptoJS.enc.Utf8.parse(b) #秘钥b是由
, d = CryptoJS.enc.Utf8.parse("0102030405060708")
, e = CryptoJS.enc.Utf8.parse(a) #e是数据
, f = CryptoJS.AES.encrypt(e, c, { #缺秘钥 c是由b赋值
iv: d, # iv(偏移量)
mode: CryptoJS.mode.CBC # 加密模式CBC
});
return f.toString()
}
function c(a, b, c) {
var d, e;
return setMaxDigits(131),
d = new RSAKeyPair(b,"",c),
e = encryptedString(d, a)
}
function d(d, e, f, g) {
var h = {} -- 空对象
, i = a(16); 此处i调用了a函数 i就是一个16位的随机数
return h.encText = b(d, g), # g是秘钥
# 这里是经过数据+g==> b ==>第一次加密+i==>b==>params
h.encText = b(h.encText, i), #返回的就是params i是秘钥

h.encSecKey = c(i, e, f), #得到的是encSecKey 此处e和f是固定的,i是一个随机值
#--- 从js那边拿到i="xEfEXb070X6UeXE4"
h
}
function e(a, b, d, e) {
var f = {};
return f.encText = c(a + e, b, d),
f
}
window.asrsea = d,

data="rid=A_PL_0_19723756&threadId=A_PL_0_19723756&pageNo=1&pageSize=20&cursor=-1&offset=0&orderType=1"
'''
# Comment endpoint that receives the encrypted form.
url = "https://music.163.com/weapi/comment/resource/comments/get?csrf_token="

# Arguments that the site's JavaScript passes to its function ``d``.
e = "010001"
f = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7"
g = "0CoJUm6Qyw8W8jud"

# Pinned by hand; the JavaScript generates this 16-character value randomly.
i = "xEfEXb070X6UeXE4"

# encSecKey value that goes with the pinned ``i`` above.
encSecKey = "CD68xnHAftYWltHyJ8F+tOigcwe2SW4V/Fr4HhoWAr/jNYyDEOiXEvJ5DlpFa3KrIF9Y+L9O+M8ApAIcMPvo3U1mlaX630yfLknlqgA056xNnYPCOZgUOHIYu1hoSXfFD5k9PlUWTpLQvmMDWgvr/fIXp+0qVWoHtqUETTREntPYgywsRJ6SfZPml7cM30xD"

# Plain (unencrypted) request payload; it is JSON-serialised before encryption.
data = {
    "csrf_token": "",
    "cursor": "-1",
    "offset": "0",
    "orderType": "1",
    "pageNo": "1",
    "pageSize": "20",
    "rid": "A_PL_0_19723756",
    "threadId": "A_PL_0_19723756",
}
def get_enc():
    """Return the fixed ``encSecKey`` form field.

    In the site's JavaScript this value is the result of ``c(i, e, f)``,
    where ``i`` is a random 16-character string. This script pins ``i`` to a
    constant, so the matching ``encSecKey`` is constant too and is returned
    as a literal instead of being recomputed.

    :return: the ``encSecKey`` string that goes with the pinned ``i``.
    """
    return "CD68xnHAftYWltHyJ8F+tOigcwe2SW4V/Fr4HhoWAr/jNYyDEOiXEvJ5DlpFa3KrIF9Y+L9O+M8ApAIcMPvo3U1mlaX630yfLknlqgA056xNnYPCOZgUOHIYu1hoSXfFD5k9PlUWTpLQvmMDWgvr/fIXp+0qVWoHtqUETTREntPYgywsRJ6SfZPml7cM30xD"
# 把参数进行加密
def get_params(data):
    """Build the encrypted ``params`` form field from the payload text.

    Follows the site's JavaScript function ``d``: the payload is encrypted
    once with the key ``g`` and the result is encrypted again with the
    key ``i``.

    :param data: payload as a string (the caller passes ``json.dumps(...)``).
    :return: the twice-encrypted, Base64-encoded string.
    """
    first = enc_params(data, g)  # first pass: key g
    return enc_params(first, i)  # second pass: key i

# 转换成16的倍数
def to_aes(data):
    """Pad *data* so its UTF-8 encoding is a multiple of 16 bytes long.

    Uses PKCS#7-style padding: ``pad`` copies of ``chr(pad)`` are appended,
    with ``pad`` between 1 and 16. Input that is already aligned gets a full
    extra block of 16.

    The pad length is taken from the encoded byte length, not the character
    count, because the caller encrypts ``data.encode('utf-8')``. Non-ASCII
    characters occupy more than one byte and would otherwise leave the
    ciphertext input misaligned.

    :param data: text to pad.
    :return: the padded text.
    """
    pad = 16 - len(data.encode('utf-8')) % 16
    return data + chr(pad) * pad


def enc_params(data, key):
    """Encrypt *data* with AES in CBC mode and return Base64 text.

    The IV is the constant ``"0102030405060708"``, the same value the
    site's JavaScript function ``b`` uses.

    :param data: plaintext string; it is padded with ``to_aes`` first.
    :param key: key string, encoded as UTF-8 before use.
    :return: Base64-encoded ciphertext as a ``str``.
    """
    iv = "0102030405060708"
    # The cipher only accepts input whose length is a multiple of 16 bytes.
    padded = to_aes(data)
    aes = AES.new(key=key.encode('utf-8'), IV=iv.encode('utf-8'), mode=AES.MODE_CBC)
    bs = aes.encrypt(padded.encode('utf-8'))
    return b64encode(bs).decode('utf-8')

# POST the encrypted form fields to the comment endpoint and print the raw
# response body. A timeout is set so an unresponsive server cannot block the
# script forever.
res = requests.post(
    url,
    data={
        "params": get_params(json.dumps(data)),
        "encSecKey": get_enc(),
    },
    timeout=10,
)
t = res.text
print(t)
# posted @ 2021-04-16 13:22  mjth  阅读(208)  评论(0)    收藏  举报