验证码识别与模拟登录
一、验证码识别
验证码识别:借助线上的打码平台来识别验证码图片中的内容
-打码平台:
1.超级鹰(http://www.chaojiying.com/)
-注册(用户中心身份)
-登录(用户中心身份)
-1.查询余额,余额不足时请先充值
-2.创建一个软件ID(899370)
-3.下载示例代码
2.云打码
3.打码兔
示例代码
#!/usr/bin/env python
# coding:utf-8
"""Client for the Chaojiying online captcha-recognition HTTP API."""

from hashlib import md5

import requests


class Chaojiying_Client(object):
    """Small wrapper around the Chaojiying upload and error-report endpoints."""

    UPLOAD_URL = 'http://upload.chaojiying.net/Upload/Processing.php'
    REPORT_URL = 'http://upload.chaojiying.net/Upload/ReportError.php'

    def __init__(self, username, password, soft_id, timeout=60):
        """Store the account parameters sent with every request.

        username: Chaojiying account name.
        password: plain-text account password; only its MD5 hex digest is
            kept and sent (as the ``pass2`` form field).
        soft_id: software ID created in the Chaojiying user center.
        timeout: seconds passed to ``requests.post`` so that a stalled
            connection cannot block the caller forever.
        """
        self.username = username
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def _post(self, url, params, files=None):
        """POST *params* plus the account parameters and return the decoded JSON reply."""
        payload = dict(params)
        # Account parameters are applied last, exactly as in the original
        # ``params.update(self.base_params)`` calls.
        payload.update(self.base_params)
        r = requests.post(url, data=payload, files=files,
                          headers=self.headers, timeout=self.timeout)
        # Surface HTTP-level failures directly instead of a confusing JSON
        # decoding error on an HTML error page.
        r.raise_for_status()
        return r.json()

    def PostPic(self, im, codetype):
        """Upload a captcha image and return the service's JSON reply as a dict.

        im: raw image bytes.
        codetype: captcha type code, see http://www.chaojiying.com/price.html
        """
        files = {'userfile': ('ccc.jpg', im)}
        return self._post(self.UPLOAD_URL, {'codetype': codetype}, files=files)

    def PostPic_base64(self, base64_str, codetype):
        """Upload a base64-encoded captcha image and return the JSON reply as a dict.

        base64_str: the image content encoded as a base64 string.
        codetype: captcha type code, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
            'file_base64': base64_str,
        }
        return self._post(self.UPLOAD_URL, params)

    def ReportError(self, im_id):
        """Report a wrongly recognised captcha and return the JSON reply as a dict.

        im_id: picture ID of the captcha being reported.
        """
        return self._post(self.REPORT_URL, {'id': im_id})


# Convenience wrapper: recognise the captcha stored in a local image file.
def tranformImageCode(imagePath, imageType, username='用户账号',
                      password='用户密码', soft_id='949287'):
    """Return the text recognised in the captcha image at *imagePath*.

    imagePath: path of the captcha image on disk.
    imageType: captcha type code forwarded to ``PostPic``.
    username, password: Chaojiying account credentials; the defaults are the
        placeholders from the sample code and must be replaced with a real
        account.
    soft_id: software ID; generate your own under
        "User Center >> Software ID" and pass it here.

    Returns the ``pic_str`` field of the service reply.
    """
    chaojiying = Chaojiying_Client(username, password, soft_id)
    # ``with`` guarantees the file handle is closed even if reading fails.
    with open(imagePath, 'rb') as fp:
        im = fp.read()
    return chaojiying.PostPic(im, imageType)['pic_str']


if __name__ == '__main__':
    # Demo call; guarded so that importing this module does not hit the paid
    # API.  Raw string: the Windows path contains backslashes.
    print(tranformImageCode(r'D:\爬虫项目\Chaojiying_Python\chaojiying_Python/a.jpg', 1902))
二、模拟登录
流程:
-抓取点击登录按钮时发出的请求,并对该请求地址发送post请求
-处理请求参数
-用户名
-密码
-验证码
-其他防伪参数
# Simulated login to so.gushiwen.cn: fetch the login page, recognise the
# captcha, then submit the login form within the same session.
import os
from urllib.parse import urljoin

import requests
from lxml import etree

# tranformImageCode is the Chaojiying helper from the captcha-recognition
# example; it must be defined or imported before main() runs.

SITE_ROOT = 'https://so.gushiwen.cn/'
PAGE_URL = "https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx"
# URL the login form is POSTed to.
LOGIN_URL = 'https://so.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx'
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"}
TIMEOUT = 30  # seconds; keeps a stalled connection from hanging the script

# Values captured from one browser session; used only when the hidden form
# fields cannot be found in the freshly fetched login page.
FALLBACK_VIEWSTATE = "npWNBT+tK2+N2DEleq19LTIIVsGC6RS0c4MT3S4qamIevyKoXBYgMUhuepzlZDtNDNiRaUW8eAF1rCN8XxmDuPXzRg8bsy7bOx0evDPWd1xGMWVga4lsQZu+YPeGc64TLcSF/tFeNNZQBPFj2poVjNm/9hs="
FALLBACK_VIEWSTATEGENERATOR = "C93BE1AE"


def _first(tree, xpath_expr, default=None):
    """Return the first XPath match in *tree*, or *default* when nothing matches."""
    matches = tree.xpath(xpath_expr)
    return matches[0] if matches else default


def main():
    """Log in to so.gushiwen.cn and save the response page to ./gushiwen.html."""
    # One Session for every request so cookies set while serving the captcha
    # are sent back with the login POST.
    session = requests.Session()

    page = session.get(url=PAGE_URL, headers=HEADERS, timeout=TIMEOUT)
    page.raise_for_status()
    tree = etree.HTML(page.text)

    # Locate the captcha image URL.
    img_path = _first(tree, '//*[@id="imgCode"]/@src')
    if img_path is None:
        raise RuntimeError('captcha image (#imgCode) not found on the login page')
    # urljoin avoids the double slash produced by plain concatenation when the
    # src attribute already starts with "/".
    img_src = urljoin(SITE_ROOT, img_path)

    # Save the captcha image locally.
    img_resp = session.get(img_src, headers=HEADERS, timeout=TIMEOUT)
    img_resp.raise_for_status()
    with open('./code.jpg', 'wb') as fp:
        fp.write(img_resp.content)

    # Recognise the captcha.
    code_text = tranformImageCode('./code.jpg', 1902)
    print(code_text)

    # The hidden ASP.NET form fields are read from the page just fetched; the
    # previously hard-coded values remain as a fallback.
    viewstate = _first(tree, '//input[@name="__VIEWSTATE"]/@value',
                       FALLBACK_VIEWSTATE)
    viewstate_generator = _first(tree, '//input[@name="__VIEWSTATEGENERATOR"]/@value',
                                 FALLBACK_VIEWSTATEGENERATOR)

    data = {
        "__VIEWSTATE": viewstate,
        "__VIEWSTATEGENERATOR": viewstate_generator,
        "from": "http://so.gushiwen.cn/user/collect.aspx",
        # NOTE(review): account credentials were hard-coded here; they can now
        # be supplied via GUSHIWEN_EMAIL / GUSHIWEN_PWD.  The literals remain
        # only as backward-compatible fallbacks and should be removed from
        # source control.
        "email": os.environ.get("GUSHIWEN_EMAIL", "18317992874"),
        "pwd": os.environ.get("GUSHIWEN_PWD", "XY.563628832"),
        "code": code_text,
        "denglu": "登录",
    }

    # Send the POST request that clicking the login button would send.
    page_text_login = session.post(url=LOGIN_URL, headers=HEADERS, data=data,
                                   timeout=TIMEOUT).text
    with open('./gushiwen.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text_login)


if __name__ == "__main__":
    main()