Python爬虫登录古诗文网,包含验证码识别
话不多说,直接上代码。
这里的验证码识别采用超级鹰接口完成。
# Log in to so.gushiwen.cn with a requests session; the image captcha on the
# login form is solved through the Chaojiying recognition service.
import os
from urllib import parse

import requests
from fake_useragent import UserAgent
from lxml import etree

from js_test.tool.chaojiying import Chaojiying_Client

# Endpoint the login form is posted to.
POST_URL = 'https://so.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx'

# Where the downloaded captcha image is stored for inspection.
CAPTCHA_DIR = "./img/"
CAPTCHA_PATH = "./img/img_code.png"

# Chaojiying captcha type code (see the price list on the official site).
CAPTCHA_TYPE = 1902

# Seconds to wait on each HTTP request; without a timeout requests can block
# forever on a stalled connection.
REQUEST_TIMEOUT = 15

# Values captured from an earlier page load.  They are used only when the
# hidden inputs cannot be found in the freshly fetched login page.
_FALLBACK_VIEWSTATE = '2g12Ic1YXmpM7tXeHpKk3SXdmYQcHg1/AVDb9G050iVJOYrEDn0BPykvnnSpGGyFkfXoFq6kZ/Q/0yyWDHGwxKJtRHrq8EFxhfwDDVhVFoiAkEH23bhK8IrqUZk='
_FALLBACK_VIEWSTATEGENERATOR = 'C93BE1AE'

# NOTE(review): these credentials were hard-coded in the original script and
# are kept only as fallbacks so existing behaviour is unchanged.  Prefer the
# environment variables, and rotate/remove the literals.
DEFAULT_EMAIL = os.environ.get('GUSHIWEN_EMAIL', '17726465606')
DEFAULT_PWD = os.environ.get('GUSHIWEN_PWD', 'h672049506')
CHAOJIYING_USER = os.environ.get('CHAOJIYING_USER', 'hehaha')
CHAOJIYING_PWD = os.environ.get('CHAOJIYING_PWD', 'h672049506')
CHAOJIYING_SOFT_ID = os.environ.get('CHAOJIYING_SOFT_ID', '920530')


def _hidden_field(html, name, default):
    """Return the value of the form input called *name*, or *default* if absent/empty."""
    values = html.xpath("//input[@name='{}']/@value".format(name))
    return values[0] if values and values[0] else default


def login(login_url, email=DEFAULT_EMAIL, pwd=DEFAULT_PWD):
    """Fetch the login page, solve its captcha and submit the login form.

    Args:
        login_url: URL of the login page; it is fetched to obtain the captcha
            image and the hidden form fields.
        email: Account name sent in the ``email`` form field.
        pwd: Password sent in the ``pwd`` form field.

    Returns:
        The ``requests.Response`` of the login POST (its text is also printed).

    Raises:
        IndexError: If the login page contains no ``img#imgCode`` element.
        requests.RequestException: If fetching the page or the captcha fails
            or times out.
    """
    session = requests.Session()
    # Use one random User-Agent for the whole session so the page fetch, the
    # captcha fetch and the form post all present the same client.
    session.headers['User-Agent'] = UserAgent().random

    res = session.get(url=login_url, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    res.encoding = res.apparent_encoding
    html = etree.HTML(res.text)

    img_srcs = html.xpath(r"//img[@id='imgCode']/@src")
    if not img_srcs:
        raise IndexError(
            "captcha image (img#imgCode) not found on {}".format(res.url))
    # The src attribute may be relative; resolve it against the final page URL.
    img_url = parse.urljoin(res.url, img_srcs[0])
    img = session.get(url=img_url, timeout=REQUEST_TIMEOUT)
    # Do not send an error page to the captcha solver.
    img.raise_for_status()

    # Keep a copy of the captcha on disk, as the original script did.
    os.makedirs(CAPTCHA_DIR, exist_ok=True)
    with open(CAPTCHA_PATH, 'wb') as f:
        f.write(img.content)

    chaojiying = Chaojiying_Client(
        CHAOJIYING_USER, CHAOJIYING_PWD, CHAOJIYING_SOFT_ID)
    # The image bytes are already in memory, so the saved file is not re-read.
    imgcode = chaojiying.PostPic(img.content, CAPTCHA_TYPE)['pic_str']
    print(imgcode)

    data = {
        # Hidden fields are read from the page just fetched, so they belong
        # to this session rather than to an old snapshot.
        '__VIEWSTATE': _hidden_field(
            html, '__VIEWSTATE', _FALLBACK_VIEWSTATE),
        '__VIEWSTATEGENERATOR': _hidden_field(
            html, '__VIEWSTATEGENERATOR', _FALLBACK_VIEWSTATEGENERATOR),
        'from': 'http://so.gushiwen.cn/user/collect.aspx',
        'email': email,
        'pwd': pwd,
        'code': imgcode,
        'denglu': '登录',
    }
    res = session.post(url=POST_URL, data=data, timeout=REQUEST_TIMEOUT)
    print(res.text)
    return res


if __name__ == '__main__':
    login("https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx")

浙公网安备 33010602011771号