Python 爬虫登录古诗文网(包含验证码识别)

话不多说,直接上代码。

这里的验证码识别采用超级鹰接口完成。

import requests
from lxml import etree
from urllib import parse
import os
from js_test.tool.chaojiying import Chaojiying_Client


def login(login_url):
    """Log in to gushiwen.cn, solving the image captcha through Chaojiying.

    Steps performed:
      1. GET ``login_url`` with a fresh ``requests.Session``.
      2. Locate the captcha ``<img id="imgCode">``, download it with the same
         session and save a copy to ``./img/img_code.png``.
      3. Send the image bytes to the Chaojiying recognition service.
      4. POST the login form (hidden form fields, account, password, captcha
         text) and print the response body.

    Args:
        login_url: URL of the login page to fetch the captcha from.

    Returns:
        None. The recognised captcha text and the login response are printed.

    Raises:
        IndexError: if the page contains no ``<img id="imgCode">`` element.
    """
    from fake_useragent import UserAgent  # provides a random User-Agent string

    session = requests.Session()
    res = session.get(url=login_url)
    res.encoding = res.apparent_encoding
    html = etree.HTML(res.text)

    def _hidden_value(name, fallback):
        # Read a hidden form field from the fetched page; fall back to the
        # previously hard-coded value if the field is not present.
        values = html.xpath("//input[@name='%s']/@value" % name)
        return values[0] if values else fallback

    # The captcha src may be relative, so resolve it against the page URL.
    img_url = html.xpath(r"//img[@id='imgCode']/@src")[0]
    img_url = parse.urljoin(res.url, img_url)
    # Fetch the captcha with the same session so it stays tied to our cookies.
    img = session.get(url=img_url)
    im = img.content

    # Keep a copy of the captcha on disk (same path as before).
    os.makedirs("./img/", exist_ok=True)
    with open("./img/img_code.png", 'wb') as f:
        f.write(im)

    # NOTE(review): credentials are hard-coded here and in the form data below;
    # consider moving them to configuration or environment variables.
    chaojiying = Chaojiying_Client('hehaha', 'h672049506', '920530')
    # 1902 is the captcha type code passed to Chaojiying (see its price list).
    imgcode = chaojiying.PostPic(im, 1902)['pic_str']
    print(imgcode)

    url = 'https://so.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx'
    ua = UserAgent()
    headers = {'User-Agent': ua.random}
    data = {
        # Use the values served with the current page instead of stale copies.
        '__VIEWSTATE': _hidden_value(
            '__VIEWSTATE',
            '2g12Ic1YXmpM7tXeHpKk3SXdmYQcHg1/AVDb9G050iVJOYrEDn0BPykvnnSpGGyFkfXoFq6kZ/Q/0yyWDHGwxKJtRHrq8EFxhfwDDVhVFoiAkEH23bhK8IrqUZk='),
        '__VIEWSTATEGENERATOR': _hidden_value('__VIEWSTATEGENERATOR', 'C93BE1AE'),
        'from': 'http://so.gushiwen.cn/user/collect.aspx',
        'email': '17726465606',
        'pwd': 'h672049506',
        'code': imgcode,
        'denglu': '登录'}
    res = session.post(url=url, data=data, headers=headers)
    print(res.text)


if __name__ == '__main__':
    # Script entry point: run the login flow against the gushiwen.cn login
    # page, which redirects to the user's collection page afterwards.
    target_url = "https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx"
    login(target_url)

 

posted @ 2021-08-03 18:04  何哈哈哈  阅读(404)  评论(0)    收藏  举报