爬虫入门--模拟登录
概述
本文演示如何模拟登录古诗文网,并附带关键代码。
步骤
- 1.获取登录页HTML
- 2.解析验证码url并下载保存
- 3.识别验证码(这里使用超级鹰)
- 4.模拟发起请求
注意事项
- 1.请求改为使用session发送:登录过程中可能需要携带cookie(例如获取验证码的请求与登录请求需处于同一会话),而session会自动保存并携带cookie。
from urllib.parse import urljoin

import requests
from lxml import etree

from chaojiying import Chaojiying_Client
# Simulated login to so.gushiwen.cn:
#   1. fetch the login page, 2. download the captcha image it references,
#   3. have the Chaojiying client recognise the captcha, 4. POST the form.
# Every request goes through one requests.Session so that cookies set while
# loading the page and the captcha are sent back with the login POST.

LOGIN_PAGE_URL = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
LOGIN_POST_URL = 'https://so.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.195 Safari/537.36'
}
# requests never times out on its own; without this a stalled server would
# hang the script forever.
REQUEST_TIMEOUT = 10  # seconds


def first_match(matches, description):
    """Return the first item of an XPath result list.

    Raises RuntimeError naming the missing element (``description``) when the
    list is empty, instead of the bare IndexError that ``matches[0]`` gives.
    """
    if not matches:
        raise RuntimeError('login page is missing {}'.format(description))
    return matches[0]


def resolve_captcha_url(page_url, src):
    """Resolve the captcha image's ``src`` attribute against the page URL.

    urljoin handles root-relative, page-relative, protocol-relative and
    absolute ``src`` values; plain string concatenation with the host only
    works for the root-relative case.
    """
    return urljoin(page_url, src)


def build_login_form(view_state, view_state_generator, code_text,
                     email='******', password='******'):
    """Build the form payload that is POSTed to the login endpoint."""
    return {
        '__VIEWSTATE': view_state,
        '__VIEWSTATEGENERATOR': view_state_generator,
        'from': 'http://so.gushiwen.cn/user/collect.aspx',
        'email': email,
        'pwd': password,
        'code': code_text,
        'denglu': '登录',
    }


def main(email='******', password='******'):
    """Run the login flow, saving the captcha and the resulting page to disk.

    Writes ``code.jpg`` (captcha image) and ``./gushiciwang.html`` (response
    to the login POST) in the current working directory, and prints the
    captcha URL, the submitted form (password masked), the HTTP status code
    and the response body.

    Raises:
        requests.HTTPError: if the login page or the captcha download returns
            an error status.
        RuntimeError: if an expected element is missing from the login page.
    """
    with requests.Session() as session:
        login_page = session.get(url=LOGIN_PAGE_URL, headers=HEADERS,
                                 timeout=REQUEST_TIMEOUT)
        login_page.raise_for_status()
        tree = etree.HTML(login_page.text)

        captcha_src = first_match(tree.xpath('//*[@id="imgCode"]/@src'),
                                  'the captcha image (#imgCode)')
        code_url = resolve_captcha_url(login_page.url, captcha_src)
        print(code_url)

        captcha = session.get(url=code_url, headers=HEADERS,
                              timeout=REQUEST_TIMEOUT)
        captcha.raise_for_status()
        with open('code.jpg', 'wb') as fp:
            fp.write(captcha.content)

        # NOTE(review): 1902 is presumably the Chaojiying captcha-type code and
        # decode() presumably returns the recognised text -- confirm against
        # the local chaojiying module.
        code_client = Chaojiying_Client()
        code_text = code_client.decode(1902, 'code.jpg')

        data = build_login_form(
            first_match(tree.xpath('//*[@id="__VIEWSTATE"]/@value'),
                        'the __VIEWSTATE field'),
            first_match(tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value'),
                        'the __VIEWSTATEGENERATOR field'),
            code_text,
            email=email,
            password=password,
        )
        response = session.post(url=LOGIN_POST_URL, data=data, headers=HEADERS,
                                timeout=REQUEST_TIMEOUT)
        user_page_text = response.text

    # Show what was submitted without echoing the password to the console.
    print(dict(data, pwd='***'))
    with open('./gushiciwang.html', 'w', encoding='utf-8') as fp:
        fp.write(user_page_text)
    # The status code alone does not prove the login worked; inspect the
    # saved page to confirm.
    print(response.status_code)
    print(user_page_text)


if __name__ == '__main__':
    main()