scrapy使用六:使用cookies,模拟登录
cookie:指某些网站为了辨别用户身份而存储在用户本地终端上的数据,通常经过加密。
保持登录:当登录一个网站时,网站往往会请求用户名和密码,进行登录;并且用户可以勾选下次自动登录。如果勾选了,那么下次访问同一网站时,用户会发现没有输入用户名和密码就已经登录了。
这正是因为前一次登录时,服务器向浏览器发送了包含登录凭据(用户名加密码的某种加密形式)的cookie,并由浏览器保存在用户的硬盘上。
第二次登录时,如果该cookie未到期,浏览器就会发送cookie给服务器,服务器验证凭证,于是不必输入用户名和密码就可以登录了。
cookie模拟登录:
1.打开fiddler --> winconfig,勾选ie,即监控ie
2.打开ie,输入地址,并登录:https://login.weibo.cn/login/或https://passport.weibo.cn/signin/login
3.在fiddler中,点击以上地址,拖动到右边的监控容器,获取到cookie
4.编写代码:使用cookie登录,打印微博内容
from lxml import etree
import requests

# Page to scrape; replace the placeholder with the numeric Weibo user id.
url = "http://weibo.cn/u/微博id号"
# Raw value of the "Cookie" request header copied from Fiddler after login.
cookie = "刚才复制的cookie"

# requests' ``cookies=`` argument expects a dict or CookieJar and fails on a
# plain string, so the copied value is sent verbatim as the Cookie header.
html = requests.get(url, headers={"Cookie": cookie}).content
selector = etree.HTML(html)

# Print the text of every <span class="ctt"> element on the page.
content = selector.xpath("//span[@class='ctt']")
for each in content:
    # string(.) flattens the element and all of its descendants to plain text.
    text = each.xpath('string(.)')
    print(text)
如果使用 requests.get(...).text 获取内容(返回的是已解码的字符串),那么需要先按utf-8编码,转换为bytes类型:
# Variant using ``.text`` (a decoded string) instead of ``.content``: the
# result is converted to UTF-8 bytes afterwards.
# The raw cookie string is sent as the Cookie header because requests'
# ``cookies=`` argument expects a dict or CookieJar, not a string.
html = requests.get(url, headers={"Cookie": cookie}).text
html = html.encode('utf-8')
5.模拟登录
登录地址:https://passport.weibo.cn/signin/login?entry=mweibo&r=https%3A%2F%2Fweibo.cn%2F&backTitle=%CE%A2%B2%A9&vt=
输入密码登录,同时查看network,headers,找到form data的信息
观察登录机制:每次登录的时候,password_后面的四个数字都会改变
import requests
from lxml import etree

url = "http://weibo.cn/u/1890493665"
url_login = "https://login.weibo.cn/login/"

# Fetch the page without credentials and scrape the login form from it.
html = requests.get(url).content
selector = etree.HTML(html)

# The name of the password input changes on every load, so it is read from
# the form instead of being hard-coded. Each [0] raises IndexError if the
# expected form element is missing from the response.
password = selector.xpath('//input[@type="password"]/@name')[0]
vk = selector.xpath('//input[@name="vk"]/@value')[0]
action = selector.xpath('//form[@method="post"]/@action')[0]
print(action, password, vk)

new_url = url_login + action
data = {
    'mobile': '',
    password: 'your password',  # the key is the scraped, variable field name
    'remember': 'on',
    'backTitle': u"微博",
    'typeCount': '',
    'tryCount': '',
    'vk': vk,
    'submit': u"登录",
}

newhtml = requests.post(new_url, data=data).content
# Parse the response of the login POST (not the pre-login ``html``), so that
# later XPath queries run against the logged-in page.
new_selector = etree.HTML(newhtml)
示例:
监控微博,一旦有更新,转发到邮箱,微信推送
# -*- coding: utf-8 -*-
import io
import os
import smtplib
import sys
import time
from email.mime.text import MIMEText

import requests
from lxml import etree

if sys.version_info[0] == 2:
    # Python 2 only: make implicit str <-> unicode conversions use UTF-8.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')
    text_type = unicode  # noqa: F821
else:
    text_type = str


class MainHelper(object):
    """Send plain-text notification mails through one SMTP account."""

    def __init__(self):
        self.mail_host = "smtp.sina.com"  # SMTP server
        self.mail_user = "your username"
        self.mail_pass = "your pwd"
        self.mail_postfix = "sina.com"  # domain part of the sender address

    def send_mail(self, to_list, sub, content):
        """Mail ``content`` with subject ``sub`` to the addresses in ``to_list``.

        Returns True when the message was handed to the server. On any
        failure the error is printed and False is returned.
        """
        # Display name followed by the sender address in angle brackets.
        me = "....helper<" + self.mail_user + "@" + self.mail_postfix + ">"
        msg = MIMEText(content, _subtype='plain', _charset="utf-8")
        msg['Subject'] = sub
        msg['From'] = me
        # Address lists in mail headers are comma-separated.
        msg['To'] = ', '.join(to_list)

        server = smtplib.SMTP()
        try:
            server.connect(self.mail_host)
            server.login(self.mail_user, self.mail_pass)
            server.sendmail(me, to_list, msg.as_string())
            return True
        except Exception as e:
            # Best effort: report the problem and let the caller retry later.
            print(str(e))
            return False
        finally:
            # Release the connection on both the success and failure paths.
            server.close()


class TargetHelper(object):
    """Log in to weibo.cn, read the watched page and track mailed posts."""

    # File listing the posts that were already mailed, one entry per write.
    # tosave() and tocheck() must use the same path.
    log_path = 'weibo.txt'

    def __init__(self):
        self.url = 'http://weibo.cn/u/关注的帐号id'
        self.url_login = 'https://login.weibo.cn/login/'

    def getSource(self):
        """Return the raw bytes of ``self.url`` fetched without credentials."""
        return requests.get(self.url).content

    def getData(self, html):
        """Build the login form payload from the page ``html``.

        Also stores the form's submit URL in ``self.new_url``. Raises
        IndexError if the password input, the ``vk`` input or the POST form
        is not present in ``html``.
        """
        selector = etree.HTML(html)
        # The password field name is read from the form, not hard-coded.
        password = selector.xpath('//input[@type="password"]/@name')[0]
        vk = selector.xpath('//input[@name="vk"]/@value')[0]
        action = selector.xpath('//form[@method="post"]/@action')[0]
        print(action, password, vk)
        self.new_url = self.url_login + action
        return {
            'mobile': '',
            password: 'your password',
            'remember': 'on',
            'backTitle': u"微博",
            'typeCount': '',
            'tryCount': '',
            'vk': vk,
            'submit': u"登录",
        }

    def getContent(self, data):
        """POST the login form ``data`` and return the text to be mailed.

        Requires getData() to have been called first, since it sets
        ``self.new_url``. Raises IndexError if the response has fewer than
        three ``ctt`` spans.
        """
        newhtml = requests.post(self.new_url, data=data).content
        new_selector = etree.HTML(newhtml)
        content = new_selector.xpath('//span[@class="ctt"]')
        # The third "ctt" span is used as the post text.
        # NOTE(review): presumably the first two hold profile info - confirm.
        newcontent = text_type(content[2].xpath('string(.)')).replace("http://", '')
        # NOTE(review): this is the first text node of the first "ctt" span,
        # the same class as the post body - confirm it is really the time.
        senttime = new_selector.xpath('//span[@class="ctt"]/text()')[0]
        return newcontent + senttime

    def tosave(self, text):
        """Append ``text`` plus a newline to the log of mailed posts."""
        with io.open(self.log_path, 'a', encoding='utf-8') as f:
            f.write(text_type(text) + u'\n')

    def tocheck(self, data):
        """Return True if ``data`` is not yet in the log of mailed posts."""
        if not os.path.exists(self.log_path):
            return True
        # Search the whole file, not just its first line. A substring test
        # is used so entries that contain line breaks are still found.
        with io.open(self.log_path, 'r', encoding='utf-8') as f:
            existsweibo = f.read()
        return (text_type(data) + u'\n') not in existsweibo


if __name__ == '__main__':
    mail_to_list = ['receive_mail@qq.com']
    helper = TargetHelper()
    mailer = MainHelper()
    while True:
        source = helper.getSource()
        data = helper.getData(source)
        content = helper.getContent(data)
        if helper.tocheck(content):
            if mailer.send_mail(mail_to_list, u"xxx更新微博了", content):
                # Record the post only after a successful send, so a failed
                # mail is retried and a sent one is not mailed again.
                helper.tosave(content)
                print (u"发送成功")
            else:
                print ("发送失败")
        time.sleep(30)
posted on 2018-10-05 18:00 myworldworld 阅读(575) 评论(0) 收藏 举报