Logging in to Weibo with cookies + the requests package, and using XPath to scrape a target user's profile, posts, and their comments
Purpose of this post: show how to scrape Weibo content, using the requests package plus cookies to log in to Weibo and the lxml package's XPath syntax to parse the pages and extract the target content.
Required Python packages: requests and lxml; both can be installed with pip.
XPath is the XML Path Language, a language for locating parts of an XML document (XML being a subset of the Standard Generalized Markup Language). XPath is built on XML's tree structure and provides the ability to find nodes in that tree; it works on HTML as well.
XPath is a small query language, and here we introduce it in combination with a Python crawler.
A quick guide to using XPath:
step1: install the lxml library.
step2: from lxml import etree
step3: Selector = etree.HTML(page source)
step4: Selector.xpath(some magic expression)
For the details of XPath parsing, see the short sketch right below and the full code later in this post, or refer to this article: http://cighao.com/2016/03/01/introduction-of-xPath/
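As a minimal, self-contained sketch of those four steps (the HTML fragment and the expressions below are invented purely for illustration; they are not taken from weibo.cn):

# coding=utf-8
from lxml import etree

# A tiny made-up HTML fragment, only to demonstrate the etree + XPath workflow.
html = '''
<html><head><title>demo page</title></head>
<body>
  <div class="c" id="M_001"><span class="ctt">first post</span></div>
  <div class="c" id="M_002"><span class="ctt">second post</span></div>
</body></html>
'''

selector = etree.HTML(html)                                             # step3: build the DOM tree
title = selector.xpath('//title/text()')[0]                             # text of the <title> node
posts = selector.xpath("//div[@class='c']/span[@class='ctt']/text()")   # text of every matching span
print(title)   # demo page
print(posts)   # ['first post', 'second post']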
Back to the task at hand: let's walk through the scraping approach from the start. First, find the Weibo site. To make scraping easier, we target the 3G (mobile) version at https://weibo.cn/, and from there the login page at https://passport.weibo.cn/signin/login. Press F12 to open the developer tools, click "network" in the top bar, then enter your account and password and log in. After logging in, many entries appear; find and click the m.weibo.cn entry. As shown in the figure, the three red boxes are what we need:

Request URL is the URL we just requested.
cookies holds the credential information of the account we just logged in with; we will use it for the login in a moment.
user-Agent is the header information of the browser we used; it disguises our crawler and lowers the risk of being banned.
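Before running the full crawler, it is worth checking that the copied cookie actually logs you in. A minimal sketch, assuming you paste your own cookie string in place of the tildes (1669879400 is the example user id used later in this post):

# coding=utf-8
import requests
from lxml import etree

cookie = {"Cookie": "~~~~~~~~~~"}   # placeholder: paste the cookie copied from the developer tools
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                        '(KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}

resp = requests.get('http://weibo.cn/1669879400/info', cookies=cookie, headers=header)
selector = etree.HTML(resp.content)
print(selector.xpath('//title/text()')[0])  # should contain the nickname; a login-page title means the cookie is invalid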
Now for a piece of code:
#coding=utf-8
import re
import time
import string
import os
import pickle
import requests
from lxml import etree
import traceback
import sys

reload(sys)
sys.setdefaultencoding('utf-8')


class weibo:
    cookie = {"Cookie": "~~~~~~~~~~"}  # replace this with your own cookie
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}  # browser header, used to disguise the crawler

    def __init__(self, user_id, filter=0):
        self.user_id = user_id          # numeric user id we type in, e.g. the id of "Dear-迪丽热巴" is 1669879400
        self.filter = filter            # 0 (default) crawls all of the user's posts, 1 crawls original posts only
        self.userName = ''              # nickname, e.g. "Dear-迪丽热巴"
        self.weiboNum = 0               # total number of the user's posts
        self.weiboNum2 = 0              # number of posts actually crawled
        self.following = 0              # number of accounts the user follows
        self.followers = 0              # number of followers
        self.weibos = []                # post texts
        self.num_zan = []               # likes per post
        self.num_forwarding = []        # reposts per post
        self.num_comment = []           # comments per post
        self.weibo_detail_urls = []     # pickle.load(open("weibourl1.pkl", "r"))  # comment-page urls
        self.weibourls = []             # pickle.load(open('weibourl2.pkl', 'r'))  # url of every crawled post, used to resume after interruption

    def getUserName(self):
        try:
            url = 'http://weibo.cn/%d/info' % (self.user_id)
            html = requests.get(url, cookies=weibo.cookie, headers=weibo.header).content  # the cookies and headers in this call perform the login
            selector = etree.HTML(html)
            userName = selector.xpath("//title/text()")[0]
            self.userName = userName[:-3]  # drop the last three characters of the title, leaving the nickname
            # print '用户昵称:' + self.userName
        except Exception, e:
            print "Error: ", e
            traceback.print_exc()
The code above logs in to Weibo with the requests package plus cookies. Next, take the id of the user you want to scrape and parse the pages with XPath to extract the information. Before the full code, here is a quick look at the regex used to pull the numeric counts out of the page text; the complete code follows right after.
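The profile page shows each count inside a bracketed string, and the code below extracts it with the pattern r"\d+\.?\d*". A small sketch with made-up sample values:

# coding=utf-8
import re

pattern = r"\d+\.?\d*"

# Strings in the shape weibo.cn uses; the numbers here are invented for illustration.
str_wb = u'微博[1352]'
str_gz = u'关注[30]'
str_fs = u'粉丝[1000]'

print(int(re.findall(pattern, str_wb)[0]))  # 1352 -> number of posts
print(int(re.findall(pattern, str_gz)[0]))  # 30   -> following
print(int(re.findall(pattern, str_fs)[0]))  # 1000 -> followers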
    def getUserInfo(self):
        try:
            url = 'http://weibo.cn/u/%d?filter=%d&page=1' % (self.user_id, self.filter)
            html = requests.get(url, cookies=weibo.cookie, headers=weibo.header).content
            selector = etree.HTML(html)
            pattern = r"\d+\.?\d*"
            str_wb = selector.xpath("//div[@class='tip2']/span[@class='tc']/text()")
            guid = re.findall(pattern, str_wb[0], re.S | re.M)
            for value in guid:
                num_wb = int(value)
                break
            self.weiboNum = num_wb
            print '微博数: ' + str(self.weiboNum)
            str_gz = selector.xpath("//div[@class='tip2']/a/text()")[0]
            guid = re.findall(pattern, str_gz, re.M)
            self.following = int(guid[0])
            print '关注数: ' + str(self.following)
            str_fs = selector.xpath("//div[@class='tip2']/a/text()")[1]
            guid = re.findall(pattern, str_fs, re.M)
            self.followers = int(guid[0])
            print '粉丝数: ' + str(self.followers)
        except Exception, e:
            print "Error: ", e
            traceback.print_exc()

    def getWeiboInfo(self):
        try:
            url = 'http://weibo.cn/u/%d?filter=%d&page=1' % (self.user_id, self.filter)
            html = requests.get(url, cookies=weibo.cookie, headers=weibo.header).content
            selector = etree.HTML(html)
            if selector.xpath('//input[@name="mp"]') == []:
                pageNum = 1
            else:
                pageNum = (int)(selector.xpath('//input[@name="mp"]')[0].attrib['value'])
            pattern = r"\d+\.?\d*"
            f = open("./%s.txt" % self.user_id, "wb")
            for page in range(1, pageNum + 1):
                if page % 10 == 0:
                    print('[ATTEMPTING] rest for 5 minutes to cheat weibo site, avoid being banned.')
                    time.sleep(60 * 5)
                url2 = 'http://weibo.cn/u/%d?filter=%d&page=%d' % (self.user_id, self.filter, page)
                html2 = requests.get(url2, cookies=weibo.cookie, headers=weibo.header).content
                selector2 = etree.HTML(html2)
                info = selector2.xpath("//div[@class='c']")
                if len(info) > 3:
                    for i in range(0, len(info) - 2):
                        detail = info[i].xpath("@id")[0]
                        url3 = 'http://weibo.cn/comment/{}?uid={}&rl=0'.format(detail.split('_')[-1], self.user_id)
                        if url3 not in self.weibo_detail_urls:
                            self.weiboNum2 = self.weiboNum2 + 1
                            # print self.weibo_detail_urls
                            str_t = info[i].xpath("div/span[@class='ctt']")
                            weibos = str_t[0].xpath('string(.)')            # post text
                            self.weibos.append(weibos)
                            # print '微博内容:' + weibos + '***' + 'No.%s' % self.weiboNum2
                            str_zan = info[i].xpath("div/a/text()")[-4]     # like count
                            guid = re.findall(pattern, str_zan, re.M)
                            num_zan = int(guid[0])
                            self.num_zan.append(num_zan)
                            # print '点赞数: ' + str(num_zan)
                            forwarding = info[i].xpath("div/a/text()")[-3]  # repost count
                            guid = re.findall(pattern, forwarding, re.M)
                            num_forwarding = int(guid[0])
                            self.num_forwarding.append(num_forwarding)
                            # print '转发数: ' + str(num_forwarding)
                            comment = info[i].xpath("div/a/text()")[-2]     # comment count
                            guid = re.findall(pattern, comment, re.M)
                            num_comment = int(guid[0])
                            self.num_comment.append(num_comment)
                            # print '评论数: ' + str(num_comment)
                            self.weibo_detail_urls.append(url3)
                            text = str(self.weiboNum2) + ':' + weibos + '\t' + '点赞数:' + str(num_zan) + '\t' + ' 转发数:' + str(num_forwarding) + '\t' + ' 评论数:' + str(num_comment) + '\n'
                            f.write(text)
                            pickle.dump(self.weibo_detail_urls, open("weibourl1.pkl", "w"))  # persist progress so an interrupted run can resume
                        else:
                            print url3 + '这条微博已经爬取过,忽略'
            if self.filter == 0:
                print '共' + str(self.weiboNum2) + '条微博'
            else:
                print '共' + str(self.weiboNum) + '条微博,其中' + str(self.weiboNum2) + '条为原创微博'
        except Exception, e:
            print "Error: ", e
            traceback.print_exc()

    def get_weibo_detail_comment(self):
        weibo_comments_save_path = './weibo/{}.txt'.format(self.user_id)
        with open(weibo_comments_save_path, 'a') as f:
            for i, url in enumerate(self.weibo_detail_urls):
                if url not in self.weibourls:
                    self.weibourls.append(url)
                    pickle.dump(self.weibourls, open("weibourl2.pkl", "w"))
                    print('solving weibo detail from {}'.format(url))
                    html_detail = requests.get(url, cookies=weibo.cookie, headers=weibo.header).content
                    selector = etree.HTML(html_detail)
                    str1 = 'id="pagelist"'
                    if str1 in html_detail:  # a pager exists, read the total number of comment pages
                        all_comment_pages = selector.xpath('//*[@id="pagelist"]/form/div/input[1]/@value')[0]
                    else:
                        all_comment_pages = 1
                    print('\n这是 {} 的微博:'.format(self.userName))
                    # print('微博内容: {}'.format(self.weibos[i]))
                    # print('接下来是下面的评论:\n\n')
                    f.writelines('微博内容: {}'.format(self.weibos[i]) + '\n')
                    f.writelines('接下来是下面的评论:\n')
                    for page in range(1, int(all_comment_pages) + 1):
                        if page % 10 == 0:
                            print('[ATTEMPTING] rest for 5 minutes to cheat weibo site, avoid being banned.')
                            time.sleep(60 * 5)
                        detail_comment_url = url + '&page=' + str(page)
                        try:
                            html_detail_page = requests.get(detail_comment_url, cookies=weibo.cookie, headers=weibo.header).content
                            selector = etree.HTML(html_detail_page)
                            comment_div_element = selector.xpath('//div[starts-with(@id, "C_")]')  # every comment div has an id beginning with "C_"
                            for child in comment_div_element:
                                single_comment_user_name = child.xpath('a[1]/text()')[0]
                                if child.xpath('span[1][count(*)=0]'):  # plain comment without an @mention
                                    single_comment_content = child.xpath('span[1][count(*)=0]/text()')[0]
                                else:                                   # comment that @-mentions another user
                                    span_element = child.xpath('span[1]')[0]
                                    at_user_name = span_element.xpath('a/text()')[0]
                                    at_user_name = '$' + at_user_name.split('@')[-1] + '$'
                                    single_comment_content = span_element.xpath('./text()')
                                    single_comment_content.insert(1, at_user_name)
                                    single_comment_content = ' '.join(single_comment_content)
                                full_single_comment = '<' + single_comment_user_name + '>' + ': ' + single_comment_content
                                # print(full_single_comment)
                                f.writelines(full_single_comment + '\n')
                            # f.writelines('F\n')
                        except etree.XMLSyntaxError as e:
                            print('user id {} all done!'.format(self.user_id))
                    print('all weibo content and comments saved into {}'.format(weibo_comments_save_path))
                    f.writelines('F\n')
                else:
                    print 'has already been crawled'

    def writeTxt(self):
        try:
            if self.filter == 1:
                resultHeader = '\n\n原创微博内容:\n'
            else:
                resultHeader = '\n\n微博内容:\n'
            result = '用户信息\n用户昵称:' + self.userName + '\n用户id:' + str(self.user_id) + '\n微博数:' + str(self.weiboNum) + '\n关注数:' + str(self.following) + '\n粉丝数:' + str(self.followers) + resultHeader
            if os.path.isdir('weibo') == False:
                os.mkdir('weibo')
            f = open("./%s.txt" % self.user_id, "wb")
            f.write(result)
            f.close()
        except Exception, e:
            print "Error: ", e
            traceback.print_exc()

    def start(self):
        try:
            weibo.getUserName(self)
            weibo.getUserInfo(self)
            weibo.writeTxt(self)
            weibo.getWeiboInfo(self)
            weibo.get_weibo_detail_comment(self)
            print '信息抓取完毕'
            print '==========================================================================='
        except Exception, e:
            print "Error: ", e


user_id = ~~~~~~~~~~~~~  # change to any valid user id (except the crawler account's own id)
filter = 0               # 0 crawls all posts (original + reposts), 1 crawls original posts only
# open('./weibourl1.pkl', 'w')
# open('./weibourl2.pkl', 'w')
wb = weibo(user_id, filter)  # create a weibo instance wb
wb.start()                   # crawl the weibo information
print '用户名:' + wb.userName
print '全部微博数:' + str(wb.weiboNum)
print '关注数:' + str(wb.following)
print '粉丝数:' + str(wb.followers)
print '最新一条微博为:' + wb.weibos[0]  # latest original post if filter=1; if the user has no posts (len(wb.weibos)==0) this print raises an error, same below
print '最新一条微博获得的点赞数:' + str(wb.num_zan[0])
print '最新一条微博获得的转发数:' + str(wb.num_forwarding[0])
print '最新一条微博获得的评论数:' + str(wb.num_comment[0])
wb.writeTxt()  # writeTxt() only writes the information to a file; rewrite it to suit your own needs
Together these two blocks of code scrape the target user's nickname, post count, each post's repost, comment, and like counts, and the posts themselves along with their comments.
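The weibourl1.pkl and weibourl2.pkl files mentioned in the comments are what make resuming an interrupted crawl possible: the urls of already-processed posts are dumped to disk as the crawl proceeds. A sketch of how resuming could be re-enabled, assuming both files exist from a previous run and this replaces the plain wb = weibo(...) / wb.start() lines at the end of the script:

# coding=utf-8
import os
import pickle

wb = weibo(user_id, filter)
# Load the progress files from the previous (interrupted) run, if they exist.
if os.path.exists('weibourl1.pkl') and os.path.exists('weibourl2.pkl'):
    wb.weibo_detail_urls = pickle.load(open('weibourl1.pkl', 'r'))  # comment-page urls already collected
    wb.weibourls = pickle.load(open('weibourl2.pkl', 'r'))          # posts whose comments were already saved
wb.start()  # urls already in these lists are skipped inside getWeiboInfo / get_weibo_detail_comment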
Reference: http://cighao.com/2016/03/01/introduction-of-xPath/
