Logging in to Weibo with cookies + the requests package, and using XPath to scrape a target user's profile, posts, and their comments
Purpose of this post: show how to scrape Weibo content, using the requests package plus cookies to log in to Weibo and the lxml package's XPath syntax to parse the pages and extract the target content.
Required Python packages: requests and lxml; both can be installed with pip.
XPath is the XML Path Language, a language for locating parts of an XML document (XML being a subset of the Standard Generalized Markup Language). XPath is built on XML's tree structure and provides the ability to find nodes in that tree; it works on HTML as well.
XPath is a small query language, and here we introduce it in combination with a Python crawler.
A quick guide to using XPath:
step1: install the lxml library.
step2: from lxml import etree
step3: Selector = etree.HTML(page source)
step4: Selector.xpath(some magic expression)
For the details of XPath parsing, see the short sketch right below and the full code later in this post, or refer to this article: http://cighao.com/2016/03/01/introduction-of-xPath/
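As a minimal, self-contained sketch of those four steps (the HTML fragment and the expressions below are invented purely for illustration; they are not taken from weibo.cn):

# coding=utf-8
from lxml import etree

# A tiny made-up HTML fragment, only to demonstrate the etree + XPath workflow.
html = '''
<html><head><title>demo page</title></head>
<body>
  <div class="c" id="M_001"><span class="ctt">first post</span></div>
  <div class="c" id="M_002"><span class="ctt">second post</span></div>
</body></html>
'''

selector = etree.HTML(html)                                             # step3: build the DOM tree
title = selector.xpath('//title/text()')[0]                             # text of the <title> node
posts = selector.xpath("//div[@class='c']/span[@class='ctt']/text()")   # text of every matching span
print(title)   # demo page
print(posts)   # ['first post', 'second post']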
Back to the task at hand: let's walk through the scraping approach from the start. First, find the Weibo site. To make scraping easier, we target the 3G (mobile) version at https://weibo.cn/, and from there the login page at https://passport.weibo.cn/signin/login. Press F12 to open the developer tools, click "network" in the top bar, then enter your account and password and log in. After logging in, many entries appear; find and click the m.weibo.cn entry. As shown in the figure, the three red boxes are what we need:

Request URL is the URL we just requested.
cookies holds the credential information of the account we just logged in with; we will use it for the login in a moment.
user-Agent is the header information of the browser we used; it disguises our crawler and lowers the risk of being banned.
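Before running the full crawler, it is worth checking that the copied cookie actually logs you in. A minimal sketch, assuming you paste your own cookie string in place of the tildes (1669879400 is the example user id used later in this post):

# coding=utf-8
import requests
from lxml import etree

cookie = {"Cookie": "~~~~~~~~~~"}   # placeholder: paste the cookie copied from the developer tools
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                        '(KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}

resp = requests.get('http://weibo.cn/1669879400/info', cookies=cookie, headers=header)
selector = etree.HTML(resp.content)
print(selector.xpath('//title/text()')[0])  # should contain the nickname; a login-page title means the cookie is invalid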
Now for a piece of code:
#coding=utf-8
import re
import time
import string
import os
import pickle
import requests
from lxml import etree
import traceback
import sys

reload(sys)
sys.setdefaultencoding('utf-8')


class weibo:
    cookie = {"Cookie": "~~~~~~~~~~"}  # replace this with your own cookie
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}  # browser header, used to disguise the crawler

    def __init__(self, user_id, filter=0):
        self.user_id = user_id          # numeric user id we type in, e.g. the id of "Dear-迪丽热巴" is 1669879400
        self.filter = filter            # 0 (default) crawls all of the user's posts, 1 crawls original posts only
        self.userName = ''              # nickname, e.g. "Dear-迪丽热巴"
        self.weiboNum = 0               # total number of the user's posts
        self.weiboNum2 = 0              # number of posts actually crawled
        self.following = 0              # number of accounts the user follows
        self.followers = 0              # number of followers
        self.weibos = []                # post texts
        self.num_zan = []               # likes per post
        self.num_forwarding = []        # reposts per post
        self.num_comment = []           # comments per post
        self.weibo_detail_urls = []     # pickle.load(open("weibourl1.pkl", "r"))  # comment-page urls
        self.weibourls = []             # pickle.load(open('weibourl2.pkl', 'r'))  # url of every crawled post, used to resume after interruption

    def getUserName(self):
        try:
            url = 'http://weibo.cn/%d/info' % (self.user_id)
            html = requests.get(url, cookies=weibo.cookie, headers=weibo.header).content  # the cookies and headers in this call perform the login
            selector = etree.HTML(html)
            userName = selector.xpath("//title/text()")[0]
            self.userName = userName[:-3]  # drop the last three characters of the title, leaving the nickname
            # print '用户昵称:' + self.userName
        except Exception, e:
            print "Error: ", e
            traceback.print_exc()
The code above logs in to Weibo with the requests package plus cookies. Next, take the id of the user you want to scrape and parse the pages with XPath to extract the information. Before the full code, here is a quick look at the regex used to pull the numeric counts out of the page text; the complete code follows right after.
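The profile page shows each count inside a bracketed string, and the code below extracts it with the pattern r"\d+\.?\d*". A small sketch with made-up sample values:

# coding=utf-8
import re

pattern = r"\d+\.?\d*"

# Strings in the shape weibo.cn uses; the numbers here are invented for illustration.
str_wb = u'微博[1352]'
str_gz = u'关注[30]'
str_fs = u'粉丝[1000]'

print(int(re.findall(pattern, str_wb)[0]))  # 1352 -> number of posts
print(int(re.findall(pattern, str_gz)[0]))  # 30   -> following
print(int(re.findall(pattern, str_fs)[0]))  # 1000 -> followers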
    def getUserInfo(self):
        try:
            url = 'http://weibo.cn/u/%d?filter=%d&page=1' % (self.user_id, self.filter)
            html = requests.get(url, cookies=weibo.cookie, headers=weibo.header).content
            selector = etree.HTML(html)
            pattern = r"\d+\.?\d*"
            str_wb = selector.xpath("//div[@class='tip2']/span[@class='tc']/text()")
            guid = re.findall(pattern, str_wb[0], re.S | re.M)
            for value in guid:
                num_wb = int(value)
                break
            self.weiboNum = num_wb
            print '微博数: ' + str(self.weiboNum)
            str_gz = selector.xpath("//div[@class='tip2']/a/text()")[0]
            guid = re.findall(pattern, str_gz, re.M)
            self.following = int(guid[0])
            print '关注数: ' + str(self.following)
            str_fs = selector.xpath("//div[@class='tip2']/a/text()")[1]
            guid = re.findall(pattern, str_fs, re.M)
            self.followers = int(guid[0])
            print '粉丝数: ' + str(self.followers)
        except Exception, e:
            print "Error: ", e
            traceback.print_exc()

    def getWeiboInfo(self):
        try:
            url = 'http://weibo.cn/u/%d?filter=%d&page=1' % (self.user_id, self.filter)
            html = requests.get(url, cookies=weibo.cookie, headers=weibo.header).content
            selector = etree.HTML(html)
            if selector.xpath('//input[@name="mp"]') == []:
                pageNum = 1
            else:
                pageNum = (int)(selector.xpath('//input[@name="mp"]')[0].attrib['value'])
            pattern = r"\d+\.?\d*"
            f = open("./%s.txt" % self.user_id, "wb")
            for page in range(1, pageNum + 1):
                if page % 10 == 0:
                    print('[ATTEMPTING] rest for 5 minutes to cheat weibo site, avoid being banned.')
                    time.sleep(60 * 5)
                url2 = 'http://weibo.cn/u/%d?filter=%d&page=%d' % (self.user_id, self.filter, page)
                html2 = requests.get(url2, cookies=weibo.cookie, headers=weibo.header).content
                selector2 = etree.HTML(html2)
                info = selector2.xpath("//div[@class='c']")
                if len(info) > 3:
                    for i in range(0, len(info) - 2):
                        detail = info[i].xpath("@id")[0]
                        url3 = 'http://weibo.cn/comment/{}?uid={}&rl=0'.format(detail.split('_')[-1], self.user_id)
                        if url3 not in self.weibo_detail_urls:
                            self.weiboNum2 = self.weiboNum2 + 1
                            # print self.weibo_detail_urls
                            str_t = info[i].xpath("div/span[@class='ctt']")
                            weibos = str_t[0].xpath('string(.)')            # post text
                            self.weibos.append(weibos)
                            # print '微博内容:' + weibos + '***' + 'No.%s' % self.weiboNum2
                            str_zan = info[i].xpath("div/a/text()")[-4]     # like count
                            guid = re.findall(pattern, str_zan, re.M)
                            num_zan = int(guid[0])
                            self.num_zan.append(num_zan)
                            # print '点赞数: ' + str(num_zan)
                            forwarding = info[i].xpath("div/a/text()")[-3]  # repost count
                            guid = re.findall(pattern, forwarding, re.M)
                            num_forwarding = int(guid[0])
                            self.num_forwarding.append(num_forwarding)
                            # print '转发数: ' + str(num_forwarding)
                            comment = info[i].xpath("div/a/text()")[-2]     # comment count
                            guid = re.findall(pattern, comment, re.M)
                            num_comment = int(guid[0])
                            self.num_comment.append(num_comment)
                            # print '评论数: ' + str(num_comment)
                            self.weibo_detail_urls.append(url3)
                            text = str(self.weiboNum2) + ':' + weibos + '\t' + '点赞数:' + str(num_zan) + '\t' + ' 转发数:' + str(num_forwarding) + '\t' + ' 评论数:' + str(num_comment) + '\n'
                            f.write(text)
                            pickle.dump(self.weibo_detail_urls, open("weibourl1.pkl", "w"))  # persist progress so an interrupted run can resume
                        else:
                            print url3 + '这条微博已经爬取过,忽略'
            if self.filter == 0:
                print '共' + str(self.weiboNum2) + '条微博'
            else:
                print '共' + str(self.weiboNum) + '条微博,其中' + str(self.weiboNum2) + '条为原创微博'
        except Exception, e:
            print "Error: ", e
            traceback.print_exc()

    def get_weibo_detail_comment(self):
        weibo_comments_save_path = './weibo/{}.txt'.format(self.user_id)
        with open(weibo_comments_save_path, 'a') as f:
            for i, url in enumerate(self.weibo_detail_urls):
                if url not in self.weibourls:
                    self.weibourls.append(url)
                    pickle.dump(self.weibourls, open("weibourl2.pkl", "w"))
                    print('solving weibo detail from {}'.format(url))
                    html_detail = requests.get(url, cookies=weibo.cookie, headers=weibo.header).content
                    selector = etree.HTML(html_detail)
                    str1 = 'id="pagelist"'
                    if str1 in html_detail:  # a pager exists, read the total number of comment pages
                        all_comment_pages = selector.xpath('//*[@id="pagelist"]/form/div/input[1]/@value')[0]
                    else:
                        all_comment_pages = 1
                    print('\n这是 {} 的微博:'.format(self.userName))
                    # print('微博内容: {}'.format(self.weibos[i]))
                    # print('接下来是下面的评论:\n\n')
                    f.writelines('微博内容: {}'.format(self.weibos[i]) + '\n')
                    f.writelines('接下来是下面的评论:\n')
                    for page in range(1, int(all_comment_pages) + 1):
                        if page % 10 == 0:
                            print('[ATTEMPTING] rest for 5 minutes to cheat weibo site, avoid being banned.')
                            time.sleep(60 * 5)
                        detail_comment_url = url + '&page=' + str(page)
                        try:
                            html_detail_page = requests.get(detail_comment_url, cookies=weibo.cookie, headers=weibo.header).content
                            selector = etree.HTML(html_detail_page)
                            comment_div_element = selector.xpath('//div[starts-with(@id, "C_")]')  # every comment div has an id beginning with "C_"
                            for child in comment_div_element:
                                single_comment_user_name = child.xpath('a[1]/text()')[0]
                                if child.xpath('span[1][count(*)=0]'):  # plain comment without an @mention
                                    single_comment_content = child.xpath('span[1][count(*)=0]/text()')[0]
                                else:                                   # comment that @-mentions another user
                                    span_element = child.xpath('span[1]')[0]
                                    at_user_name = span_element.xpath('a/text()')[0]
                                    at_user_name = '$' + at_user_name.split('@')[-1] + '$'
                                    single_comment_content = span_element.xpath('./text()')
                                    single_comment_content.insert(1, at_user_name)
                                    single_comment_content = ' '.join(single_comment_content)
                                full_single_comment = '<' + single_comment_user_name + '>' + ': ' + single_comment_content
                                # print(full_single_comment)
                                f.writelines(full_single_comment + '\n')
                            # f.writelines('F\n')
                        except etree.XMLSyntaxError as e:
                            print('user id {} all done!'.format(self.user_id))
                    print('all weibo content and comments saved into {}'.format(weibo_comments_save_path))
                    f.writelines('F\n')
                else:
                    print 'has already been crawled'

    def writeTxt(self):
        try:
            if self.filter == 1:
                resultHeader = '\n\n原创微博内容:\n'
            else:
                resultHeader = '\n\n微博内容:\n'
            result = '用户信息\n用户昵称:' + self.userName + '\n用户id:' + str(self.user_id) + '\n微博数:' + str(self.weiboNum) + '\n关注数:' + str(self.following) + '\n粉丝数:' + str(self.followers) + resultHeader
            if os.path.isdir('weibo') == False:
                os.mkdir('weibo')
            f = open("./%s.txt" % self.user_id, "wb")
            f.write(result)
            f.close()
        except Exception, e:
            print "Error: ", e
            traceback.print_exc()

    def start(self):
        try:
            weibo.getUserName(self)
            weibo.getUserInfo(self)
            weibo.writeTxt(self)
            weibo.getWeiboInfo(self)
            weibo.get_weibo_detail_comment(self)
            print '信息抓取完毕'
            print '==========================================================================='
        except Exception, e:
            print "Error: ", e


user_id = ~~~~~~~~~~~~~  # change to any valid user id (except the crawler account's own id)
filter = 0               # 0 crawls all posts (original + reposts), 1 crawls original posts only
# open('./weibourl1.pkl', 'w')
# open('./weibourl2.pkl', 'w')
wb = weibo(user_id, filter)  # create a weibo instance wb
wb.start()                   # crawl the weibo information
print '用户名:' + wb.userName
print '全部微博数:' + str(wb.weiboNum)
print '关注数:' + str(wb.following)
print '粉丝数:' + str(wb.followers)
print '最新一条微博为:' + wb.weibos[0]  # latest original post if filter=1; if the user has no posts (len(wb.weibos)==0) this print raises an error, same below
print '最新一条微博获得的点赞数:' + str(wb.num_zan[0])
print '最新一条微博获得的转发数:' + str(wb.num_forwarding[0])
print '最新一条微博获得的评论数:' + str(wb.num_comment[0])
wb.writeTxt()  # writeTxt() only writes the information to a file; rewrite it to suit your own needs
Together these two blocks of code scrape the target user's nickname, post count, each post's repost, comment, and like counts, and the posts themselves along with their comments.
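The weibourl1.pkl and weibourl2.pkl files mentioned in the comments are what make resuming an interrupted crawl possible: the urls of already-processed posts are dumped to disk as the crawl proceeds. A sketch of how resuming could be re-enabled, assuming both files exist from a previous run and this replaces the plain wb = weibo(...) / wb.start() lines at the end of the script:

# coding=utf-8
import os
import pickle

wb = weibo(user_id, filter)
# Load the progress files from the previous (interrupted) run, if they exist.
if os.path.exists('weibourl1.pkl') and os.path.exists('weibourl2.pkl'):
    wb.weibo_detail_urls = pickle.load(open('weibourl1.pkl', 'r'))  # comment-page urls already collected
    wb.weibourls = pickle.load(open('weibourl2.pkl', 'r'))          # posts whose comments were already saved
wb.start()  # urls already in these lists are skipped inside getWeiboInfo / get_weibo_detail_comment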
Reference: http://cighao.com/2016/03/01/introduction-of-xPath/
