爬取豆瓣影评1--寻找json格式的电影信息

- 首先找到这个网页https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=rank&page_limit=20&page_start=0

- 然后F12在network下找到这个内容页,打开后发现参数可调,所以电影数量和ID可以爬取

 

设计代码:

  

def askUrl(url):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    Only a desktop-browser ``User-Agent`` header is sent with the request.

    Args:
        url: Address to request.

    Returns:
        The decoded body, or ``""`` when the request fails with
        ``urllib.error.URLError``. In that case the HTTP status code (if
        any) and the failure reason are printed instead of being raised.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
    }

    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The with-block closes the connection even if read/decode fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError (a URLError subclass) carries .code; plain URLError only
        # carries .reason, hence the hasattr checks.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


# Download a page and parse it.
def get_info(baseurl):
    """Return the page at ``baseurl`` parsed by BeautifulSoup's ``html.parser``.

    The page text is obtained through ``askUrl``.
    """
    return BeautifulSoup(askUrl(baseurl), "html.parser")
# Select elements from the soup and also render them as a string.
def transport(bs, info):
    """Find every element of ``bs`` whose class matches ``info``.

    Returns:
        A 2-tuple ``(matches, text)``: the result of
        ``bs.find_all(class_=info)`` and ``str()`` of that same result.
    """
    matches = bs.find_all(class_=info)
    return matches, str(matches)

# TODO: collect Douban movie IDs.
if __name__ == '__main__':
    # Search endpoint that answers with JSON; this URL asks for 10 movies
    # (page_limit=10) starting at offset 0 (page_start=0).
    url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=rank&page_limit=10&page_start=0'
    response_data = json.loads(get_info(url).text)
    # Containers for high / middle / low rated comments (not filled yet).
    highComment, middleComment, lowComment = [], [], []
    # Print the high-rating comments link of every movie in the response.
    for movie in response_data['subjects']:
        print("https://movie.douban.com/subject/%s/comments?percent_type=h&start=20&limit=20&status=P&sort=new_score" % (movie['id']))

 

  1. 大量获得豆瓣电影ID 用来制作其影评的链接

    - 首先找到这个网页https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=rank&page_limit=20&page_start=0
    - 然后F12在network下找到这个内容页,打开后发现参数可调,所以电影数量和ID可以爬取
    • image-20211120135607718

    • 设计代码,爬取2000个电影的ID信息。

    • 代码如下

    • def askUrl(url):
         head={
             # 'Host': 'movie.douban.com',
             # "Cookie": "pgv_pvid = 2445437098;RK = IWJFENCj / 2;ptcz = 0dc31e9c452a0701259378ea4d93881f2a4d4ab7d29d637d6da1b0b24d857f4c;Qs_lvt_323937 = 1588214559;Qs_pv_323937 = 3783410537228747000;pgv_pvi = 5491528704;eas_sid = t196y05258V4B6g478m7t073P2;luin = o0775929901;lskey = 000100001264ed0bece633b72b741fb54e5137a729bfa3647db8a18c0ee96579fd05aff03206e6cafbeb0f88",
             # "Connection": "keep-alive",
             # "Cache-Control": "max-age = 0",
             # "Accept-Language": "zh - CN, zh;q = 0.9",
             # "Accept-Encoding": "gzip, deflate, br",
             # "Accept": "text / html, application / xhtml + xml, application / xml;q = 0.9, image / webp, image / apng, * / *;q = 0.8",
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
        }

         request = urllib.request.Request(url, headers=head)
         html = ""
         try:
             response = urllib.request.urlopen(request)
             html = response.read().decode("utf-8")
         except urllib.error.URLError as e:
             if hasattr(e,"code"):
                 print(e.code)
             if hasattr(e,"reason"):
                 print(e.reasen)
         return html


      # Download a page and parse it.
      def get_info(baseurl):
          """Fetch ``baseurl`` via ``askUrl`` and parse it with ``html.parser``."""
          page_text = askUrl(baseurl)
          return BeautifulSoup(page_text, "html.parser")
      # Select elements from the soup and also render them as a string.
      def transport(bs, info):
          """Return ``(matches, str(matches))`` for elements of class ``info``.

          ``matches`` is whatever ``bs.find_all(class_=info)`` returns.
          """
          selected = bs.find_all(class_=info)
          rendered = str(selected)
          return selected, rendered

      # TODO: collect Douban movie IDs.
      if __name__ == '__main__':
          # JSON search endpoint; page_limit / page_start in the query string
          # select how many movies are returned and from which offset.
          url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=rank&page_limit=10&page_start=0'
          response_data = json.loads(get_info(url).text)
          # Placeholders for the three rating groups (unused so far).
          highComment = []
          middleComment = []
          lowComment = []
          # Template of the high-rating comments page; %s is the movie id.
          high_template = "https://movie.douban.com/subject/%s/comments?percent_type=h&start=20&limit=20&status=P&sort=new_score"
          for subject in response_data['subjects']:
              print(high_template % (subject['id']))
  2. 当我们获取电影对应的评论链接后,我们似乎被豆瓣发现了,如果不登录不让继续浏览了,所以我们要用python登录豆瓣

    • 首先我们找到登录页面

    • image-20211120143121253

    • 然后我们输入错误的用户名和密码

    • image-20211120143228018

    • 然后我们找到登录的接口

    • image-20211120143437321

    • 然后我们往下面拉,查看一下登陆需要的参数

    • image-20211120143542883

    • 这样,我们就可以开始写我们的登录代码了!

    • 用python模拟登录的方法有很多,例如下面这一种

    • s = requests.Session()
      if __name__ == '__main__':
         login_url = 'https://accounts.douban.com/j/mobile/login/basic'
         headers = {
             # 'Host': 'movie.douban.com',
             # "Cookie": "pgv_pvid = 2445437098;RK = IWJFENCj / 2;ptcz = 0dc31e9c452a0701259378ea4d93881f2a4d4ab7d29d637d6da1b0b24d857f4c;Qs_lvt_323937 = 1588214559;Qs_pv_323937 = 3783410537228747000;pgv_pvi = 5491528704;eas_sid = t196y05258V4B6g478m7t073P2;luin = o0775929901;lskey = 000100001264ed0bece633b72b741fb54e5137a729bfa3647db8a18c0ee96579fd05aff03206e6cafbeb0f88",
             # "Connection": "keep-alive",
             # "Cache-Control": "max-age = 0",
             # "Accept-Language": "zh - CN, zh;q = 0.9",
             # "Accept-Encoding": "gzip, deflate, br",
             # "Accept": "text / html, application / xhtml + xml, application / xml;q = 0.9, image / webp, image / apng, * / *;q = 0.8",
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
        }
         data = {
             'name': '15081644800',
             'password': 'Qwer1234!',
             'remember': 'false',
             'tc_app_id': '2044348370',
             'randstr':'',
             'ticket' : '',
             'ck': ''
        }
         try:
             r1 = s.get(url=login_url,headers=headers)
             r = s.post(url=login_url, headers=headers,data=data)
             r.raise_for_status()
           
         except:
             print('爬取失败')
           
  3. 但是这时候我们又发现了一个问题,那就是登录的时候往往需要一个图形验证码,使用代码可以解决这个问题,但是我们不妨转换思维,

    我们的目的是爬取影评,而不是写代码登录豆瓣。豆瓣判断用户是否登录,是通过请求头中有没有携带用户信息的token(cookie)来判断的,所以我们

    不妨自己在豆瓣网页上登录一下,然后复制一下cookie中的token信息,放在我们的请求头之中,这样我们就可以直接爬取我们需要的信息了

    • 首先登录好后来到下面这个页面找到我们的cookie

    • image-20211120164008625

    • 然后放到我们的代码之中就可以了

    • image-20211120164038941

4.接下来我们就可以正式爬虫了,我们的思路如下:

  • 获取json格式的电影信息,

  • 循环遍历每个电影并制作其好,中,差三个评价的链接同时爬取影评。

  • 放入文件

  • 这里面还有一个需要注意的问题:当我们快速地访问过多网页的时候,会被豆瓣怀疑是机器人,然后让你输入验证码自证清白,

为了防止爬虫因为这个终止,我们在每次访问链接之前先停顿两秒钟,这会让我们表现得更像人。

  • 完整代码如下:

     

    # -*-coding:utf-8-*-
    # @Time :2021/11/20 13:58
    # @Author:shuaichao
    # @File :.py
    # @Software: PyCharm
    import urllib.request
    from bs4 import BeautifulSoup  # 网页解析,获悉数据.231
    import urllib.request, urllib.error  # 制定URL,获取网页数据
    import time
    import os
    import requests
    from lxml import etree
    import json
    from urllib.request import Request
    from urllib.request import urlopen


    def askUrl(url):
        """Fetch ``url`` with the logged-in cookie and return the body as UTF-8 text.

        Args:
            url: Address to request.

        Returns:
            The decoded body, or ``""`` when the request fails with
            ``urllib.error.URLError``; the HTTP status code (if any) and
            the failure reason are then printed rather than raised.
        """
        # NOTE(security): the Cookie value is a session credential copied
        # from a logged-in browser; treat it as a secret.
        headers = {
            "Cookie": 'bid=ySWyT3eWKHI; ll="118088"; __utma=30149280.292149151.1637469049.1637469049.1637469049.1; __utmc=30149280; __utmz=30149280.1637469049.1.1.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; __utmb=30149280.1.10.1637469049; ap_v=0,6.0; __utma=223695111.1326316524.1637469080.1637469080.1637469080.1; __utmb=223695111.0.10.1637469080; __utmc=223695111; __utmz=223695111.1637469080.1.1.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1637469080%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; _vwo_uuid_v2=D84C2319507104E7EA8DA14C2D366B708|08f1b95ebe80ed5b6c33ac030c3151e7; dbcl2="250389712:+jECS9wlK5g"; ck=ieh6; _pk_id.100001.4cf6=13045fc7b4b26386.1637469080.1.1637469126.1637469080.; push_noty_num=0; push_doumail_num=0',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
        }
        request = urllib.request.Request(url, headers=headers)
        html = ""
        try:
            # The with-block closes the connection even if read/decode fails.
            with urllib.request.urlopen(request) as response:
                html = response.read().decode("utf-8")
        except urllib.error.URLError as e:
            # HTTPError (a URLError subclass) carries .code; plain URLError
            # only carries .reason, hence the hasattr checks.
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
        return html


    # Download a page and parse it.
    def get_info(baseurl):
        """Return the page at ``baseurl`` as a BeautifulSoup tree.

        The text comes from ``askUrl`` and is parsed with ``html.parser``.
        """
        return BeautifulSoup(askUrl(baseurl), "html.parser")


    # Select elements from the soup and also render them as a string.
    def transport(bs, info):
        """Collect the elements of ``bs`` whose class matches ``info``.

        Returns:
            ``(matches, text)`` where ``matches`` is the result of
            ``bs.find_all(class_=info)`` and ``text`` is ``str(matches)``.
        """
        hits = bs.find_all(class_=info)
        return hits, str(hits)


    def getImg(url, imgName):
        """Download ``url`` and store it as ``./imgs/<imgName>.jpg``.

        Args:
            url: Address of the image to download.
            imgName: File name (without extension) used for the saved image.

        Returns:
            None. When the download fails, '链接失败' is printed and no
            file is written.
        """
        # NOTE(security): the Cookie value is a session credential copied
        # from a logged-in browser; treat it as a secret.
        headers = {
            "Cookie": 'bid=ySWyT3eWKHI; ll="118088"; __utma=30149280.292149151.1637469049.1637469049.1637469049.1; __utmc=30149280; __utmz=30149280.1637469049.1.1.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; __utmb=30149280.1.10.1637469049; ap_v=0,6.0; __utma=223695111.1326316524.1637469080.1637469080.1637469080.1; __utmb=223695111.0.10.1637469080; __utmc=223695111; __utmz=223695111.1637469080.1.1.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1637469080%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; _vwo_uuid_v2=D84C2319507104E7EA8DA14C2D366B708|08f1b95ebe80ed5b6c33ac030c3151e7; dbcl2="250389712:+jECS9wlK5g"; ck=ieh6; _pk_id.100001.4cf6=13045fc7b4b26386.1637469080.1.1637469126.1637469080.; push_noty_num=0; push_doumail_num=0',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
        }
        req_timeout = 5  # seconds
        req = Request(url=url, headers=headers)
        try:
            with urlopen(req, None, req_timeout) as f:
                pic = f.read()
        except OSError:
            # URLError/HTTPError and socket timeouts are all OSError
            # subclasses. The original clause named
            # Request.exceptions.ConnectionError, which does not exist on
            # urllib's Request and itself raised AttributeError.
            print(u'链接失败')
            # Original author's note: next, write code that fetches the
            # Douban login page and calls the function above.
            return
        # Make sure the target folder exists before writing into it.
        os.makedirs('./imgs', exist_ok=True)
        imgPath = './imgs/%s.jpg' % (imgName)
        with open(imgPath, 'wb') as fp:
            fp.write(pic)


    # TODO: collect Douban movie IDs and crawl their short comments.

    # One entry per rating group: (percent_type query value, label appended
    # to every saved line, progress banner printed for each page).
    _RATING_BUCKETS = (
        ('h', 'high', "开始好评"),
        ('m', 'middle', "开始中评"),
        ('l', 'low', "开始差评"),
    )
    # Comments page of one movie; filled with (movie id, percent_type).
    _COMMENTS_URL = "https://movie.douban.com/subject/%s/comments?percent_type=%s&limit=20&status=P&sort=new_score"
    _SAVE_DIR = './douban'
    _PAGES_PER_BUCKET = 10  # pages requested per movie and rating group
    _PAGE_SIZE = 20         # comments per page, matches limit=20 in the URL
    _REQUEST_TIMEOUT = 10   # seconds; keeps a stalled connection from hanging the crawl


    def _extract_short_comments(page_html):
        """Return the text of every short comment found in one comments page."""
        tree = etree.HTML(page_html) if page_html else None
        if tree is None:
            return []
        found = []
        for block in tree.xpath('//div[@class="comment"]'):
            spans = block.xpath('.//span[@class="short"]')
            if not spans:
                # Unexpected markup: skip this comment instead of aborting
                # the whole crawl with an IndexError.
                continue
            pieces = spans[0].xpath('./text()')
            if pieces:
                found.append(pieces[0])
        return found


    def _crawl_bucket(base_url, headers, banner, sink):
        """Append the short comments of the first pages of ``base_url`` to ``sink``.

        ``sink`` is filled in place so that comments gathered before a
        failing request are still available to the caller.
        """
        for page in range(_PAGES_PER_BUCKET):
            # Pause before every request so the crawl is less likely to
            # trigger the site's captcha check.
            time.sleep(2)
            page_url = base_url + "&start=" + str(page * _PAGE_SIZE)
            page_html = requests.get(url=page_url, headers=headers,
                                     timeout=_REQUEST_TIMEOUT).text
            print(banner)
            for text in _extract_short_comments(page_html):
                sink.append(text)
                print(text)
                print(len(sink))


    def _append_comments(label, comments):
        """Append ``comments`` to ``./douban/comments_<label>.txt``, one per line."""
        # Create the folder here so the write also works on the error path.
        os.makedirs(_SAVE_DIR, exist_ok=True)
        path = '%s/comments_%s.txt' % (_SAVE_DIR, label)
        with open(path, 'a+', encoding='utf-8') as f:
            for v in comments:
                print(v)
                f.write('%s %s\n' % (v, label))


    if __name__ == '__main__':
        print("开始")
        # NOTE(security): the Cookie value is a session credential copied
        # from a logged-in browser; treat it as a secret.
        headers = {
            "Cookie": 'bid=ySWyT3eWKHI; ll="118088"; __utma=30149280.292149151.1637469049.1637469049.1637469049.1; __utmc=30149280; __utmz=30149280.1637469049.1.1.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; __utmb=30149280.1.10.1637469049; ap_v=0,6.0; __utma=223695111.1326316524.1637469080.1637469080.1637469080.1; __utmb=223695111.0.10.1637469080; __utmc=223695111; __utmz=223695111.1637469080.1.1.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1637469080%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; _vwo_uuid_v2=D84C2319507104E7EA8DA14C2D366B708|08f1b95ebe80ed5b6c33ac030c3151e7; dbcl2="250389712:+jECS9wlK5g"; ck=ieh6; _pk_id.100001.4cf6=13045fc7b4b26386.1637469080.1.1637469126.1637469080.; push_noty_num=0; push_doumail_num=0',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
        }
        # Movie-list endpoints for different tags; only url_meiju is crawled
        # below, swap it for another one to crawl a different tag.
        # "Hot" tag
        url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=rank&page_limit=1300&page_start=0'
        # "Domestic series" tag
        url_guochan = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E5%9B%BD%E4%BA%A7%E5%89%A7&page_limit=150&page_start=0'
        # "Douban high score" tag
        url_douban = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86&page_limit=300&page_start=0'
        # "US series" tag
        url_meiju = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%BE%8E%E5%89%A7&page_limit=300&page_start=0'

        # The endpoint answers with JSON, so parse the raw body directly
        # instead of passing it through an HTML parser first.
        response_data = json.loads(askUrl(url_meiju))

        for index, movie in enumerate(response_data['subjects']):
            print(index)
            # Extra pause on every second movie, on top of the per-page pause.
            if index % 2 == 0:
                time.sleep(5)
            movie_id = movie['id']
            # Fresh lists per movie: each movie's comments are written once.
            collected = {bucket[1]: [] for bucket in _RATING_BUCKETS}
            print(_COMMENTS_URL % (movie_id, 'h'))
            try:
                for percent_type, label, banner in _RATING_BUCKETS:
                    _crawl_bucket(_COMMENTS_URL % (movie_id, percent_type),
                                  headers, banner, collected[label])
            finally:
                # Always persist what was gathered for this movie. If the
                # crawl failed, the exception propagates afterwards, so the
                # run stops with a traceback instead of ending silently.
                print("开始写入文件")
                for bucket in _RATING_BUCKETS:
                    _append_comments(bucket[1], collected[bucket[1]])

经过长时间的爬取,我们获得了类似这种格式的信息:

image-20211122102131740

image-20211122102147240

posted @ 2021-11-27 18:37  帅超007  阅读(3827)  评论(0)  编辑  收藏  举报