Python 爬虫:抓取豆瓣电影 Top250

 

网页api:https://movie.douban.com/top250?start=0&filter=
用到的模块:urllib,re,csv 

捣鼓一上午终于好了,有些小问题

(Top 218 有 bug)具体问题:该条目(上图)的页面信息中没有“主演”字段,导致正则表达式匹配过头、取出了过多的内容;有“主演”字段的条目(下图)则能正常取值。

所以取前200名,具体python代码实现如下,望大佬指导

#! /usr/bin/python3
# -*- coding:UTF-8 -*-
from urllib import request
import re,csv

class MovieTopForDouBan(object):
    """Scrape the Douban movie Top 250 list pages and export the rows.

    Pages are fetched 25 entries at a time with ``urllib``; every entry is
    parsed with one regular expression into a 12-field row (see
    ``self.head`` for the column order).  Rows accumulate in
    ``self.movie_list`` and can be written out as plain text or CSV.
    """

    # Separator the site puts in front of the alternate-title spans.
    _NAME_SEPARATOR = '&nbsp;/&nbsp;'

    # Opening tag of a single movie entry; used to cut a page into entries
    # so that the field pattern below can never run across two entries.
    _ITEM_START = re.compile(r'<div[^>]*class="item">')

    # Field pattern for ONE entry.  Groups, in order: rank, title, second
    # title, other titles, director, cast, year, region, genre, score,
    # number of votes, one-line quote.  Compiled once for all pages.
    _ITEM_PATTERN = re.compile(
        '<div.*?class="item">.*?<em class="">(.*?)</em>'
        '.*?<span.*?class="title">(.*?)</span>'
        '.*?<span.*?class="title">(.*?)</span>'
        '.*?<span.*?class="other">(.*?)</span>'
        '.*?<div.*?class="bd">.*?<p.*?class="">'
        '.*?导演:(.*?)&nbsp;.*?主演: (.*?)<br>'
        '(.*?)&nbsp;/&nbsp;(.*?)&nbsp;/&nbsp;(.*?)</p>.*?<div.*?class="star">'
        '.*?<span.*?class="rating_num".*?property="v:average">(.*?)</span>'
        '.*?<span>(.*?)人评价</span>.*?</div>'
        '.*?<span.*?class="inq">(.*?)</span>.*?</p>', re.S)

    # Line labels used by write_text(), in the same order as a row's fields.
    _TEXT_LABELS = ('电影排名:', '电影名称:', '外文名称:', '电影别名:',
                    '导演姓名:', '主演姓名:', '上映年份:', '制作国家/地区:',
                    '电影类别:', '电影评分:', '参评人数:', '简短影评:')

    def __init__(self):
        # Offset of the next list page to fetch (advanced by 25 per page).
        self.start = 0
        # Query suffix seen in the site URL; get_page() does not append it.
        self.param = '&filter='
        # Browser-like User-Agent sent with every request.
        self.headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                                   '(KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'}
        # Output location; file names are appended by plain concatenation,
        # so this must end with a path separator.
        self.file_path = 'D:\\'
        # CSV header row; also documents the field order of each row.
        self.head = ['排名','名称','别名','其他名称','导演','主演','年份','地区','类型','平均分','人数','短评']
        # Parsed rows, each a list of 12 strings.
        self.movie_list=[]

    def get_page(self):
        """Fetch the list page at offset ``self.start`` and advance the offset.

        Returns:
            The decoded HTML of the page, or ``None`` when the request
            failed with ``URLError`` (the reason is printed and
            ``self.start`` is left unchanged).
        """
        url = 'https://movie.douban.com/top250?start=' + str(self.start)
        req = request.Request(url, headers=self.headers)
        try:
            # The context manager closes the HTTP response once it is read.
            with request.urlopen(req) as response:
                page = response.read().decode('utf-8')
        except request.URLError as e:
            print('抓取失败,失败原因:', getattr(e, 'reason', e))
            return None
        page_num = (self.start + 25) // 25
        print('正在抓取第' + str(page_num) + '页数据...')
        self.start += 25
        return page

    @staticmethod
    def _strip_prefix(text, prefix):
        """Return ``text`` without a leading ``prefix`` (exact match only)."""
        # str.lstrip(prefix) would strip any of the prefix's *characters*
        # and eat the beginning of titles such as "snatch".
        if text.startswith(prefix):
            return text[len(prefix):]
        return text

    def _parse_page(self, page):
        """Parse one list page into rows of 12 strings.

        The page is cut at every entry start tag and each entry is matched
        on its own, so an entry that lacks a field (for example one with no
        cast line) is skipped instead of swallowing the entry after it.

        Args:
            page: HTML text of one list page.

        Returns:
            A list of rows; entries that do not fit the pattern are left
            out and their count is printed.
        """
        starts = [m.start() for m in self._ITEM_START.finditer(page)]
        rows = []
        skipped = 0
        for begin, end in zip(starts, starts[1:] + [len(page)]):
            found = self._ITEM_PATTERN.search(page[begin:end])
            if found is None:
                skipped += 1
                continue
            data = list(found.groups())
            data[2] = self._strip_prefix(data[2], self._NAME_SEPARATOR)
            data[3] = self._strip_prefix(data[3], self._NAME_SEPARATOR)
            data[6] = data[6].lstrip()   # year is preceded by newline/indent
            data[8] = data[8].rstrip()   # genre is followed by newline/indent
            rows.append(data)
        if skipped:
            print('本页有%d个条目无法解析,已跳过' % skipped)
        return rows

    def get_movie_info(self, max_start=176):
        """Fetch list pages and append their parsed rows to ``self.movie_list``.

        Args:
            max_start: Largest value of ``self.start`` that is still
                fetched.  The default of 176 stops after the page at
                offset 175, i.e. after the first 200 movies.
        """
        while self.start <= max_start:
            # self.d keeps the most recently fetched page on the instance.
            page = self.d = self.get_page()
            if page is None:
                # get_page() did not advance self.start; stop instead of
                # retrying the same page forever or parsing None.
                print('抓取中断,停止翻页,已抓取的数据将被保留。')
                break
            self.movie_list.extend(self._parse_page(page))

    def write_text(self):
        """Write every row to ``movie_info.txt`` as labelled CRLF lines."""
        print('开始向文件写入数据....')
        # newline='' disables newline translation, so the explicit '\r\n'
        # is written as-is instead of becoming '\r\r\n' on Windows.
        with open(self.file_path+'movie_info.txt','w',encoding='utf-8',
                  newline='') as file_TopText:
            try:
                for movie in self.movie_list:
                    record = ''.join(label + movie[idx] + '\r\n'
                                     for idx, label in enumerate(self._TEXT_LABELS))
                    # Blank line between two movies.
                    file_TopText.write(record + '\r\n')
                print('抓取结果写入文件成功...')
            except Exception as e:
                # Best effort: report the problem and keep what was written.
                print(e)
        print('数据写入完毕....')

    def write_csv_file(self):
        """Write ``self.head`` and every row to ``movie_info.csv``.

        On failure the error and the number of rows written so far are
        printed.
        """
        path = self.file_path + 'movie_info.csv'
        common=0  # rows written so far, reported if writing fails
        try:
            with open(path, 'w', newline='',encoding='utf-8') as csv_file:
                writer = csv.writer(csv_file, dialect='excel')
                if self.head is not None:
                    writer.writerow(self.head)
                for row in self.movie_list:
                    writer.writerow(row)
                    common+=1
                print("将CSV文件写入路径%s成功。" % path)
        except Exception as e:
            print("将CSV文件写入路径: %s, 信息: %s" % (path, e))
            print(common)

    def main(self):
        """Run the crawl and write the text export."""
        print('开始从豆瓣电影抓取数据........')
        self.get_movie_info()
        self.write_text()
        # CSV export is available through write_csv_file() but is not run
        # by default.
        #self.write_csv_file()
        print('数据抓取完毕...')

if __name__ == '__main__':
    # Script entry point: build the crawler and run the whole pipeline.
    MovieTopForDouBan().main()

运行后会在 D 盘根目录生成一个 movie_info.txt 文件。

 

posted @ 2018-06-12 13:29  晚安online  阅读(269)  评论(0)  编辑  收藏  举报