Scraping the Douban Movie Top 250 with BeautifulSoup and storing it in MongoDB

Without further ado, here is the code:

import re

import pymongo
import requests
from bs4 import BeautifulSoup


class SpiderDouBan:

    def __init__(self):
        # Connect to the local MongoDB instance and select the target collection
        client = pymongo.MongoClient(host='localhost', port=27017)
        db = client['spider_db']
        self.collection = db['douban_movie_top250']

    def get_html(self, url):
        '''
        Fetch one page and return its parsed HTML.
        :param url: page URL
        :return: BeautifulSoup object
        '''
        # Douban rejects requests that use the default python-requests User-Agent,
        # so send a minimal browser-like header
        headers = {'User-Agent': 'Mozilla/5.0'}
        html = requests.get(url, headers=headers).text
        soup = BeautifulSoup(html, 'lxml')
        return soup

    def get_one_page(self, soup, order):
        '''
        Parse one page and store every movie on it.
        :param soup: BeautifulSoup object for the page
        :param order: ranking offset of the first movie on this page
        :return:
        '''
        # Keep only the main title; the alternate titles contain a '/'
        movie_names = [span.string for span in soup.find_all(name='span', attrs={'class': 'title'})
                       if not re.search('/', span.string)]
        # Each <p class=""> block holds director/actors plus "year / country / genre"
        movie_actors = [re.sub('\n|\xa0', '', p.get_text().strip('" |\n\xa0')).split('/')
                        for p in soup.find_all(name='p', attrs={'class': ''})]
        movie_rates = [span.string for span in soup.find_all(name='span', attrs={'class': 'rating_num'})]
        # The "xxx人评价" span is a sibling of the <span property="v:best"> element
        comment_num = [span_2.string for span in soup.find_all(attrs={'property': 'v:best'})
                       for span_2 in span.next_siblings if re.search(r'\w+', span_2.string)]
        short_comments = [span.string for span in soup.find_all(class_='inq')]
        for index, name in enumerate(movie_names):
            print(f'Scraping item {order + index + 1}...')
            movie_info = {
                'order': f'No.{order + index + 1}',
                'movie_name': name,
                'movie_type': f'{re.findall("[0-9]+", movie_actors[index][-3])[0]}年/'
                              f'{movie_actors[index][-2]}/{movie_actors[index][-1]}',
                'movie_rate': f'{movie_rates[index]}分',
                'comment_num': comment_num[index],
                # A few movies have no one-line quote, so this list can be shorter
                'short_comment': short_comments[index] if index < len(short_comments) else ''
            }
            self.collection.insert_one(movie_info)

    def main(self, url, order):
        '''
        Entry point for a single page.
        :return:
        '''
        soup = self.get_html(url)
        self.get_one_page(soup, order)


if __name__ == '__main__':
    # The list is paginated 25 movies at a time: start=0, 25, ..., 225
    for offset in range(0, 250, 25):
        order = offset
        url = f'https://movie.douban.com/top250?start={offset}'
        SpiderDouBan().main(url, order)
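One caveat with insert_one: every re-run appends another 250 documents. Below is a minimal sketch of one way to keep the collection free of duplicates, using the 'order' field from the script above as the natural key; the unique index and the replace_one upsert are assumptions layered on top of the script, not something the original code does.

import pymongo

client = pymongo.MongoClient(host='localhost', port=27017)
collection = client['spider_db']['douban_movie_top250']

# One-off setup: make each ranking position unique in the collection
collection.create_index('order', unique=True)

# Inside get_one_page, an upsert could then stand in for insert_one:
# self.collection.replace_one({'order': movie_info['order']}, movie_info, upsert=True)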

Run output:

MongoDB storage result:
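To check what actually landed in the collection, here is a quick sketch for the Python shell, reusing the same host, database, and collection names as the script above; the printed fields are the ones the script writes.

import pymongo

client = pymongo.MongoClient(host='localhost', port=27017)
collection = client['spider_db']['douban_movie_top250']

print(collection.count_documents({}))   # should be 250 after a complete run
for doc in collection.find().limit(3):  # peek at the first few documents
    print(doc['order'], doc['movie_name'], doc['movie_rate'])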


posted @ 2019-06-23 14:57  cnhkzyy