多进程爬取猫眼电影TOP100并保存到本地文本文件中

 1 import requests
 2 import re
 3 import json
 4 from requests.exceptions import RequestException
 5 from multiprocessing import Pool
 6 
 7 # 获取网页
 8 def get_one_page(url):
 9     headers = {
10         'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'+
11                      'Chrome/63.0.3239.132 Safari/537.36'}
12     try:
13         resp = requests.get(url,headers=headers)
14         if resp.status_code == 200:
15             return resp.text
16         return None
17     except RequestException:
18         return None
# Parse a web page
def parse_one_page(html):
    """Yield one dict per ``<dd>`` movie entry matched in ``html``.

    Args:
        html: Page markup as a string. ``None`` or an empty string (e.g. a
            failed download) produces no items instead of raising.

    Yields:
        Dicts with the string-valued keys ``index``, ``title``, ``actor``
        (whitespace-stripped), ``time`` and ``score``. ``score`` is the
        concatenation of the "integer" and "fraction" captures.
    """
    if not html:
        return
    # Raw strings keep ``\d`` a regex escape rather than an (invalid) string
    # escape. re.S lets ``.`` span newlines, since one entry covers many lines.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,
    )
    for index, title, actor, release_time, integer, fraction in pattern.findall(html):
        yield {
            'index': index,
            'title': title,
            'actor': actor.strip(),
            'time': release_time,
            'score': integer + fraction,
        }
# Save data
def write_to_file(content, path='TOP1OO.txt'):
    """Append ``content`` to ``path`` as one JSON document per line.

    Args:
        content: JSON-serialisable object to store.
        path: File to append to. Defaults to the original hard-coded name.

    NOTE(review): the default file name is spelled with the letter "O"
    ('TOP1OO'), not zeros -- presumably a typo for TOP100. It is kept
    unchanged so existing output files keep being appended to.
    """
    # ensure_ascii=False keeps non-ASCII text readable in the file instead of
    # emitting \uXXXX escapes; the file is therefore opened as UTF-8.
    line = json.dumps(content, ensure_ascii=False) + '\n'
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line)
38 
def main(offset):
    """Fetch the board page at ``offset`` and store every parsed entry.

    Args:
        offset: Value placed in the ``offset`` query parameter of the
            board URL.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None on a non-200 status or a request error;
        # passing that to the regex parser would raise TypeError.
        return
    for item in parse_one_page(html):
        write_to_file(item)
44 
if __name__ == '__main__':
    # Offsets 0, 10, ..., 90: one page request per offset.
    offsets = [i * 10 for i in range(10)]
    # The context manager tears the worker processes down when the block
    # exits; map() blocks until every offset is processed, so no work is lost.
    with Pool() as pool:
        pool.map(main, offsets)

 

posted @ 2018-07-27 10:04  Ray_chen  阅读(226)  评论(0)    收藏  举报