1 import requests
2 import re
3 import json
4 from requests.exceptions import RequestException
5 from multiprocessing import Pool
6
# Fetch one page of the board.
def get_one_page(url):
    """Download *url* and return the response body as text.

    Returns None on any network error or non-200 status so the
    caller can skip the page instead of crashing.
    """
    headers = {
        # Pretend to be a desktop browser; the site blocks the default
        # python-requests User-Agent. (Fixed: the original concatenation
        # dropped the space between "Gecko)" and "Chrome", yielding a
        # malformed User-Agent value.)
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/63.0.3239.132 Safari/537.36'}
    try:
        # requests has NO default timeout; without one a stalled
        # connection would hang this worker process forever.
        resp = requests.get(url, headers=headers, timeout=10)
        if resp.status_code == 200:
            return resp.text
        return None
    except RequestException:
        return None
# Parse one page of HTML into movie records.
def parse_one_page(html):
    """Extract movie entries from a Maoyan board page.

    Args:
        html: Page source as returned by get_one_page().

    Yields:
        dict with string values for keys 'index', 'title', 'actor',
        'time', 'score' ('score' joins the integer and fraction
        parts, e.g. '9.' + '5' -> '9.5').
    """
    # Raw strings so \d is a regex escape, not an (invalid) string
    # escape; re.S lets .*? span the newlines inside each <dd> entry.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S)
    for match in pattern.finditer(html):
        index, title, actor, time, integer, fraction = match.groups()
        yield {
            'index': index,
            'title': title,
            'actor': actor.strip(),  # raw capture is padded with whitespace/newlines
            'time': time,
            'score': integer + fraction,
        }
# Persist one record as a JSON line.
def write_to_file(content):
    """Serialize *content* to JSON and append it on its own line.

    Non-ASCII characters (movie titles, actor names) are written
    verbatim rather than as \\uXXXX escapes.
    """
    # NOTE(review): the filename spells "TOP1OO" with letter O's,
    # not zeros — looks like a typo, confirm before renaming.
    line = json.dumps(content, ensure_ascii=False)
    with open('TOP1OO.txt', 'a', encoding='utf-8') as out:
        out.write(line + '\n')
38
def main(offset):
    """Scrape one board page and persist every record found on it.

    Args:
        offset: Pagination offset appended to the board URL
            (0, 10, 20, ... for the TOP100 list).
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    # get_one_page returns None on any failure; feeding None to the
    # regex in parse_one_page would raise TypeError, so skip the page.
    if html is None:
        return
    for item in parse_one_page(html):
        write_to_file(item)
44
if __name__ == '__main__':
    # Fetch the 10 board pages (offsets 0, 10, ..., 90) in parallel,
    # one worker per CPU. The context manager terminates the pool's
    # worker processes on exit; the original never closed/joined it.
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])