Dbass

导航

利用正则+requests爬取猫眼电影信息

 1 import json
 2 # from multiprocessing import Pool
 3 import requests
 4 from requests.exceptions import RequestException
 5 import re
 6 
 7 
 8 def get_one_page(url):
 9     try:
10         headers={"user-agent":'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
11         response = requests.get(url,headers=headers)
12         if response.status_code == 200:
13             return response.text
14         return None
15     except RequestException:
16         return None
17 
def parse_one_page(html):
    """Yield one dict per ``<dd>`` movie entry found in *html*.

    Args:
        html: Page markup as a string. ``None`` or an empty string (for
            example after a failed download) yields nothing instead of
            raising.

    Yields:
        Dicts with the string-valued keys ``index``, ``image``, ``title``,
        ``actor``, ``time`` and ``score``.
    """
    if not html:
        return

    # Raw strings keep ``\d`` from being read as a string escape.
    # re.S lets ``.`` cross newlines, since one entry spans several lines.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,
    )
    for index, image, title, star, release, integer, fraction in pattern.findall(html):
        yield {
            'index': index,
            'image': image,
            'title': title,
            # Drops a 3-character label prefix (presumably "主演：" -- verify
            # against the live page).
            'actor': star.strip()[3:],
            # Drops a 5-character label prefix (presumably "上映时间：" --
            # verify against the live page).
            'time': release.strip()[5:],
            # The score is split across two tags: integer part and fraction.
            'score': integer + fraction,
        }
32 
def write_to_file(content):
    """Append *content* to ``result.txt`` as one JSON document per line.

    Args:
        content: A JSON-serialisable object. Non-ASCII characters are
            written as-is (``ensure_ascii=False``) in UTF-8.
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
37 
def main(offset):
    """Download the board page at *offset*, then print and store each entry.

    Args:
        offset: Value for the ``offset`` query parameter of the board URL.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None on a non-200 status or a request error;
        # passing that to the parser would raise a TypeError.
        print('Failed to fetch ' + url)
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
46 
if __name__ == '__main__':
    # Ten pages in total: offsets 0, 10, ..., 90.
    for offset in range(0, 100, 10):
        main(offset)

注：需要为 requests 设置请求头（headers）中的 User-Agent，否则猫眼电影会拒绝访问。

posted on 2017-11-30 16:45  Dbass  阅读(862)  评论(0)  编辑  收藏  举报