利用正则+requests爬取猫眼电影信息
import json
import re
import sys

import requests
from requests.exceptions import RequestException

# The site rejects the default requests User-Agent, so a browser one is sent.
_HEADERS = {
    "user-agent": (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
    )
}

# One <dd>...</dd> entry per movie; the seven groups are, in order:
# rank, image URL, title, star text, release text, score integer part,
# score fraction part. re.S lets ".*?" run across line breaks in the HTML.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
    r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
    re.S,
)


def get_one_page(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the script forever.

    Returns:
        The page text when the server answers with status 200, otherwise
        ``None`` (non-200 status or any ``RequestException``).
    """
    try:
        response = requests.get(url, headers=_HEADERS, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_one_page(html):
    """Yield one dict per movie entry found in *html*.

    Args:
        html: Page source as a string. ``None`` or an empty string yields
            nothing instead of raising.

    Yields:
        Dicts with the keys ``index``, ``image``, ``title``, ``actor``,
        ``time`` and ``score``; every value is a string.
    """
    if not html:
        return
    for match in _MOVIE_PATTERN.finditer(html):
        index, image, title, star, release, integer, fraction = match.groups()
        yield {
            'index': index,
            'image': image,
            'title': title,
            # Drops the first 3 characters after stripping whitespace;
            # presumably a 3-character label before the names - verify
            # against the live page.
            'actor': star.strip()[3:],
            # Drops the first 5 characters after stripping whitespace;
            # presumably a 5-character label before the date - verify
            # against the live page.
            'time': release.strip()[5:],
            # The score is split across two tags; join the two parts.
            'score': integer + fraction,
        }


def write_to_file(content, path='result.txt'):
    """Append *content* to *path* as one JSON document per line.

    Args:
        content: JSON-serialisable object to store.
        path: Output file, opened in append mode with UTF-8 encoding.
    """
    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    """Download one board page, print its movies and append them to file.

    Args:
        offset: Value for the ``offset`` query parameter of the board URL.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # Without this guard a failed download reached the regex as None
        # and crashed with TypeError.
        print('Failed to fetch ' + url, file=sys.stderr)
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    # Ten requests with offsets 0, 10, ..., 90.
    for i in range(10):
        main(i * 10)
注:需要为 requests 请求设置浏览器的 User-Agent 请求头(headers),否则猫眼电影会拒绝访问。