import json
import re
import requests
from requests import RequestException
from multiprocessing import Pool #引入进程池
def get_page(url):
    """Fetch *url* and return the HTML text, or None on failure.

    Returns:
        The response body when the server answers 200; None for any
        non-200 status or any network-level error (RequestException).
    """
    # Desktop Chrome UA: Maoyan rejects the default python-requests agent.
    headers = {'User-Agent':
               'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36'}
    try:
        # timeout added: without it a stalled connection blocks the
        # worker process forever (requests has no default timeout).
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
def parse_one_page(html):
    """Yield one dict per movie entry found in a Maoyan board page.

    Args:
        html: Page source containing ``<dd>...</dd>`` movie entries.

    Yields:
        Dicts with rank, title, poster URL, stars, release date and
        score (integer part + fraction part concatenated).
    """
    # Raw strings: '\d' inside a plain literal is an invalid escape
    # sequence (SyntaxWarning since Python 3.6, future error).
    pattern = re.compile(
        r'<dd>.*?board-index.*?">(\d+?)</i>.*?title="(.*?)".*?<img.*?img.*?src='
        r'"(.*?)".*?>.*?"star">(.*?)</p>.*?"releasetime">(.*?)</p>.*?integer">(.*?)'
        r'</i>.*?fraction">(\d+?)</i>.*?</dd>', re.S)
    for item in pattern.findall(html):
        yield {
            '排名:': item[0],
            '电影:': item[1],
            '图片:': item[2],
            '主演:': item[3].strip()[3:],       # drop the "主演：" prefix
            '上映时间:': item[4].strip()[5:],   # drop the "上映时间：" prefix
            '评分:': item[5] + item[6],         # e.g. '9.' + '6' -> '9.6'
        }
def write_file(content):
    """Append *content* as a single JSON line to movie.txt (UTF-8)."""
    # ensure_ascii=False keeps CJK characters readable instead of
    # emitting \uXXXX escapes.
    line = json.dumps(content, ensure_ascii=False)
    with open('movie.txt', 'a', encoding='utf-8') as out:
        out.write(line + '\n')
def main(page):
    """Scrape one board page (offset = page * 10) and persist its entries.

    Args:
        page: Zero-based page index; each page holds 10 movies.
    """
    url = 'http://maoyan.com/board/4'
    html = get_page(url=url + '?offset=' + str(page * 10))
    if html is None:
        # get_page returns None on network failure or non-200 status;
        # passing None to the regex would raise TypeError, so skip.
        return
    for item in parse_one_page(html):
        write_file(item)
if __name__ == '__main__':
    # Sequential alternative (kept for reference):
    #     for i in range(10):
    #         main(i)
    # Parallel: one task per board page. The context manager tears the
    # pool down on exit (the original never closed/joined it, leaving
    # worker processes behind); map() blocks until all pages finish.
    with Pool() as pool:
        pool.map(main, range(10))