import re
import json
import requests
from multiprocessing import Pool
from requests.exceptions import RequestException
def get_one_page(url):
    """
    Fetch a single page and return its HTML body.

    :param url: page URL to request
    :return: response text on HTTP 200, otherwise None
    """
    try:
        # requests has NO default timeout — without one a stalled server
        # would hang this worker process forever.
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # Any network-level failure (DNS, connection, timeout) is
        # treated as "page unavailable" rather than crashing the caller.
        return None
def parse_one_page(html):
    """
    Parse one Maoyan board page and yield a dict per movie entry.

    :param html: raw HTML text of the board page
    :return: generator of dicts with keys
             index / image / title / star / time / score
    """
    # Raw strings: '\d' inside a normal string literal is an invalid
    # escape sequence and raises a SyntaxWarning on Python 3.12+.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?poster-default.*?src="(.*?)"'
        r'.*?name"><a.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)'
        r'</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>', re.S)
    for match in pattern.finditer(html):
        index, image, title, star, release, integer, fraction = match.groups()
        yield {
            "index": index,
            "image": image,
            "title": title,
            # Drop the leading "主演：" label (3 characters).
            "star": star.strip()[3:],
            # Drop the leading "上映时间：" label (5 characters).
            "time": release.strip()[5:],
            # Score is split in the markup: integer part ("9.") + fraction ("5").
            "score": integer + fraction,
        }
def save_to_file(content):
    """
    Append one record to maoyan.txt as a single JSON line.

    :param content: JSON-serializable object (one movie dict)
    :return: None
    """
    with open("maoyan.txt", "a", encoding="utf-8") as fp:
        # ensure_ascii=False keeps Chinese text readable in the file.
        json.dump(content, fp, ensure_ascii=False)
        fp.write("\n")
def main(offset):
    """
    Crawl one board page at the given offset and persist every movie on it.

    :param offset: paging offset passed to the board URL (0, 10, 20, ...)
    :return: None
    """
    url = "https://maoyan.com/board/4?offset={}".format(offset)
    html = get_one_page(url)
    # get_one_page returns None on any failure; feeding None into the
    # regex parser would raise TypeError, so skip this page instead of
    # crashing the worker process.
    if html is None:
        return
    for item in parse_one_page(html):
        save_to_file(item)
if __name__ == "__main__":
    # Serial version, kept for reference:
    # for i in range(10):
    #     main(i * 10)
    #
    # Use a process pool so the ten page requests overlap instead of
    # waiting on the network one at a time.  map() blocks until every
    # task finishes; the context manager then tears the pool down
    # (the original code never closed/joined the pool).
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])