"""
Request URL: https://maoyan.com/board/4
Second page: https://maoyan.com/board/4?offset=10
"""
import requests
import re
class myspider():
    """Fetch a ranking page, extract (rank, title) pairs, and save them.

    The instance stores a URL template containing one ``{}`` placeholder
    (filled with the page offset) and the HTTP headers sent with each
    request.
    """

    # Compiled once at class creation instead of on every parse call.
    # A raw string is used so that ``\d`` is not treated as an (invalid)
    # string escape sequence. ``re.S`` lets ``.`` span newlines, because a
    # single <dd> entry stretches over several lines of markup.
    _ITEM_PATTERN = re.compile(
        r'<dd>.*?board-index.*?>(\d+).*?movie-item-info.*?>.*?<a.*?title="(.*?)".*?>.*?</dd>',
        re.S,
    )

    def __init__(self, base_url, headers):
        """Store the URL template and the request headers.

        Args:
            base_url: URL template with a ``{}`` placeholder for the offset.
            headers: Mapping of HTTP headers passed to ``requests.get``.
        """
        self.base_url = base_url
        self.headers = headers

    def get_data(self, start_num):
        """Download one page and return its body decoded as UTF-8.

        Args:
            start_num: Value substituted into the ``{}`` of ``base_url``.

        Returns:
            The decoded page text when the status code is 200, else ``None``.

        Raises:
            requests.RequestException: Propagated from ``requests.get``
                (connection errors, and timeouts after 10 seconds).
        """
        url = self.base_url.format(start_num)
        # Without a timeout, requests may wait indefinitely on a stalled
        # connection.
        response = requests.get(url=url, headers=self.headers, timeout=10)
        if response.status_code != 200:
            return None
        return response.content.decode('utf8')

    def parse_onepage(self, html):
        """Extract every (rank, title) pair from one page of HTML.

        Args:
            html: Page text as returned by ``get_data``; may be ``None`` or
                empty when the download failed.

        Returns:
            A list of ``(rank, title)`` string tuples, in page order. Empty
            when ``html`` is ``None``/empty or nothing matches.
        """
        # get_data() signals failure with None; treat that as "no results"
        # instead of letting re raise a TypeError.
        if not html:
            return []
        return self._ITEM_PATTERN.findall(html)

    def save_data(self, data):
        """Append the records to ``./movestr.txt``, one record per line.

        The fields of each record are joined with a single space.

        Args:
            data: Iterable of records, each an iterable of strings (for
                example the list returned by ``parse_onepage``).
        """
        lines = [" ".join(record) + '\n' for record in data or ()]
        # Nothing to write: do not create or touch the file.
        if not lines:
            return
        # Open the file once for the whole batch rather than once per record.
        with open('./movestr.txt', 'a', encoding='utf-8') as f:
            f.writelines(lines)
if __name__ == "__main__":
    # URL template; the placeholder receives the page offset.
    base_url = "https://maoyan.com/board/4?offset={}"
    # Request headers: send a browser-like User-Agent with the request.
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
    }
    my_spider = myspider(base_url, headers)
    # Fetch the first page (offset 0).
    html = my_spider.get_data(0)
    # get_data() returns None on a non-200 response; stop with a clear
    # message instead of handing None to the regex parser.
    if html is None:
        raise SystemExit("Failed to fetch page: " + base_url.format(0))
    value = my_spider.parse_onepage(html)
    my_spider.save_data(value)